by Josh Dillon, last updated February 7, 2023
This notebook is designed to figure out a single full-day RFI mask using the best autocorelations, taking individual file_calibration notebook results as a prior but then potentially undoing flags.
Here's a set of links to skip to particular figures and tables:
import time
tstart = time.time()
import os
os.environ['HDF5_USE_FILE_LOCKING'] = 'FALSE'
import h5py
import hdf5plugin # REQUIRED to have the compression plugins available
import numpy as np
import pandas as pd
import glob
import os
import matplotlib.pyplot as plt
import matplotlib
import copy
from pyuvdata import UVFlag, UVData, UVCal
from hera_cal import io, utils, abscal
from hera_cal.smooth_cal import CalibrationSmoother, dpss_filters, solve_2D_DPSS
from hera_qm import ant_class, xrfi
from hera_filters import dspec
from IPython.display import display, HTML
%matplotlib inline
display(HTML("<style>.container { width:100% !important; }</style>"))
_ = np.seterr(all='ignore') # get rid of red warnings
%config InlineBackend.figure_format = 'retina'
# get filenames
SUM_FILE = os.environ.get("SUM_FILE", None)
# SUM_FILE = '/users/jsdillon/lustre/H6C/abscal/2459853/zen.2459853.25518.sum.uvh5' # If sum_file is not defined in the environment variables, define it here.
SUM_SUFFIX = os.environ.get("SUM_SUFFIX", 'sum.uvh5')
SUM_AUTOS_SUFFIX = os.environ.get("SUM_AUTOS_SUFFIX", 'sum.autos.uvh5')
DIFF_AUTOS_SUFFIX = os.environ.get("DIFF_AUTOS_SUFFIX", 'diff.autos.uvh5')
CAL_SUFFIX = os.environ.get("CAL_SUFFIX", 'sum.omni.calfits')
ANT_CLASS_SUFFIX = os.environ.get("ANT_CLASS_SUFFIX", 'sum.ant_class.csv')
OUT_FLAG_SUFFIX = os.environ.get("OUT_FLAG_SUFFIX", 'sum.flag_waterfall.h5')
sum_glob = '.'.join(SUM_FILE.split('.')[:-3]) + '.*.' + SUM_SUFFIX
auto_sums_glob = sum_glob.replace(SUM_SUFFIX, SUM_AUTOS_SUFFIX)
auto_diffs_glob = sum_glob.replace(SUM_SUFFIX, DIFF_AUTOS_SUFFIX)
cal_files_glob = sum_glob.replace(SUM_SUFFIX, CAL_SUFFIX)
ant_class_csvs_glob = sum_glob.replace(SUM_SUFFIX, ANT_CLASS_SUFFIX)
# A priori flag settings
FM_LOW_FREQ = float(os.environ.get("FM_LOW_FREQ", 87.5)) # in MHz
FM_HIGH_FREQ = float(os.environ.get("FM_HIGH_FREQ", 108.0)) # in MHz
FM_freq_range = [FM_LOW_FREQ * 1e6, FM_HIGH_FREQ * 1e6]
MAX_SOLAR_ALT = float(os.environ.get("MAX_SOLAR_ALT", 0.0)) # in degrees
# DPSS settings
FREQ_FILTER_SCALE = float(os.environ.get("FREQ_FILTER_SCALE", 5.0)) # in MHz
TIME_FILTER_SCALE = float(os.environ.get("TIME_FILTER_SCALE", 450.0))# in s
EIGENVAL_CUTOFF = float(os.environ.get("EIGENVAL_CUTOFF", 1e-12))
# Outlier flagging settings
MIN_FRAC_OF_AUTOS = float(os.environ.get("MIN_FRAC_OF_AUTOS", .25))
MAX_AUTO_L2 = float(os.environ.get("MAX_AUTRO_L2", 1.2))
Z_THRESH = float(os.environ.get("Z_THRESH", 5.0))
WS_Z_THRESH = float(os.environ.get("WS_Z_THRESH", 4.0))
AVG_Z_THRESH = float(os.environ.get("AVG_Z_THRESH", 1.5))
REPEAT_FLAG_Z_THRESH = float(os.environ.get("REPEAT_FLAG_Z_THESH", 2.0))
MAX_FREQ_FLAG_FRAC = float(os.environ.get("MAX_FREQ_FLAG_FRAC", .25))
MAX_TIME_FLAG_FRAC = float(os.environ.get("MAX_TIME_FLAG_FRAC", .1))
for setting in ['FM_LOW_FREQ', 'FM_HIGH_FREQ', 'MAX_SOLAR_ALT', 'FREQ_FILTER_SCALE', 'TIME_FILTER_SCALE',
'EIGENVAL_CUTOFF', 'MIN_FRAC_OF_AUTOS', 'MAX_AUTO_L2', 'Z_THRESH', 'WS_Z_THRESH', 'AVG_Z_THRESH', 'REPEAT_FLAG_Z_THRESH',
'MAX_FREQ_FLAG_FRAC ', 'MAX_TIME_FLAG_FRAC ']:
print(f'{setting} = {eval(setting)}')
FM_LOW_FREQ = 87.5 FM_HIGH_FREQ = 108.0 MAX_SOLAR_ALT = 0.0 FREQ_FILTER_SCALE = 5.0 TIME_FILTER_SCALE = 450.0 EIGENVAL_CUTOFF = 1e-12 MIN_FRAC_OF_AUTOS = 0.25 MAX_AUTO_L2 = 1.2 Z_THRESH = 5.0 WS_Z_THRESH = 4.0 AVG_Z_THRESH = 1.5 REPEAT_FLAG_Z_THRESH = 2.0 MAX_FREQ_FLAG_FRAC = 0.25 MAX_TIME_FLAG_FRAC = 0.1
auto_sums = sorted(glob.glob(auto_sums_glob))
print(f'Found {len(auto_sums)} *.{SUM_AUTOS_SUFFIX} files starting with {auto_sums[0]}.')
auto_diffs = sorted(glob.glob(auto_diffs_glob))
print(f'Found {len(auto_diffs)} *.{DIFF_AUTOS_SUFFIX} files starting with {auto_diffs[0]}.')
cal_files = sorted(glob.glob(cal_files_glob))
print(f'Found {len(cal_files)} *.{CAL_SUFFIX} files starting with {cal_files[0]}.')
ant_class_csvs = sorted(glob.glob(ant_class_csvs_glob))
print(f'Found {len(ant_class_csvs)} *.{ANT_CLASS_SUFFIX} files starting with {ant_class_csvs[0]}.')
Found 1655 *.sum.autos.uvh5 files starting with /mnt/sn1/2460100/zen.2460100.21285.sum.autos.uvh5. Found 1655 *.diff.autos.uvh5 files starting with /mnt/sn1/2460100/zen.2460100.21285.diff.autos.uvh5. Found 1655 *.sum.omni.calfits files starting with /mnt/sn1/2460100/zen.2460100.21285.sum.omni.calfits. Found 1655 *.sum.ant_class.csv files starting with /mnt/sn1/2460100/zen.2460100.21285.sum.ant_class.csv.
# Load ant_class csvs
tables = [pd.read_csv(f).dropna(axis=0, how='all') for f in ant_class_csvs]
table_cols = tables[0].columns[1::2]
class_cols = tables[0].columns[2::2]
# Figure out antennas that were not flagged when the sun was down, or were only flagged for Even/Odd Zeros or Redcal chi^2
ap_strs = np.array(tables[0]['Antenna'])
ant_flags = np.array([t[class_cols] for t in tables]) == 'bad'
sun_low_enough = np.array([t['Solar Alt'] < MAX_SOLAR_ALT for t in tables])
ants = sorted(set(int(a[:-1]) for a in ap_strs))
candidate_autos = set()
for i, ap_str in enumerate(ap_strs):
has_other_flags = np.any([(ant_flags[:, i, cc] & sun_low_enough[:, i]) for cc, colname in enumerate(class_cols)
if colname not in ['Antenna Class', 'Even/Odd Zeros Class','Redcal chi^2 Class']])
if not has_other_flags:
ap = int(ap_str[:-1]), utils.comply_pol(ap_str[-1])
candidate_autos.add(utils.join_bl(ap, ap))
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) Cell In [7], line 3 1 # Figure out antennas that were not flagged when the sun was down, or were only flagged for Even/Odd Zeros or Redcal chi^2 2 ap_strs = np.array(tables[0]['Antenna']) ----> 3 ant_flags = np.array([t[class_cols] for t in tables]) == 'bad' 4 sun_low_enough = np.array([t['Solar Alt'] < MAX_SOLAR_ALT for t in tables]) 5 ants = sorted(set(int(a[:-1]) for a in ap_strs)) ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (1655,) + inhomogeneous part.
# Load sum and diff autos, checking to see whether any of them show packet loss
good_data = {}
info_dicts = {}
for sf, df in list(zip(auto_sums, auto_diffs)):
rv = io.read_hera_hdf5(sf, bls=candidate_autos)
good_data[sf] = rv['data']
info_dicts[sf] = rv['info']
diff = io.read_hera_hdf5(df, bls=candidate_autos)['data']
zeros_class = ant_class.even_odd_zeros_checker(good_data[sf], diff)
for ant in zeros_class.bad_ants:
candidate_autos.remove(utils.join_bl(ant, ant))
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In [8], line 5 3 info_dicts = {} 4 for sf, df in list(zip(auto_sums, auto_diffs)): ----> 5 rv = io.read_hera_hdf5(sf, bls=candidate_autos) 6 good_data[sf] = rv['data'] 7 info_dicts[sf] = rv['info'] NameError: name 'candidate_autos' is not defined
# load calibration solutions
cs = CalibrationSmoother(cal_files, load_cspa=False, load_chisq=False, pick_refant=False)
initial_cal_flags = np.all([f for f in cs.flag_grids.values()], axis=0)
def average_autos(per_file_autos, bls_to_use, auto_sums, cs):
'''Averages autos over baselines, matching the time_grid in CalibrationSmoother cs.'''
avg_per_file_autos = {sf: np.mean([per_file_autos[sf][bl] for bl in bls_to_use], axis=0) for sf in auto_sums}
avg_autos = np.zeros((len(cs.time_grid), len(cs.freqs)), dtype=float)
for sf, cf in zip(auto_sums, cs.cals):
avg_autos[cs.time_indices[cf], :] = np.abs(avg_per_file_autos[sf])
return avg_autos
avg_candidate_auto = average_autos(good_data, candidate_autos, auto_sums, cs)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In [12], line 1 ----> 1 avg_candidate_auto = average_autos(good_data, candidate_autos, auto_sums, cs) NameError: name 'candidate_autos' is not defined
def flag_FM(flags, freqs, freq_range=[87.5e6, 108e6]):
'''Apply flags to all frequencies within freq_range (in Hz).'''
flags[:, np.logical_and(freqs >= freq_range[0], freqs <= freq_range[1])] = True
flag_FM(initial_cal_flags, cs.freqs, freq_range=FM_freq_range)
def flag_sun(flags, times, max_solar_alt=0):
'''Apply flags to all times where the solar altitude is greater than max_solar_alt (in degrees).'''
solar_altitudes_degrees = utils.get_sun_alt(times)
flags[solar_altitudes_degrees >= max_solar_alt, :] = True
flag_sun(initial_cal_flags, cs.time_grid, max_solar_alt=MAX_SOLAR_ALT)
def predict_auto_noise(auto, dt, df, nsamples=1):
'''Predict noise on an (antenna-averaged) autocorrelation. The product of Delta t and Delta f
must be unitless. For N autocorrelations averaged together, use nsamples=N.'''
int_count = int(dt * df) * nsamples
return np.abs(auto) / np.sqrt(int_count / 2)
# Figure out noise and weights
int_time = 24 * 3600 * np.median(np.diff(cs.time_grid))
chan_res = np.median(np.diff(cs.freqs))
noise = predict_auto_noise(avg_candidate_auto, int_time, chan_res, nsamples=1)
wgts = np.where(initial_cal_flags, 0, noise**-2)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In [18], line 4 2 int_time = 24 * 3600 * np.median(np.diff(cs.time_grid)) 3 chan_res = np.median(np.diff(cs.freqs)) ----> 4 noise = predict_auto_noise(avg_candidate_auto, int_time, chan_res, nsamples=1) 5 wgts = np.where(initial_cal_flags, 0, noise**-2) NameError: name 'avg_candidate_auto' is not defined
# Filter every autocorrelation individually
XTXinv = None
models = {}
sqrt_mean_sqs = {}
for bl in candidate_autos:
auto_here = average_autos(good_data, [bl], auto_sums, cs)
time_filters, freq_filters = dpss_filters(freqs=cs.freqs, # Hz
times=cs.time_grid, # JD
freq_scale=FREQ_FILTER_SCALE,
time_scale=TIME_FILTER_SCALE,
eigenval_cutoff=EIGENVAL_CUTOFF)
models[bl], fit_info = solve_2D_DPSS(auto_here, wgts, time_filters, freq_filters, XTXinv=XTXinv)
XTXinv = fit_info['XTXinv']
noise_model = predict_auto_noise(models[bl], int_time, chan_res, nsamples=1)
sqrt_mean_sqs[bl] = np.nanmean(np.where(initial_cal_flags, np.nan, (auto_here - models[bl]) / noise_model)**2)**.5
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In [19], line 5 3 models = {} 4 sqrt_mean_sqs = {} ----> 5 for bl in candidate_autos: 6 auto_here = average_autos(good_data, [bl], auto_sums, cs) 7 time_filters, freq_filters = dpss_filters(freqs=cs.freqs, # Hz 8 times=cs.time_grid, # JD 9 freq_scale=FREQ_FILTER_SCALE, 10 time_scale=TIME_FILTER_SCALE, 11 eigenval_cutoff=EIGENVAL_CUTOFF) NameError: name 'candidate_autos' is not defined
# Pick best autocorrelations to filter on
L2_bound = max(np.quantile(list(sqrt_mean_sqs.values()), MIN_FRAC_OF_AUTOS), MAX_AUTO_L2)
good_auto_bls = [bl for bl in candidate_autos if sqrt_mean_sqs[bl] <= L2_bound]
print(f'Using {len(good_auto_bls)} out of {len(candidate_autos)} candidate autocorrelations ({len(good_auto_bls) / len(candidate_autos):.2%}).')
--------------------------------------------------------------------------- IndexError Traceback (most recent call last) Cell In [20], line 2 1 # Pick best autocorrelations to filter on ----> 2 L2_bound = max(np.quantile(list(sqrt_mean_sqs.values()), MIN_FRAC_OF_AUTOS), MAX_AUTO_L2) 3 good_auto_bls = [bl for bl in candidate_autos if sqrt_mean_sqs[bl] <= L2_bound] 4 print(f'Using {len(good_auto_bls)} out of {len(candidate_autos)} candidate autocorrelations ({len(good_auto_bls) / len(candidate_autos):.2%}).') File <__array_function__ internals>:200, in quantile(*args, **kwargs) File ~/mambaforge/envs/RTP/lib/python3.10/site-packages/numpy/lib/function_base.py:4461, in quantile(a, q, axis, out, overwrite_input, method, keepdims, interpolation) 4459 if not _quantile_is_valid(q): 4460 raise ValueError("Quantiles must be in the range [0, 1]") -> 4461 return _quantile_unchecked( 4462 a, q, axis, out, overwrite_input, method, keepdims) File ~/mambaforge/envs/RTP/lib/python3.10/site-packages/numpy/lib/function_base.py:4473, in _quantile_unchecked(a, q, axis, out, overwrite_input, method, keepdims) 4465 def _quantile_unchecked(a, 4466 q, 4467 axis=None, (...) 4470 method="linear", 4471 keepdims=False): 4472 """Assumes that q is in [0, 1], and is an ndarray""" -> 4473 return _ureduce(a, 4474 func=_quantile_ureduce_func, 4475 q=q, 4476 keepdims=keepdims, 4477 axis=axis, 4478 out=out, 4479 overwrite_input=overwrite_input, 4480 method=method) File ~/mambaforge/envs/RTP/lib/python3.10/site-packages/numpy/lib/function_base.py:3752, in _ureduce(a, func, keepdims, **kwargs) 3749 index_out = (0, ) * nd 3750 kwargs['out'] = out[(Ellipsis, ) + index_out] -> 3752 r = func(a, **kwargs) 3754 if out is not None: 3755 return out File ~/mambaforge/envs/RTP/lib/python3.10/site-packages/numpy/lib/function_base.py:4639, in _quantile_ureduce_func(a, q, axis, out, overwrite_input, method) 4637 else: 4638 arr = a.copy() -> 4639 result = _quantile(arr, 4640 quantiles=q, 4641 axis=axis, 4642 method=method, 4643 out=out) 4644 return result File ~/mambaforge/envs/RTP/lib/python3.10/site-packages/numpy/lib/function_base.py:4745, in _quantile(arr, quantiles, axis, method, out) 4737 arr.partition( 4738 np.unique(np.concatenate(([0, -1], 4739 previous_indexes.ravel(), 4740 next_indexes.ravel(), 4741 ))), 4742 axis=DATA_AXIS) 4743 if np.issubdtype(arr.dtype, np.inexact): 4744 slices_having_nans = np.isnan( -> 4745 take(arr, indices=-1, axis=DATA_AXIS) 4746 ) 4747 else: 4748 slices_having_nans = None File <__array_function__ internals>:200, in take(*args, **kwargs) File ~/mambaforge/envs/RTP/lib/python3.10/site-packages/numpy/core/fromnumeric.py:190, in take(a, indices, axis, out, mode) 93 @array_function_dispatch(_take_dispatcher) 94 def take(a, indices, axis=None, out=None, mode='raise'): 95 """ 96 Take elements from an array along an axis. 97 (...) 188 [5, 7]]) 189 """ --> 190 return _wrapfunc(a, 'take', indices, axis=axis, out=out, mode=mode) File ~/mambaforge/envs/RTP/lib/python3.10/site-packages/numpy/core/fromnumeric.py:57, in _wrapfunc(obj, method, *args, **kwds) 54 return _wrapit(obj, method, *args, **kwds) 56 try: ---> 57 return bound(*args, **kwds) 58 except TypeError: 59 # A TypeError occurs if the object does have such a method in its 60 # class, but its signature is not identical to that of NumPy's. This (...) 64 # Call _wrapit from within the except clause to ensure a potential 65 # exception has a traceback chain. 66 return _wrapit(obj, method, *args, **kwds) IndexError: cannot do a non-empty take from an empty axes.
extent = [cs.freqs[0]/1e6, cs.freqs[-1]/1e6, cs.time_grid[-1] - int(cs.time_grid[0]), cs.time_grid[0] - int(cs.time_grid[0])]
def plot_all_filtered_bls(N_per_row=8):
N_rows = int(np.ceil(len(candidate_autos) / N_per_row))
fig, axes = plt.subplots(N_rows, N_per_row, figsize=(14, 3 * N_rows), dpi=100,
sharex=True, sharey=True, gridspec_kw={'wspace': 0, 'hspace': .18})
for i, (ax, bl) in enumerate(zip(axes.flatten(), sorted(sqrt_mean_sqs.keys(), key=lambda bl: sqrt_mean_sqs[bl]))):
auto_here = average_autos(good_data, [bl], auto_sums, cs)
noise_model = predict_auto_noise(models[bl], int_time, chan_res, nsamples=1)
im = ax.imshow(np.where(initial_cal_flags, np.nan, (auto_here - models[bl]) / noise_model).real,
aspect='auto', interpolation='none', cmap='bwr', vmin=-10, vmax=10, extent=extent)
ax.set_title(f'{bl[0]}{bl[2][0]}: {sqrt_mean_sqs[bl]:.3}', color=('k' if sqrt_mean_sqs[bl] <= L2_bound else 'r'), fontsize=10)
if i == 0:
plt.colorbar(im, ax=axes, location='top', label=r'Autocorrelation z-score after DPSS filtering (with $\langle z^2 \rangle^{1/2}$)', extend='both', aspect=40, pad=.015)
if i % N_per_row == 0:
ax.set_ylabel(f'JD - {int(cs.time_grid[0])}')
for ax in axes[-1, :]:
ax.set_xlabel('Frequency (MHz)')
plt.tight_layout()
This figure shows the z-score waterfall of each antenna. Also shown is the square root of the mean of the square of each waterfall, as a metric of its instability. Antennas in red are excluded from the average of most stable antennas that are used for subsequent flagging.
plot_all_filtered_bls()
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In [23], line 1 ----> 1 plot_all_filtered_bls() Cell In [22], line 2, in plot_all_filtered_bls(N_per_row) 1 def plot_all_filtered_bls(N_per_row=8): ----> 2 N_rows = int(np.ceil(len(candidate_autos) / N_per_row)) 3 fig, axes = plt.subplots(N_rows, N_per_row, figsize=(14, 3 * N_rows), dpi=100, 4 sharex=True, sharey=True, gridspec_kw={'wspace': 0, 'hspace': .18}) 6 for i, (ax, bl) in enumerate(zip(axes.flatten(), sorted(sqrt_mean_sqs.keys(), key=lambda bl: sqrt_mean_sqs[bl]))): NameError: name 'candidate_autos' is not defined
# Compute average autocorrelation and DPSS filter it
avg_auto = average_autos(good_data, good_auto_bls, auto_sums, cs)
model, fit_info = solve_2D_DPSS(avg_auto, wgts, time_filters, freq_filters, XTXinv=XTXinv)
noise_model = predict_auto_noise(np.abs(model), int_time, chan_res, nsamples=len(good_auto_bls))
zscore = ((avg_auto - model) / noise_model).real
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In [24], line 2 1 # Compute average autocorrelation and DPSS filter it ----> 2 avg_auto = average_autos(good_data, good_auto_bls, auto_sums, cs) 3 model, fit_info = solve_2D_DPSS(avg_auto, wgts, time_filters, freq_filters, XTXinv=XTXinv) 4 noise_model = predict_auto_noise(np.abs(model), int_time, chan_res, nsamples=len(good_auto_bls)) NameError: name 'good_auto_bls' is not defined
def plot_z_score(flags, zscore):
plt.figure(figsize=(14,10), dpi=100)
plt.imshow(np.where(flags, np.nan, zscore.real), aspect='auto', cmap='bwr', interpolation='none', vmin=-10, vmax=10, extent=extent)
plt.colorbar(location='top', label='z score', extend='both')
plt.xlabel('Frequency (MHz)')
plt.ylabel(f'JD - {int(cs.time_grid[0])}')
plt.tight_layout()
This plot shows the z-score of a DPSS-filtered, deeply averaged autocorrelation, where the noise is inferred from the integration time, channel width, and DPSS model.
plot_z_score(initial_cal_flags, zscore)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In [26], line 1 ----> 1 plot_z_score(initial_cal_flags, zscore) NameError: name 'zscore' is not defined
# flag outliers and perform watershed for lesser outliers neighboring flags
round_1_flags = copy.deepcopy(initial_cal_flags)
round_1_flags[zscore > Z_THRESH] = True
ws_flags = xrfi._ws_flag_waterfall(zscore, round_1_flags, WS_Z_THRESH)
round_1_flags |= ws_flags
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In [27], line 3 1 # flag outliers and perform watershed for lesser outliers neighboring flags 2 round_1_flags = copy.deepcopy(initial_cal_flags) ----> 3 round_1_flags[zscore > Z_THRESH] = True 4 ws_flags = xrfi._ws_flag_waterfall(zscore, round_1_flags, WS_Z_THRESH) 5 round_1_flags |= ws_flags NameError: name 'zscore' is not defined
def iteratively_flag_on_averaged_zscore(flags, zscore, avg_z_thresh=1.5, verbose=True):
'''Flag whole integrations or channels based on average z-score. This is done
iteratively to prevent bad times affecting channel averages or vice versa.'''
flagged_chan_count = 0
flagged_int_count = 0
while True:
zspec = np.nanmean(np.where(flags, np.nan, zscore), axis=0)
ztseries = np.nanmean(np.where(flags, np.nan, zscore), axis=1)
if (np.nanmax(zspec) < avg_z_thresh) and (np.nanmax(ztseries) < avg_z_thresh):
break
if np.nanmax(zspec) >= np.nanmax(ztseries):
flagged_chan_count += np.sum(zspec >= max(ztseries))
flags[:, zspec >= np.nanmax(ztseries)] = True
else:
flagged_int_count += np.sum(ztseries >= max(zspec))
flags[ztseries >= np.nanmax(zspec), :] = True
if verbose:
print(f'Flagging an additional {flagged_int_count} integrations and {flagged_chan_count} channels.')
def impose_max_chan_flag_frac(flags, max_flag_frac=.25, verbose=True):
'''Flag channels already flagged more than max_flag_frac (excluding completely flagged times).'''
unflagged_times = ~np.all(flags, axis=1)
frequently_flagged_chans = np.mean(flags[unflagged_times, :], axis=0) >= max_flag_frac
if verbose:
print(f'Flagging {np.sum(frequently_flagged_chans) - np.sum(np.all(flags, axis=0))} channels previously flagged {max_flag_frac:.2%} or more.')
flags[:, frequently_flagged_chans] = True
def impose_max_time_flag_frac(flags, max_flag_frac=.25, verbose=True):
'''Flag times already flagged more than max_flag_frac (excluding completely flagged channels).'''
unflagged_chans = ~np.all(flags, axis=0)
frequently_flagged_times = np.mean(flags[:, unflagged_chans], axis=1) >= max_flag_frac
if verbose:
print(f'Flagging {np.sum(frequently_flagged_times) - np.sum(np.all(flags, axis=1))} times previously flagged {max_flag_frac:.2%} or more.')
flags[frequently_flagged_times, :] = True
# Flag whole integrations or channels
iteratively_flag_on_averaged_zscore(round_1_flags, zscore, avg_z_thresh=AVG_Z_THRESH, verbose=True)
impose_max_chan_flag_frac(round_1_flags, max_flag_frac=MAX_FREQ_FLAG_FRAC, verbose=True)
impose_max_time_flag_frac(round_1_flags, max_flag_frac=MAX_TIME_FLAG_FRAC, verbose=True)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In [29], line 2 1 # Flag whole integrations or channels ----> 2 iteratively_flag_on_averaged_zscore(round_1_flags, zscore, avg_z_thresh=AVG_Z_THRESH, verbose=True) 3 impose_max_chan_flag_frac(round_1_flags, max_flag_frac=MAX_FREQ_FLAG_FRAC, verbose=True) 4 impose_max_time_flag_frac(round_1_flags, max_flag_frac=MAX_TIME_FLAG_FRAC, verbose=True) NameError: name 'zscore' is not defined
This is the same as Figure 2, but includes additional flags identified based on a full 2D DPSS filter of this waterfall.
plot_z_score(round_1_flags, zscore)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In [30], line 1 ----> 1 plot_z_score(round_1_flags, zscore) NameError: name 'zscore' is not defined
noise = predict_auto_noise(avg_auto, int_time, chan_res, nsamples=len(good_auto_bls))
wgts = wgts = np.where(round_1_flags, 0, noise**-2)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In [31], line 1 ----> 1 noise = predict_auto_noise(avg_auto, int_time, chan_res, nsamples=len(good_auto_bls)) 2 wgts = wgts = np.where(round_1_flags, 0, noise**-2) NameError: name 'avg_auto' is not defined
time_filters, freq_filters = dpss_filters(freqs=cs.freqs, # Hz
times=cs.time_grid, # JD
freq_scale=FREQ_FILTER_SCALE,
time_scale=TIME_FILTER_SCALE,
eigenval_cutoff=EIGENVAL_CUTOFF)
model, fit_info = solve_2D_DPSS(avg_auto, wgts, time_filters, freq_filters)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In [32], line 6 1 time_filters, freq_filters = dpss_filters(freqs=cs.freqs, # Hz 2 times=cs.time_grid, # JD 3 freq_scale=FREQ_FILTER_SCALE, 4 time_scale=TIME_FILTER_SCALE, 5 eigenval_cutoff=EIGENVAL_CUTOFF) ----> 6 model, fit_info = solve_2D_DPSS(avg_auto, wgts, time_filters, freq_filters) NameError: name 'avg_auto' is not defined
noise_model = predict_auto_noise(np.abs(model), int_time, chan_res, nsamples=len(good_auto_bls))
zscore = ((avg_auto - model) / noise_model).real
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In [33], line 1 ----> 1 noise_model = predict_auto_noise(np.abs(model), int_time, chan_res, nsamples=len(good_auto_bls)) 2 zscore = ((avg_auto - model) / noise_model).real NameError: name 'model' is not defined
round_2_flags = np.zeros_like(round_1_flags)
# flag any integrations fully-flagged by the notebooks (also accounts for missing data)
round_2_flags[np.all(initial_cal_flags, axis=1), :] = True
# flag on FM and sun-up data
flag_FM(round_2_flags, cs.freqs, freq_range=FM_freq_range)
flag_sun(round_2_flags, cs.time_grid, max_solar_alt=MAX_SOLAR_ALT)
# flag any round 1 flags that are still moderately high z-score
round_2_flags[round_1_flags & (zscore > REPEAT_FLAG_Z_THRESH)] = True
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In [34], line 11 8 flag_sun(round_2_flags, cs.time_grid, max_solar_alt=MAX_SOLAR_ALT) 10 # flag any round 1 flags that are still moderately high z-score ---> 11 round_2_flags[round_1_flags & (zscore > REPEAT_FLAG_Z_THRESH)] = True NameError: name 'zscore' is not defined
# Flag outliers and then perform watershed flagging
round_2_flags[zscore.real > Z_THRESH] = True
ws_flags = xrfi._ws_flag_waterfall(zscore.real, round_2_flags, WS_Z_THRESH)
round_2_flags |= ws_flags
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In [35], line 2 1 # Flag outliers and then perform watershed flagging ----> 2 round_2_flags[zscore.real > Z_THRESH] = True 3 ws_flags = xrfi._ws_flag_waterfall(zscore.real, round_2_flags, WS_Z_THRESH) 4 round_2_flags |= ws_flags NameError: name 'zscore' is not defined
# Flag whole integrations or channels
iteratively_flag_on_averaged_zscore(round_2_flags, zscore, avg_z_thresh=AVG_Z_THRESH, verbose=True)
impose_max_chan_flag_frac(round_2_flags, max_flag_frac=MAX_FREQ_FLAG_FRAC, verbose=True)
impose_max_time_flag_frac(round_2_flags, max_flag_frac=MAX_TIME_FLAG_FRAC, verbose=True)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In [36], line 2 1 # Flag whole integrations or channels ----> 2 iteratively_flag_on_averaged_zscore(round_2_flags, zscore, avg_z_thresh=AVG_Z_THRESH, verbose=True) 3 impose_max_chan_flag_frac(round_2_flags, max_flag_frac=MAX_FREQ_FLAG_FRAC, verbose=True) 4 impose_max_time_flag_frac(round_2_flags, max_flag_frac=MAX_TIME_FLAG_FRAC, verbose=True) NameError: name 'zscore' is not defined
This is the same as Figures 2 and 3, but now includes only the final set of flags.
plot_z_score(round_2_flags, zscore)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In [37], line 1 ----> 1 plot_z_score(round_2_flags, zscore) NameError: name 'zscore' is not defined
def summarize_flagging():
plt.figure(figsize=(14,10), dpi=100)
cmap = matplotlib.colors.ListedColormap(((0, 0, 0),) + matplotlib.cm.get_cmap("Set2").colors[:3])
plt.imshow(np.where(initial_cal_flags & round_2_flags, 1, np.where(initial_cal_flags, 2, np.where(round_2_flags, 3, 0))),
aspect='auto', cmap=cmap, interpolation='none', extent=extent)
plt.clim([-.5, 3.5])
cbar = plt.colorbar(location='top', aspect=40, pad=.02)
cbar.set_ticks([0, 1, 2, 3])
cbar.set_ticklabels(['Unflagged', 'Flagged by both file_calibration and here', 'Flagged by file_calibration only', 'Flagged here only'])
plt.xlabel('Frequency (MHz)')
plt.ylabel(f'JD - {int(cs.time_grid[0])}')
plt.tight_layout()
This plot shows which times and frequencies were flagged by either the file_calibration notebook (which also includes a priori flags imposed here like FM), which ones were flagged only in this notebook, and which ones were flagged consistently (and often independently) in both.
summarize_flagging()
add_to_history = 'Produced by full_day_rfi notebook with the following environment:\n' + '=' * 65 + '\n' + os.popen('conda env export').read() + '=' * 65
out_flag_files = [auto_sum.replace(SUM_AUTOS_SUFFIX, OUT_FLAG_SUFFIX) for auto_sum in auto_sums]
for auto_sum, cal, out_flag_file in zip(auto_sums, cs.cals, out_flag_files):
# create UVFlag object based on UVData
uvd = UVData()
uvd.read(auto_sum)
uvf = UVFlag(uvd, waterfall=True, mode='flag')
uvf.use_future_array_shapes()
# fill out flags
for p in range(uvf.Npols):
uvf.flag_array[:, :, p] = round_2_flags[cs.time_indices[cal], :]
# write to disk
uvf.history += add_to_history
uvf.write(out_flag_file, clobber=True)
print(f'Saved {len(out_flag_files)} *.{OUT_FLAG_SUFFIX} files starting with {out_flag_files[0]}.')
Saved 1655 *.sum.flag_waterfall.h5 files starting with /mnt/sn1/2460100/zen.2460100.21285.sum.flag_waterfall.h5.
for repo in ['hera_cal', 'hera_qm', 'hera_filters', 'hera_notebook_templates', 'pyuvdata']:
exec(f'from {repo} import __version__')
print(f'{repo}: {__version__}')
hera_cal: 3.2.3.dev158+gd5cadd5 hera_qm: 2.1.1.dev3+gb291d34 hera_filters: 0.1.0 hera_notebook_templates: 0.1.dev486+gfb8560a pyuvdata: 2.3.0
print(f'Finished execution in {(time.time() - tstart) / 60:.2f} minutes.')
Finished execution in 10.05 minutes.