Full-Day Autocorrelation Checking¶
by Josh Dillon and Steven Murray, last updated March 14, 2024
This notebook is designed to assess per-day data quality from just autocorrelations, enabling a quick assessment of whether the day is worth pushing through further analysis. In particular, it is designed to look for times that are particularly contaminated by broadband RFI (e.g. from lightning), picking out fraction of days worth analyzing. It's output is a an a priori flag yaml file readable by hera_qm.metrics_io
functions read_a_priori_chan_flags()
, read_a_priori_int_flags()
, and read_a_priori_ant_flags()
.
Here's a set of links to skip to particular figures:
• Figure 1: Preliminary Array Flag Fraction Summary¶
• Figure 2: z-Score of DPSS-Filtered, Averaged Good Autocorrelation and Initial Flags¶
• Figure 3: Proposed A Priori Time Flags Based on Frequency-Averaged and Convolved z-Score Magnitude¶
import time
tstart = time.time()
import os
os.environ['HDF5_USE_FILE_LOCKING'] = 'FALSE'
import h5py
import hdf5plugin # REQUIRED to have the compression plugins available
import numpy as np
import pandas as pd
import glob
import os
import matplotlib.pyplot as plt
import matplotlib
import copy
from scipy.ndimage import convolve
from hera_cal import io, utils
from hera_cal.smooth_cal import dpss_filters, solve_2D_DPSS
from hera_qm import ant_class, xrfi, metrics_io
from hera_qm.time_series_metrics import true_stretches, impose_max_flag_gap, metric_convolution_flagging
from hera_filters import dspec
import warnings
from IPython.display import display, HTML
%matplotlib inline
display(HTML("<style>.container { width:100% !important; }</style>"))
_ = np.seterr(all='ignore') # get rid of red warnings
%config InlineBackend.figure_format = 'retina'
Parse input and output files¶
# get filenames
AUTO_FILE = os.environ.get("AUTO_FILE", None)
# AUTO_FILE = '/lustre/aoc/projects/hera/h6c-analysis/IDR2/2459869/zen.2459869.25299.sum.autos.uvh5'
SUM_AUTOS_SUFFIX = os.environ.get("SUM_AUTOS_SUFFIX", 'sum.autos.uvh5')
DIFF_AUTOS_SUFFIX = os.environ.get("DIFF_AUTOS_SUFFIX", 'diff.autos.uvh5')
OUT_YAML_SUFFIX = os.environ.get("OUT_YAML_SUFFIX", '_apriori_flags.yaml')
OUT_YAML_DIR = os.environ.get("OUT_YAML_DIR", None)
# OUT_YAML_DIR = '/lustre/aoc/projects/hera/jsdillon/H6C/'
if OUT_YAML_DIR is None:
OUT_YAML_DIR = os.path.dirname(AUTO_FILE)
auto_sums_glob = '.'.join(AUTO_FILE.split('.')[:-4]) + '.*.' + SUM_AUTOS_SUFFIX
auto_diffs_glob = auto_sums_glob.replace(SUM_AUTOS_SUFFIX, DIFF_AUTOS_SUFFIX)
out_yaml_file = os.path.join(OUT_YAML_DIR, AUTO_FILE.split('.')[-5] + OUT_YAML_SUFFIX)
auto_sums = sorted(glob.glob(auto_sums_glob))
print(f'Found {len(auto_sums)} *.{SUM_AUTOS_SUFFIX} files starting with {auto_sums[0]}.')
auto_diffs = sorted(glob.glob(auto_diffs_glob))
print(f'Found {len(auto_diffs)} *.{DIFF_AUTOS_SUFFIX} files starting with {auto_diffs[0]}.')
Found 1572 *.sum.autos.uvh5 files starting with /mnt/sn1/data1/2460451/zen.2460451.16874.sum.autos.uvh5. Found 1572 *.diff.autos.uvh5 files starting with /mnt/sn1/data1/2460451/zen.2460451.16874.diff.autos.uvh5.
Parse settings¶
# bounds on zeros in spectra
good_zeros_per_eo_spectrum = (0, int(os.environ.get("MAX_ZEROS_PER_EO_SPEC_GOOD", 2)))
suspect_zeros_per_eo_spectrum = (0, int(os.environ.get("MAX_ZEROS_PER_EO_SPEC_SUSPECT", 8)))
# bounds on autocorrelation power
auto_power_good = (float(os.environ.get("AUTO_POWER_GOOD_LOW", 5)), float(os.environ.get("AUTO_POWER_GOOD_HIGH", 30)))
auto_power_suspect = (float(os.environ.get("AUTO_POWER_SUSPECT_LOW", 1)), float(os.environ.get("AUTO_POWER_SUSPECT_HIGH", 60)))
# bounds on autocorrelation slope
auto_slope_good = (float(os.environ.get("AUTO_SLOPE_GOOD_LOW", -0.4)), float(os.environ.get("AUTO_SLOPE_GOOD_HIGH", 0.4)))
auto_slope_suspect = (float(os.environ.get("AUTO_SLOPE_SUSPECT_LOW", -0.6)), float(os.environ.get("AUTO_SLOPE_SUSPECT_HIGH", 0.6)))
# bounds on autocorrelation RFI
auto_rfi_good = (0, float(os.environ.get("AUTO_RFI_GOOD", 0.015)))
auto_rfi_suspect = (0, float(os.environ.get("AUTO_RFI_SUSPECT", 0.03)))
# bounds on autocorrelation shape
auto_shape_good = (0, float(os.environ.get("AUTO_SHAPE_GOOD", 0.1)))
auto_shape_suspect = (0, float(os.environ.get("AUTO_SHAPE_SUSPECT", 0.2)))
# parse RFI settings for antenna flagging
RFI_DPSS_HALFWIDTH = float(os.environ.get("RFI_DPSS_HALFWIDTH", 300e-9)) # in s
RFI_NSIG = float(os.environ.get("RFI_NSIG", 6))
# parse settings for identifying mislabeled data by X-engine
BAD_XENGINE_ZCUT = float(os.environ.get("BAD_XENGINE_ZCUT", 10))
# DPSS settings for full-day filtering
FREQ_FILTER_SCALE = float(os.environ.get("FREQ_FILTER_SCALE", 5.0)) # in MHz
TIME_FILTER_SCALE = float(os.environ.get("TIME_FILTER_SCALE", 450.0)) # in s
EIGENVAL_CUTOFF = float(os.environ.get("EIGENVAL_CUTOFF", 1e-12))
# A priori flag settings
FM_LOW_FREQ = float(os.environ.get("FM_LOW_FREQ", 87.5)) # in MHz
FM_HIGH_FREQ = float(os.environ.get("FM_HIGH_FREQ", 108.0)) # in MHz
FM_freq_range = [FM_LOW_FREQ * 1e6, FM_HIGH_FREQ * 1e6]
MAX_SOLAR_ALT = float(os.environ.get("MAX_SOLAR_ALT", 0.0)) # in degrees
SMOOTHED_ABS_Z_THRESH = float(os.environ.get("SMOOTHED_ABS_Z_THRESH", 10))
WHOLE_DAY_FLAG_THRESH = float(os.environ.get("WHOLE_DAY_FLAG_THRESH", 0.5))
for setting in ['good_zeros_per_eo_spectrum', 'suspect_zeros_per_eo_spectrum', 'auto_power_good', 'auto_power_suspect',
'auto_slope_good', 'auto_slope_suspect', 'auto_rfi_good', 'auto_rfi_suspect',
'auto_shape_good', 'auto_shape_suspect', 'BAD_XENGINE_ZCUT', 'RFI_DPSS_HALFWIDTH', 'RFI_NSIG',
'FREQ_FILTER_SCALE', 'TIME_FILTER_SCALE', 'EIGENVAL_CUTOFF', 'FM_freq_range',
'MAX_SOLAR_ALT', 'SMOOTHED_ABS_Z_THRESH', 'WHOLE_DAY_FLAG_THRESH']:
print(f'{setting} = {eval(setting)}')
good_zeros_per_eo_spectrum = (0, 2) suspect_zeros_per_eo_spectrum = (0, 8) auto_power_good = (5.0, 30.0) auto_power_suspect = (1.0, 60.0) auto_slope_good = (-0.4, 0.4) auto_slope_suspect = (-0.6, 0.6) auto_rfi_good = (0, 1.5) auto_rfi_suspect = (0, 2.0) auto_shape_good = (0, 0.1) auto_shape_suspect = (0, 0.2) BAD_XENGINE_ZCUT = 10.0 RFI_DPSS_HALFWIDTH = 3e-07 RFI_NSIG = 4.0 FREQ_FILTER_SCALE = 5.0 TIME_FILTER_SCALE = 450.0 EIGENVAL_CUTOFF = 1e-12 FM_freq_range = [87500000.0, 108000000.0] MAX_SOLAR_ALT = 0.0 SMOOTHED_ABS_Z_THRESH = 10.0 WHOLE_DAY_FLAG_THRESH = 0.5
Classify Antennas and Find RFI Per-File¶
cache = {}
def classify_autos_and_preliminary_rfi(auto_sum_file, auto_diff_file):
hd_sum = io.HERADataFastReader(auto_sum_file)
sum_autos, _, _ = hd_sum.read(read_flags=False, read_nsamples=False)
hd_diff = io.HERADataFastReader(auto_diff_file)
diff_autos, _, _ = hd_diff.read(read_flags=False, read_nsamples=False)
ants = sorted(set([ant for bl in hd_sum.bls for ant in utils.split_bl(bl)]))
zeros_class = ant_class.even_odd_zeros_checker(sum_autos, diff_autos, good=good_zeros_per_eo_spectrum, suspect=suspect_zeros_per_eo_spectrum)
auto_power_class = ant_class.auto_power_checker(sum_autos, good=auto_power_good, suspect=auto_power_suspect)
auto_slope_class = ant_class.auto_slope_checker(sum_autos, good=auto_slope_good, suspect=auto_slope_suspect, edge_cut=100, filt_size=17)
auto_rfi_class = ant_class.auto_rfi_checker(sum_autos, good=auto_rfi_good, suspect=auto_rfi_suspect,
filter_half_widths=[RFI_DPSS_HALFWIDTH], nsig=RFI_NSIG, cache=cache)
auto_class = auto_power_class + auto_slope_class + auto_rfi_class
final_class = zeros_class + auto_class
if len(final_class.good_ants) + len(final_class.suspect_ants) > 0:
# Compute int_count for all unflagged autocorrelations averaged together
int_time = 24 * 3600 * np.median(np.diff(sum_autos.times_by_bl[hd_sum.bls[0][0:2]]))
chan_res = np.median(np.diff(sum_autos.freqs))
int_count = int(int_time * chan_res) * (len(final_class.good_ants) + len(final_class.suspect_ants))
avg_auto = {(-1, -1, 'ee'): np.mean([sum_autos[bl] for bl in hd_sum.bls if final_class[utils.split_bl(bl)[0]] != 'bad'], axis=0)}
# Flag RFI first with channel differences and then with DPSS
antenna_flags, _ = xrfi.flag_autos(avg_auto, int_count=int_count, nsig=(RFI_NSIG * 5))
_, rfi_flags = xrfi.flag_autos(avg_auto, int_count=int_count, flag_method='dpss_flagger',
flags=antenna_flags, freqs=sum_autos.freqs, filter_centers=[0],
filter_half_widths=[RFI_DPSS_HALFWIDTH], eigenval_cutoff=[1e-9], nsig=RFI_NSIG)
# Classify on auto shapes
auto_shape_class = ant_class.auto_shape_checker(sum_autos, good=auto_shape_good, suspect=auto_shape_suspect,
flag_spectrum=np.sum(rfi_flags, axis=0).astype(bool), antenna_class=final_class)
final_class += auto_shape_class
# Classify on excess power in X-engine diffs
xengine_diff_class = ant_class.non_noiselike_diff_by_xengine_checker(sum_autos, diff_autos, flag_waterfall=rfi_flags,
antenna_class=final_class,
xengine_chans=96, bad_xengine_zcut=BAD_XENGINE_ZCUT)
final_class += xengine_diff_class
else:
rfi_flags = np.ones((len(sum_autos.times), len(sum_autos.freqs)), dtype=bool)
return final_class, rfi_flags
classifications = {}
preliminary_rfi_flags = {}
with warnings.catch_warnings():
warnings.simplefilter("ignore")
for auto_sum_file, auto_diff_file in list(zip(auto_sums, auto_diffs)):
classifications[auto_sum_file], preliminary_rfi_flags[auto_sum_file] = classify_autos_and_preliminary_rfi(auto_sum_file, auto_diff_file)
all_ants = set([ant for auto_sum_file in classifications for ant in classifications[auto_sum_file]])
ant_flag_fracs = {ant: 0 for ant in all_ants}
for classification in classifications.values():
for ant in classification.bad_ants:
ant_flag_fracs[ant] += 1
ant_flag_fracs = {ant: ant_flag_fracs[ant] / len(classifications) for ant in all_ants}
def flag_frac_array_plot():
fig, axes = plt.subplots(1, 2, figsize=(14, 8), dpi=100, gridspec_kw={'width_ratios': [2, 1]})
def flag_frac_panel(ax, antnums, radius=7, legend=False):
ang_dict = {'Jee': (225, 405), 'Jnn': (45, 225)}
hd_sum = io.HERADataFastReader(auto_sums[-1])
xpos = np.array([hd_sum.antpos[ant[0]][0] for ant in all_ants if ant[0] in antnums])
ypos = np.array([hd_sum.antpos[ant[0]][1] for ant in all_ants if ant[0] in antnums])
scatter = ax.scatter(xpos, ypos, c='w', s=0)
for ant in all_ants:
antnum, pol = ant
if antnum in antnums:
ax.add_artist(matplotlib.patches.Wedge(tuple(hd_sum.antpos[antnum][0:2]), radius, *ang_dict[pol], color='grey'))
flag_frac = ant_flag_fracs[ant]
if flag_frac > .05:
ax.add_artist(matplotlib.patches.Wedge(tuple(hd_sum.antpos[antnum][0:2]), radius * np.sqrt(flag_frac), *ang_dict[pol], color='r'))
ax.text(hd_sum.antpos[antnum][0], hd_sum.antpos[antnum][1], str(antnum), color='w', va='center', ha='center', zorder=100)
ax.axis('equal')
ax.set_xlim([np.min(xpos) - radius * 2, np.max(xpos) + radius * 2])
ax.set_ylim([np.min(ypos) - radius * 2, np.max(ypos) + radius * 2])
ax.set_xlabel("East-West Position (meters)", size=12)
ax.set_ylabel("North-South Position (meters)", size=12)
if legend:
legend_objs = []
legend_labels = []
legend_objs.append(matplotlib.lines.Line2D([0], [0], marker='o', color='w', markeredgecolor='grey', markerfacecolor='grey', markersize=15))
unflagged_nights = lambda pol: np.sum([1 - ant_flag_fracs[ant] for ant in all_ants if ant[-1] == pol])
legend_labels.append((' \u2571\n').join([f'{unflagged_nights(pol):.1f} unflagged {pol[-1]}-polarized\nantenna-nights.' for pol in ['Jee', 'Jnn']]))
legend_objs.append(matplotlib.lines.Line2D([0], [0], marker='o', color='w', markeredgecolor='red', markerfacecolor='red', markersize=15))
unflagged_nights = lambda pol: np.sum([ant_flag_fracs[ant] for ant in all_ants if ant[-1] == pol])
legend_labels.append((' \u2571\n').join([f'{unflagged_nights(pol):.1f} flagged {pol[-1]}-polarized\nantenna-nights.' for pol in ['Jee', 'Jnn']]))
ax.legend(legend_objs, legend_labels, ncol=1, fontsize=12)
flag_frac_panel(axes[0], sorted(set([ant[0] for ant in all_ants if ant[0] < 320])), radius=7)
flag_frac_panel(axes[1], sorted(set([ant[0] for ant in all_ants if ant[0] >= 320])), radius=50, legend=True)
plt.tight_layout()
Figure 1: Preliminary Array Flag Fraction Summary¶
Per-antenna flagging fraction of data based purely on metrics that only use autocorrelations. This is likely an underestimate of flags, since it ignores low correlation, cross-polarized antennas, and high redcal $\chi^2$, among other factors.
flag_frac_array_plot()