
Minor fixes for handling of older SMA data #1399

Merged
merged 6 commits on Feb 13, 2024
8 changes: 8 additions & 0 deletions CHANGELOG.md
@@ -36,6 +36,12 @@ positions are near surface of whatever celestial body their positions are referenced to
(either the Earth or Moon, currently).

### Changed
- Made the determination of whether or not to create a flex-pol dataset when reading
in a MIR dataset more robust (particularly with pre-V3 data formats).
- Reading MIR data sets into `UVData` objects will now use pyuvdata-calculated
values for `lst_array` (via `UVData.set_lsts_from_time_array`) instead of those read
in from the file, due to known precision issues in the latter (~25 ms), so long as
the two agree within this known precision limit.
- `UVFlag.to_baseline` and `UVFlag.to_antenna` are now more robust to differences
in antenna metadata sorting.
- Made `MirParser` more robust against metadata indexing errors.
@@ -68,6 +74,8 @@ fully tested and didn't work properly.
- Having `freq_array` and `channel_width` defined on wide-band UVCal objects.

### Fixed
- A bug where `time_array` was not being correctly calculated for older (pre-V3) MIR
data formats.
- A small bug in UVFlag that could occur when reading in an older UVFlag HDF5
file with missing antenna metadata.
- A small bug (mostly affecting continuous integration) that threw an error when the
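A minimal sketch of the behavior described in the `Changed` entry above, assuming a placeholder path to an SMA MIR dataset:

```python
from pyuvdata import UVData

# Placeholder path to an SMA MIR-format dataset.
uv = UVData.from_file("path/to/track.mir")

# On read, pyuvdata now silently substitutes its own calculated LSTs when
# they agree with the file values to within ~25 ms. For older tracks that
# disagree by more, the read warns and keeps the file values; recomputing
# then makes lst_array self-consistent with time_array.
uv.set_lsts_from_time_array()
```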
1 change: 1 addition & 0 deletions pyuvdata/tests/test_utils.py
@@ -4143,6 +4143,7 @@ def test_uvw_track_generator_errs():
@pytest.mark.parametrize("use_uvw", [False, True])
@pytest.mark.parametrize("use_earthloc", [False, True])
@pytest.mark.filterwarnings("ignore:The lst_array is not self-consistent")
@pytest.mark.filterwarnings("ignore:> 25 ms errors detected reading in LST values")
def test_uvw_track_generator(flip_u, use_uvw, use_earthloc):
sma_mir = UVData.from_file(os.path.join(DATA_PATH, "sma_test.mir"))
sma_mir.set_lsts_from_time_array()
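Outside of pytest, the new warning can be suppressed the same way with the standard library's warning filters — a sketch, with the data path again a placeholder:

```python
import warnings

from pyuvdata import UVData

with warnings.catch_warnings():
    # Ignore only the LST-precision warning; anything else still surfaces.
    warnings.filterwarnings(
        "ignore", message="> 25 ms errors detected reading in LST values"
    )
    uv = UVData.from_file("path/to/older_track.mir")
```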
176 changes: 103 additions & 73 deletions pyuvdata/uvdata/mir.py
@@ -343,16 +343,22 @@ def _init_from_mir_parser(
if len(pol_dict) == 2:
# If dual-pol, then we need to check if the tunings are split, because
# the two polarizations will effectively be concat'd across the freq
# axis instead of the pol axis.
# axis instead of the pol axis. First, see if we have two diff receivers
rxa_mask = ant1_rxa_mask & ant2_rxa_mask
rxb_mask = ~(ant1_rxa_mask | ant2_rxa_mask)

if np.any(rxa_mask) and np.any(rxb_mask):
# If we have both VV and HH data, check to see that the tunings of the
# two receivers match.
loa_freq = np.median(mir_data.sp_data["gunnLO"][rxa_mask])
lob_freq = np.median(mir_data.sp_data["gunnLO"][rxb_mask])
pol_split_tuning = not np.isclose(loa_freq, lob_freq)
# If we have both VV and HH data, check to see that the frequencies of
# each of the spectral chunks match. If they do, then we can concat
# across the polarization axis, but if _not_, we should treat this as
# a pol-split data set.
fsky_vals = mir_data.sp_data["fsky"]
chunk_vals = mir_data.sp_data["corrchunk"]
loa_chunks = set(zip(fsky_vals[rxa_mask], chunk_vals[rxa_mask]))
lob_chunks = set(zip(fsky_vals[rxb_mask], chunk_vals[rxb_mask]))
pol_split_tuning = not (
loa_chunks.issubset(lob_chunks) or lob_chunks.issubset(loa_chunks)
)

# Map MIR pol code to pyuvdata/AIPS polarization number
pol_code_dict = {}
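
To see why the subset check above is more robust than the old median-`gunnLO` comparison, here is the same logic run on toy metadata — all values hypothetical, with `fsky` in GHz and `corrchunk` as correlator window numbers:

```python
import numpy as np

# Hypothetical per-spectral-record metadata for a dual-receiver track.
fsky_vals = np.array([230.5, 231.5, 230.5, 231.5])  # sky frequency (GHz)
chunk_vals = np.array([1, 2, 1, 2])  # correlator chunk number
rxa_mask = np.array([True, True, False, False])  # receiver A records
rxb_mask = ~rxa_mask  # receiver B records

loa_chunks = set(zip(fsky_vals[rxa_mask], chunk_vals[rxa_mask]))
lob_chunks = set(zip(fsky_vals[rxb_mask], chunk_vals[rxb_mask]))

# Matching (fsky, corrchunk) pairs on both receivers -> concat across the
# pol axis; non-overlapping pairs would mark the data set as pol-split.
pol_split_tuning = not (
    loa_chunks.issubset(lob_chunks) or lob_chunks.issubset(loa_chunks)
)
print(pol_split_tuning)  # False: both receivers share the same tuning
```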
@@ -410,6 +416,16 @@
spdx_dict = {}
spw_dict = {}
for spdx in set(spdx_list):
# We need to do some extra handling here, because a single correlator
# can produce multiple spectral windows (e.g., LSB/USB). The scheme below
# will negate the corr band number if LSB, will set the corr band number to
# 255 if the values arise from the pseudo-wideband values, and will add 512
# if the pols are split-tuned. This scheme, while a little funky, guarantees
# that each unique freq range has its own spectral window number.
spw_id = 255 if (spdx[0] == 0) else spdx[0]
spw_id *= (-1) ** (1 + spdx[1])
spw_id += 512 if (pol_split_tuning and spdx[2] == 1) else 0

data_mask = np.array([spdx == item for item in spdx_list])

# Grab values, get them into appropriate types
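
The window-numbering scheme described in the comment above can be summarized as a small pure function — a sketch using my own argument names (`corrchunk`, `sideband`, `pol_idx`) for the three elements of `spdx`:

```python
def mir_spw_id(corrchunk: int, sideband: int, pol_idx: int, pol_split_tuning: bool) -> int:
    """Sketch of the spectral window numbering scheme described above."""
    # Pseudo-wideband records (corrchunk 0) get the sentinel value 255.
    spw_id = 255 if corrchunk == 0 else corrchunk
    # Negate for LSB (sideband 0); leave positive for USB (sideband 1).
    spw_id *= (-1) ** (1 + sideband)
    # Offset the second pol/receiver by 512 when the tuning is pol-split.
    spw_id += 512 if (pol_split_tuning and pol_idx == 1) else 0
    return spw_id

print(mir_spw_id(3, 0, 0, False))  # -3: chunk 3, LSB
print(mir_spw_id(0, 1, 0, False))  # 255: pseudo-wideband, USB
print(mir_spw_id(2, 1, 1, True))   # 514: chunk 2, USB, split-tuned
```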
@@ -419,25 +435,21 @@

# Make sure that something weird hasn't happened with the metadata (this
# really should never happen, only one value should exist per window).
assert np.allclose(spw_fsky, mir_data.sp_data["fsky"][data_mask])
assert np.allclose(spw_fres, mir_data.sp_data["fres"][data_mask])
assert np.allclose(spw_nchan, mir_data.sp_data["nch"][data_mask])
for val, item in zip(
[spw_fsky, spw_fres, spw_nchan], ["fsky", "fres", "nch"]
):
if not np.allclose(val, mir_data.sp_data[item][data_mask]):
warnings.warn(
"Discrepancy in %s for win %i sb %i pol %i. Values of "
"`freq_array` and `channel_width` should be checked for "
"channels corresponding to spw_id %i." % (item, *spdx, spw_id)
)

# Get the data in the right units and dtype
spw_fsky = float(spw_fsky * 1e9) # GHz -> Hz
spw_fres = float(spw_fres * 1e6) # MHz -> Hz
spw_nchan = int(spw_nchan)

# We need to do a some extra handling here, because a single correlator
# can produce multiple spectral windows (e.g., LSB/USB). The scheme below
# will negate the corr band number if LSB, will set the corr band number to
# 255 if the values arise from the pseudo-wideband values, and will add 512
# if the pols are split-tuned. This scheme, while a little funky, guarantees
# that each unique freq range has its own spectral window number.
spw_id = 255 if (spdx[0] == 0) else spdx[0]
spw_id *= (-1) ** (1 + spdx[1])
spw_id += 512 if (pol_split_tuning and spdx[2] == 1) else 0

# Populate the channel width array
channel_width = abs(spw_fres) + np.zeros(spw_nchan, dtype=np.float64)

@@ -499,57 +511,6 @@
for key in spdx_dict:
spdx_dict[key]["ch_slice"] = spw_dict[spdx_dict[key]["spw_id"]]["ch_slice"]

# Create arrays to plug visibilities and flags into. The array is shaped this
# way since when reading in a MIR file, we scan through the blt-axis the
# slowest and the freq-axis the fastest (i.e., the data is roughly ordered by
# blt, pol, freq).
self.data_array = np.zeros((Nblts, Npols, Nfreqs), dtype=np.complex64)
self.flag_array = np.ones((Nblts, Npols, Nfreqs), dtype=bool)
self.nsample_array = np.zeros((Nblts, Npols, Nfreqs), dtype=np.float32)

# Get a list of the current inhid values for later
inhid_list = mir_data.in_data["inhid"].copy()

# Store a backup of the selection masks
backup_masks = {}
for item, obj in mir_data._metadata_attrs.items():
backup_masks[item] = obj._mask.copy()

# If no data is loaded, we want to load subsets of data to start rather than
# the whole block in one go, since this will save us 2x in memory.
inhid_step = len(inhid_list)
if (mir_data.vis_data is None) and (mir_data.auto_data is None):
inhid_step = (inhid_step // 8) + 1

for start in range(0, len(inhid_list), inhid_step):
# If no data is loaded, load up a quarter of the data at a time. This
# keeps the "extra" memory usage below that for the nsample_array attr,
# which is generated and filled _after_ this step (thus no extra memory
# should be required)
if (mir_data.vis_data is None) and (mir_data.auto_data is None):
# Note that the masks are combined via and operation.
mir_data.select(
where=("inhid", "eq", inhid_list[start : start + inhid_step])
)

# Call this convenience function in case we need to run the data filling
# multiple times (if we are loading up subsets of data)
self._prep_and_insert_data(
mir_data,
sphid_dict,
spdx_dict,
blhid_blt_order,
apply_flags=apply_flags,
apply_tsys=apply_tsys,
apply_dedoppler=apply_dedoppler,
)

for item, obj in mir_data._metadata_attrs.items():
# Because the select operation messes with the masks, we want to restore
# those in case we mucked with them earlier (so that subsequent selects
# behave as expected).
obj._mask = backup_masks[item].copy()

# Now assign our flexible arrays to the object itself
self.freq_array = freq_array
self.Nfreqs = Nfreqs
@@ -684,7 +645,6 @@ def _init_from_mir_parser(
for blt_key in blt_temp_dict.keys():
temp_dict = blt_temp_dict[blt_key]
integration_time[blt_key] = temp_dict["rinteg"]
# TODO: Using MIR V3 convention for lst, will need make it V2 compatible.
lst_array[blt_key] = temp_dict["lst"] * (np.pi / 12.0) # Hours -> rad
mjd_array[blt_key] = temp_dict["mjd"]
ant_1_array[blt_key] = temp_dict["iant1"]
@@ -702,9 +662,30 @@
self.baseline_array = self.antnums_to_baseline(
self.ant_1_array, self.ant_2_array, attempt256=False
)
self.integration_time = integration_time
self.lst_array = lst_array
self.time_array = Time(mjd_array, scale="tt", format="mjd").utc.jd
self.integration_time = integration_time

# There is a minor issue w/ MIR datasets where the LSTs are calculated via
# a polled average rather than calculated for the mid-point, which results
# in some imprecision (and some nuisance warnings). Fix this by calculating the
# LSTs here, checking to make sure that they agree to within the expected
# precision (sampling rate is 20 Hz, and the max error to worry about is half a
# sample: 25 ms, or in radians, 2*pi/(40 * 86400) = pi / 1728000.
# TODO: Re-evaluate this if/when MIR data writer stops calculating LST this way
self.set_lsts_from_time_array()
if not np.allclose(lst_array, self.lst_array, rtol=0, atol=np.pi / 1728000.0):
# If this check fails, it means that there's something off w/ the lst values
# (to a larger degree than expected), and we'll pass them back to the user,
# who can inspect them directly and decide what to do.
Contributor:
This should get printed somewhere with suggestions on what to check.

Contributor Author:
Okay, I've added a warning message here in aa4bd2f -- it's a little on the long side, although it hopefully gives enough guidance to the user on what to do and when to worry (it looks like the polling rate wasn't always 20 Hz, so this still triggers for some older data sets, but in most cases it's just a problem with the metadata, not actually the visibilities).
warnings.warn(
"> 25 ms errors detected reading in LST values from MIR data. "
"This typically signifies a minor metadata recording error (which can "
"be mitigated by calling the `set_lsts_from_time_array` method with "
"`update_vis=False`), though additional errors about uvw-position "
"accuracy may signal more significant issues with metadata accuracy "
"that could have substantial impact on downstream analysis."
)
self.lst_array = lst_array

self.polarization_array = polarization_array
self.flex_spw_polarization_array = flex_pol
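
For reference, the tolerance in the check above falls straight out of the 20 Hz polling rate: half a sample is 25 ms, which as an hour angle is pi / 1728000 radians. A standalone sketch of the comparison, with `file_lsts` and `calc_lsts` as hypothetical stand-ins:

```python
import numpy as np

# Half of one 20 Hz sample, expressed as an hour angle in radians:
# 0.025 s / 86400 s per day * 2*pi rad per day = pi / 1728000 rad.
atol = 2 * np.pi * 0.025 / 86400.0
assert np.isclose(atol, np.pi / 1728000.0)

file_lsts = np.array([1.0000000, 2.0000005])  # LSTs recorded in the file (rad)
calc_lsts = np.array([1.0000001, 2.0000004])  # LSTs recomputed from time_array

# Agreement within half a sample -> trust the recalculated values;
# otherwise warn and fall back to the file values.
use_calculated = np.allclose(file_lsts, calc_lsts, rtol=0, atol=atol)
print(use_calculated)  # True for the toy values above
```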
@@ -789,6 +770,55 @@
self.filename = [os.path.basename(basename)]
self._filename.form = (1,)

# Finally, start the heavy lifting of loading the full data. We start this by
# creating arrays to plug visibilities and flags into. The array is shaped this
# way since when reading in a MIR file, we scan through the blt-axis the
# slowest and the freq-axis the fastest (i.e., the data is roughly ordered by
# blt, pol, freq).
self.data_array = np.zeros((Nblts, Npols, Nfreqs), dtype=np.complex64)
self.flag_array = np.ones((Nblts, Npols, Nfreqs), dtype=bool)
self.nsample_array = np.zeros((Nblts, Npols, Nfreqs), dtype=np.float32)

# Get a list of the current inhid values for later
inhid_list = mir_data.in_data["inhid"].copy()

# Store a backup of the selection masks
mir_data.save_mask("pre-select")

# If no data is loaded, we want to load subsets of data to start rather than
# the whole block in one go, since this will save us 2x in memory.
inhid_step = len(inhid_list)
if (mir_data.vis_data is None) and (mir_data.auto_data is None):
inhid_step = (inhid_step // 8) + 1

for start in range(0, len(inhid_list), inhid_step):
# If no data is loaded, load up an eighth of the data at a time. This
# keeps the "extra" memory usage below that for the nsample_array attr,
# which is generated and filled _after_ this step (thus no extra memory
# should be required)
if (mir_data.vis_data is None) and (mir_data.auto_data is None):
# Note that the masks are combined via an AND operation.
mir_data.select(
where=("inhid", "eq", inhid_list[start : start + inhid_step])
)

# Call this convenience function in case we need to run the data filling
# multiple times (if we are loading up subsets of data)
self._prep_and_insert_data(
mir_data,
sphid_dict,
spdx_dict,
blhid_blt_order,
apply_flags=apply_flags,
apply_tsys=apply_tsys,
apply_dedoppler=apply_dedoppler,
)

# Because the select operation messes with the masks, we want to restore
# those in case we mucked with them earlier (so that subsequent selects
# behave as expected).
mir_data.restore_mask("pre-select")

# We call transpose here since vis_data above is shape (Nblts, Npols, Nfreqs),
# and we need to get it to (Nblts, Nfreqs, Npols) to match what UVData expects.
self.data_array = np.transpose(self.data_array, (0, 2, 1))
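The data-filling loop moved to the end of `_init_from_mir_parser` follows a simple chunking pattern: save the selection masks, select a subset of the integrations at a time, fill the arrays, then restore the masks afterward. A condensed sketch, with `fill_data` as a hypothetical stand-in for the `_prep_and_insert_data` call:

```python
def chunked_fill(mir_data, fill_data, n_chunks=8):
    """Sketch of filling data arrays in inhid chunks to cap peak memory."""
    inhid_list = mir_data.in_data["inhid"].copy()

    # Save the selection masks so later selects behave as expected.
    mir_data.save_mask("pre-select")

    inhid_step = (len(inhid_list) // n_chunks) + 1
    for start in range(0, len(inhid_list), inhid_step):
        # Select just this chunk of integration headers, then fill.
        mir_data.select(
            where=("inhid", "eq", inhid_list[start : start + inhid_step])
        )
        fill_data(mir_data)  # hypothetical per-chunk read/copy callback

    # Undo the chunk-by-chunk selections.
    mir_data.restore_mask("pre-select")
```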
11 changes: 7 additions & 4 deletions pyuvdata/uvdata/mir_parser.py
@@ -4148,15 +4148,18 @@ def _make_v3_compliant(self):
for key, value in mjd_day_dict.items():
if isinstance(value, str):
mjd_day_dict[key] = Time(
datetime.strptime(value, "%b %d, %Y"), scale="tt"
).tt.mjd
datetime.strptime(value, "%b %d, %Y"), scale="utc"
).mjd

mjd_arr = (self.in_data["dhrs"] / 24.0) + np.array(
[mjd_day_dict[idx] for idx in self.in_data["iref_time"]]
)

# Tally the JD dates, since that's used for various helper functions
jd_arr = Time(mjd_arr, format="mjd", scale="utc").utc.jd
jd_arr = Time(mjd_arr, format="mjd", scale="utc").jd

# Also, convert MJD back into the expected TT timescale
mjd_arr = Time(mjd_arr, format="mjd", scale="utc").tt.mjd

# Calculate the LST at the time of obs
lst_arr = (12.0 / np.pi) * uvutils.get_lst_for_time(
@@ -4183,7 +4186,7 @@
# bl_data updates: ant1rx, ant2rx, u, v, w
# First, update the antenna receiver if these values are unfilled (true in some
# earlier tracks, no version demarcation notes it).
if np.all(self.bl_data["ant1rx"] == 0) and np.all(self.bl_data["ant2rx"] == 0):
if np.all(self.bl_data["ant1rx"] == 0) or np.all(self.bl_data["ant2rx"] == 0):
ipol = self.bl_data["ipol"]
irec = self.bl_data["irec"]

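The `_make_v3_compliant` change above is all about astropy time scales: the reference date is now parsed as UTC, the JDs stay in UTC for the LST helper functions, and the stored MJDs are converted back onto the expected TT timescale. A standalone sketch of that round trip, with the date and decimal hours hypothetical:

```python
from datetime import datetime

import numpy as np
from astropy.time import Time

# Parse a MIR-style reference date string as UTC (not TT, as before this fix).
ref_mjd = Time(datetime.strptime("Feb 13, 2024", "%b %d, %Y"), scale="utc").mjd

# Hypothetical decimal hours into the track for two integrations.
dhrs = np.array([4.25, 4.50])
mjd_arr = ref_mjd + dhrs / 24.0

# JDs stay on the UTC scale for the LST helper functions...
jd_arr = Time(mjd_arr, format="mjd", scale="utc").jd

# ...while the stored MJDs are converted onto the expected TT timescale.
mjd_tt = Time(mjd_arr, format="mjd", scale="utc").tt.mjd
print(mjd_tt - mjd_arr)  # ~69 s offset, i.e. about 8e-4 days as of 2024
```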
6 changes: 4 additions & 2 deletions pyuvdata/uvdata/tests/conftest.py
@@ -106,8 +106,10 @@ def sma_mir_main():
testfile = os.path.join(DATA_PATH, "sma_test.mir")
with uvtest.check_warnings(
UserWarning,
match="The lst_array is not self-consistent with the time_array and telescope "
"location. Consider recomputing with the `set_lsts_from_time_array` method.",
match=[
"> 25 ms errors detected reading in LST values from MIR data. ",
"The lst_array is not self-consistent with the time_array and telescope ",
],
):
uv_object.read(testfile, use_future_array_shapes=True)
uv_object.set_lsts_from_time_array()
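`uvtest.check_warnings` accepts a list of `match` patterns when several warnings are expected at once; with plain pytest, a roughly equivalent check records the warnings and inspects each message — a sketch with a hypothetical reader helper:

```python
import pytest

def test_mir_read_warnings(tmp_path):
    # Hypothetical helper that emits both warnings when loading older SMA data.
    with pytest.warns(UserWarning) as record:
        read_older_sma_data(tmp_path / "sma_test.mir")  # hypothetical

    messages = [str(rec.message) for rec in record]
    assert any("> 25 ms errors detected" in msg for msg in messages)
    assert any("lst_array is not self-consistent" in msg for msg in messages)
```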