
Minor fixes for handling of older SMA data #1399

Merged
merged 6 commits on Feb 13, 2024
8 changes: 8 additions & 0 deletions CHANGELOG.md
@@ -36,6 +36,12 @@ positions are near surface of whatever celestial body their positions are referenced to
(either the Earth or Moon, currently).

### Changed
- Made the determination of whether or not to create a flex-pol dataset when reading
in a MIR dataset more robust (particularly with pre-V3 data formats).
- Reading MIR data sets into `UVData` objects will now use pyuvdata-calculated
values for `lst_array` (via `UVData.set_lsts_from_time_array`) instead of those read
in from the file, due to known precision issues in the latter (~25 ms), so long as
the two agree within this known precision limit.
- `UVFlag.to_baseline` and `UVFlag.to_antenna` are now more robust to differences
in antenna metadata sorting.
- Made `MirParser` more robust against metadata indexing errors.
@@ -68,6 +74,8 @@ fully tested and didn't work properly.
- Having `freq_array` and `channel_width` defined on wide-band UVCal objects.

### Fixed
- A bug where `time_array` was not being correctly calculated for older (pre-V3) MIR
data formats.
- A small bug in UVFlag that could occur when reading in an older UVFlag HDF5
file with missing antenna metadata.
- A small bug (mostly affecting continuous integration) that threw an error when the
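A minimal sketch of the behavior described in the `Changed` entry above, assuming a placeholder path to an SMA MIR dataset:

```python
from pyuvdata import UVData

# Placeholder path to an SMA MIR-format dataset.
uv = UVData.from_file("path/to/track.mir")

# On read, pyuvdata now silently substitutes its own calculated LSTs when
# they agree with the file values to within ~25 ms. For older tracks that
# disagree by more, the read warns and keeps the file values; recomputing
# then makes lst_array self-consistent with time_array.
uv.set_lsts_from_time_array()
```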
1 change: 1 addition & 0 deletions pyuvdata/tests/test_utils.py
@@ -4143,6 +4143,7 @@ def test_uvw_track_generator_errs():
@pytest.mark.parametrize("use_uvw", [False, True])
@pytest.mark.parametrize("use_earthloc", [False, True])
@pytest.mark.filterwarnings("ignore:The lst_array is not self-consistent")
@pytest.mark.filterwarnings("ignore:> 25 ms errors detected reading in LST values")
def test_uvw_track_generator(flip_u, use_uvw, use_earthloc):
sma_mir = UVData.from_file(os.path.join(DATA_PATH, "sma_test.mir"))
sma_mir.set_lsts_from_time_array()
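Outside of pytest, the new warning can be suppressed the same way with the standard library's warning filters — a sketch, with the data path again a placeholder:

```python
import warnings

from pyuvdata import UVData

with warnings.catch_warnings():
    # Ignore only the LST-precision warning; anything else still surfaces.
    warnings.filterwarnings(
        "ignore", message="> 25 ms errors detected reading in LST values"
    )
    uv = UVData.from_file("path/to/older_track.mir")
```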
176 changes: 103 additions & 73 deletions pyuvdata/uvdata/mir.py
@@ -343,16 +343,22 @@ def _init_from_mir_parser(
if len(pol_dict) == 2:
# If dual-pol, then we need to check if the tunings are split, because
# the two polarizations will effectively be concat'd across the freq
# axis instead of the pol axis.
# axis instead of the pol axis. First, see if we have two diff receivers
rxa_mask = ant1_rxa_mask & ant2_rxa_mask
rxb_mask = ~(ant1_rxa_mask | ant2_rxa_mask)

if np.any(rxa_mask) and np.any(rxb_mask):
# If we have both VV and HH data, check to see that the tunings of the
# two receivers match.
loa_freq = np.median(mir_data.sp_data["gunnLO"][rxa_mask])
lob_freq = np.median(mir_data.sp_data["gunnLO"][rxb_mask])
pol_split_tuning = not np.isclose(loa_freq, lob_freq)
# If we have both VV and HH data, check to see that the frequencies of
# each of the spectral chunks match. If they do, then we can concat
# across the polarization axis, but if _not_, we should treat this as
# a pol-split data set.
fsky_vals = mir_data.sp_data["fsky"]
chunk_vals = mir_data.sp_data["corrchunk"]
loa_chunks = set(zip(fsky_vals[rxa_mask], chunk_vals[rxa_mask]))
lob_chunks = set(zip(fsky_vals[rxb_mask], chunk_vals[rxb_mask]))
pol_split_tuning = not (
loa_chunks.issubset(lob_chunks) or lob_chunks.issubset(loa_chunks)
)

# Map MIR pol code to pyuvdata/AIPS polarization number
pol_code_dict = {}
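
To see why the subset check above is more robust than the old median-`gunnLO` comparison, here is the same logic run on toy metadata — all values hypothetical, with `fsky` in GHz and `corrchunk` as correlator window numbers:

```python
import numpy as np

# Hypothetical per-spectral-record metadata for a dual-receiver track.
fsky_vals = np.array([230.5, 231.5, 230.5, 231.5])  # sky frequency (GHz)
chunk_vals = np.array([1, 2, 1, 2])  # correlator chunk number
rxa_mask = np.array([True, True, False, False])  # receiver A records
rxb_mask = ~rxa_mask  # receiver B records

loa_chunks = set(zip(fsky_vals[rxa_mask], chunk_vals[rxa_mask]))
lob_chunks = set(zip(fsky_vals[rxb_mask], chunk_vals[rxb_mask]))

# Matching (fsky, corrchunk) pairs on both receivers -> concat across the
# pol axis; non-overlapping pairs would mark the data set as pol-split.
pol_split_tuning = not (
    loa_chunks.issubset(lob_chunks) or lob_chunks.issubset(loa_chunks)
)
print(pol_split_tuning)  # False: both receivers share the same tuning
```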
@@ -410,6 +416,16 @@
spdx_dict = {}
spw_dict = {}
for spdx in set(spdx_list):
# We need to do some extra handling here, because a single correlator
# can produce multiple spectral windows (e.g., LSB/USB). The scheme below
# will negate the corr band number if LSB, will set the corr band number to
# 255 if the values arise from the pseudo-wideband values, and will add 512
# if the pols are split-tuned. This scheme, while a little funky, guarantees
# that each unique freq range has its own spectral window number.
spw_id = 255 if (spdx[0] == 0) else spdx[0]
spw_id *= (-1) ** (1 + spdx[1])
spw_id += 512 if (pol_split_tuning and spdx[2] == 1) else 0

data_mask = np.array([spdx == item for item in spdx_list])

# Grab values, get them into appropriate types
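
The window-numbering scheme described in the comment above can be summarized as a small pure function — a sketch using my own argument names (`corrchunk`, `sideband`, `pol_idx`) for the three elements of `spdx`:

```python
def mir_spw_id(corrchunk: int, sideband: int, pol_idx: int, pol_split_tuning: bool) -> int:
    """Sketch of the spectral window numbering scheme described above."""
    # Pseudo-wideband records (corrchunk 0) get the sentinel value 255.
    spw_id = 255 if corrchunk == 0 else corrchunk
    # Negate for LSB (sideband 0); leave positive for USB (sideband 1).
    spw_id *= (-1) ** (1 + sideband)
    # Offset the second pol/receiver by 512 when the tuning is pol-split.
    spw_id += 512 if (pol_split_tuning and pol_idx == 1) else 0
    return spw_id

print(mir_spw_id(3, 0, 0, False))  # -3: chunk 3, LSB
print(mir_spw_id(0, 1, 0, False))  # 255: pseudo-wideband, USB
print(mir_spw_id(2, 1, 1, True))   # 514: chunk 2, USB, split-tuned
```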
@@ -419,25 +435,21 @@

# Make sure that something weird hasn't happened with the metadata (this
# really should never happen, only one value should exist per window).
assert np.allclose(spw_fsky, mir_data.sp_data["fsky"][data_mask])
assert np.allclose(spw_fres, mir_data.sp_data["fres"][data_mask])
assert np.allclose(spw_nchan, mir_data.sp_data["nch"][data_mask])
for val, item in zip(
[spw_fsky, spw_fres, spw_nchan], ["fsky", "fres", "nch"]
):
if not np.allclose(val, mir_data.sp_data[item][data_mask]):
warnings.warn(
"Discrepancy in %s for win %i sb %i pol %i. Values of "
"`freq_array` and `channel_width` should be checked for "
"channels corresponding to spw_id %i." % (item, *spdx, spw_id)
)

# Get the data in the right units and dtype
spw_fsky = float(spw_fsky * 1e9) # GHz -> Hz
spw_fres = float(spw_fres * 1e6) # MHz -> Hz
spw_nchan = int(spw_nchan)

# We need to do a some extra handling here, because a single correlator
# can produce multiple spectral windows (e.g., LSB/USB). The scheme below
# will negate the corr band number if LSB, will set the corr band number to
# 255 if the values arise from the pseudo-wideband values, and will add 512
# if the pols are split-tuned. This scheme, while a little funky, guarantees
# that each unique freq range has its own spectral window number.
spw_id = 255 if (spdx[0] == 0) else spdx[0]
spw_id *= (-1) ** (1 + spdx[1])
spw_id += 512 if (pol_split_tuning and spdx[2] == 1) else 0

# Populate the channel width array
channel_width = abs(spw_fres) + np.zeros(spw_nchan, dtype=np.float64)

@@ -499,57 +511,6 @@
for key in spdx_dict:
spdx_dict[key]["ch_slice"] = spw_dict[spdx_dict[key]["spw_id"]]["ch_slice"]

# Create arrays to plug visibilities and flags into. The array is shaped this
# way since when reading in a MIR file, we scan through the blt-axis the
# slowest and the freq-axis the fastest (i.e., the data is roughly ordered by
# blt, pol, freq).
self.data_array = np.zeros((Nblts, Npols, Nfreqs), dtype=np.complex64)
self.flag_array = np.ones((Nblts, Npols, Nfreqs), dtype=bool)
self.nsample_array = np.zeros((Nblts, Npols, Nfreqs), dtype=np.float32)

# Get a list of the current inhid values for later
inhid_list = mir_data.in_data["inhid"].copy()

# Store a backup of the selection masks
backup_masks = {}
for item, obj in mir_data._metadata_attrs.items():
backup_masks[item] = obj._mask.copy()

# If no data is loaded, we want to load subsets of data to start rather than
# the whole block in one go, since this will save us 2x in memory.
inhid_step = len(inhid_list)
if (mir_data.vis_data is None) and (mir_data.auto_data is None):
inhid_step = (inhid_step // 8) + 1

for start in range(0, len(inhid_list), inhid_step):
# If no data is loaded, load up a quarter of the data at a time. This
# keeps the "extra" memory usage below that for the nsample_array attr,
# which is generated and filled _after_ this step (thus no extra memory
# should be required)
if (mir_data.vis_data is None) and (mir_data.auto_data is None):
# Note that the masks are combined via and operation.
mir_data.select(
where=("inhid", "eq", inhid_list[start : start + inhid_step])
)

# Call this convenience function in case we need to run the data filling
# multiple times (if we are loading up subsets of data)
self._prep_and_insert_data(
mir_data,
sphid_dict,
spdx_dict,
blhid_blt_order,
apply_flags=apply_flags,
apply_tsys=apply_tsys,
apply_dedoppler=apply_dedoppler,
)

for item, obj in mir_data._metadata_attrs.items():
# Because the select operation messes with the masks, we want to restore
# those in case we mucked with them earlier (so that subsequent selects
# behave as expected).
obj._mask = backup_masks[item].copy()

# Now assign our flexible arrays to the object itself
self.freq_array = freq_array
self.Nfreqs = Nfreqs
@@ -684,7 +645,6 @@ def _init_from_mir_parser(
for blt_key in blt_temp_dict.keys():
temp_dict = blt_temp_dict[blt_key]
integration_time[blt_key] = temp_dict["rinteg"]
# TODO: Using MIR V3 convention for lst, will need make it V2 compatible.
lst_array[blt_key] = temp_dict["lst"] * (np.pi / 12.0) # Hours -> rad
mjd_array[blt_key] = temp_dict["mjd"]
ant_1_array[blt_key] = temp_dict["iant1"]
@@ -702,9 +662,30 @@
self.baseline_array = self.antnums_to_baseline(
self.ant_1_array, self.ant_2_array, attempt256=False
)
self.integration_time = integration_time
self.lst_array = lst_array
self.time_array = Time(mjd_array, scale="tt", format="mjd").utc.jd
self.integration_time = integration_time

# There is a minor issue w/ MIR datasets where the LSTs are calculated via
# a polled average rather than calculated for the mid-point, which results
# in some imprecision (and some nuisance warnings). Fix this by calculating the
# LSTs here, checking to make sure that they agree to within the expected
# precision (sampling rate is 20 Hz, and the max error to worry about is half a
# sample: 25 ms, or in radians, 2*pi/(40 * 86400) = pi / 1728000.
# TODO: Re-evaluate this if/when MIR data writer stops calculating LST this way
self.set_lsts_from_time_array()
if not np.allclose(lst_array, self.lst_array, rtol=0, atol=np.pi / 1728000.0):
# If this check fails, it means that there's something off w/ the lst values
# (to a larger degree than expected), and we'll pass them back to the user,
# who can inspect them directly and decide what to do.
Contributor:
This should get printed somewhere with suggestions on what to check.

Contributor Author:
Okay, I've added a warning message here in aa4bd2f -- it's a little on the long side, although it hopefully gives enough guidance to the user on what to do and when to worry (it looks like the polling rate wasn't always 20 Hz, so this still triggers for some older data sets, but in most cases it's just a problem with the metadata, not actually the visibilities).
warnings.warn(
"> 25 ms errors detected reading in LST values from MIR data. "
"This typically signifies a minor metadata recording error (which can "
"be mitigated by calling the `set_lsts_from_time_array` method with "
"`update_vis=False`), though additional errors about uvw-position "
"accuracy may signal more significant issues with metadata accuracy "
"that could have substantial impact on downstream analysis."
)
self.lst_array = lst_array

self.polarization_array = polarization_array
self.flex_spw_polarization_array = flex_pol
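
For reference, the tolerance in the check above falls straight out of the 20 Hz polling rate: half a sample is 25 ms, which as an hour angle is pi / 1728000 radians. A standalone sketch of the comparison, with `file_lsts` and `calc_lsts` as hypothetical stand-ins:

```python
import numpy as np

# Half of one 20 Hz sample, expressed as an hour angle in radians:
# 0.025 s / 86400 s per day * 2*pi rad per day = pi / 1728000 rad.
atol = 2 * np.pi * 0.025 / 86400.0
assert np.isclose(atol, np.pi / 1728000.0)

file_lsts = np.array([1.0000000, 2.0000005])  # LSTs recorded in the file (rad)
calc_lsts = np.array([1.0000001, 2.0000004])  # LSTs recomputed from time_array

# Agreement within half a sample -> trust the recalculated values;
# otherwise warn and fall back to the file values.
use_calculated = np.allclose(file_lsts, calc_lsts, rtol=0, atol=atol)
print(use_calculated)  # True for the toy values above
```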
@@ -789,6 +770,55 @@
self.filename = [os.path.basename(basename)]
self._filename.form = (1,)

# Finally, start the heavy lifting of loading the full data. We start this by
# creating arrays to plug visibilities and flags into. The array is shaped this
# way since when reading in a MIR file, we scan through the blt-axis the
# slowest and the freq-axis the fastest (i.e., the data is roughly ordered by
# blt, pol, freq).
self.data_array = np.zeros((Nblts, Npols, Nfreqs), dtype=np.complex64)
self.flag_array = np.ones((Nblts, Npols, Nfreqs), dtype=bool)
self.nsample_array = np.zeros((Nblts, Npols, Nfreqs), dtype=np.float32)

# Get a list of the current inhid values for later
inhid_list = mir_data.in_data["inhid"].copy()

# Store a backup of the selection masks
mir_data.save_mask("pre-select")

# If no data is loaded, we want to load subsets of data to start rather than
# the whole block in one go, since this will save us 2x in memory.
inhid_step = len(inhid_list)
if (mir_data.vis_data is None) and (mir_data.auto_data is None):
inhid_step = (inhid_step // 8) + 1

for start in range(0, len(inhid_list), inhid_step):
# If no data is loaded, load up an eighth of the data at a time. This
# keeps the "extra" memory usage below that for the nsample_array attr,
# which is generated and filled _after_ this step (thus no extra memory
# should be required)
if (mir_data.vis_data is None) and (mir_data.auto_data is None):
# Note that the masks are combined via an AND operation.
mir_data.select(
where=("inhid", "eq", inhid_list[start : start + inhid_step])
)

# Call this convenience function in case we need to run the data filling
# multiple times (if we are loading up subsets of data)
self._prep_and_insert_data(
mir_data,
sphid_dict,
spdx_dict,
blhid_blt_order,
apply_flags=apply_flags,
apply_tsys=apply_tsys,
apply_dedoppler=apply_dedoppler,
)

# Because the select operation messes with the masks, we want to restore
# those in case we mucked with them earlier (so that subsequent selects
# behave as expected).
mir_data.restore_mask("pre-select")

# We call transpose here since vis_data above is shape (Nblts, Npols, Nfreqs),
# and we need to get it to (Nblts, Nfreqs, Npols) to match what UVData expects.
self.data_array = np.transpose(self.data_array, (0, 2, 1))
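The data-filling loop moved to the end of `_init_from_mir_parser` follows a simple chunking pattern: save the selection masks, select a subset of the integrations at a time, fill the arrays, then restore the masks afterward. A condensed sketch, with `fill_data` as a hypothetical stand-in for the `_prep_and_insert_data` call:

```python
def chunked_fill(mir_data, fill_data, n_chunks=8):
    """Sketch of filling data arrays in inhid chunks to cap peak memory."""
    inhid_list = mir_data.in_data["inhid"].copy()

    # Save the selection masks so later selects behave as expected.
    mir_data.save_mask("pre-select")

    inhid_step = (len(inhid_list) // n_chunks) + 1
    for start in range(0, len(inhid_list), inhid_step):
        # Select just this chunk of integration headers, then fill.
        mir_data.select(
            where=("inhid", "eq", inhid_list[start : start + inhid_step])
        )
        fill_data(mir_data)  # hypothetical per-chunk read/copy callback

    # Undo the chunk-by-chunk selections.
    mir_data.restore_mask("pre-select")
```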
11 changes: 7 additions & 4 deletions pyuvdata/uvdata/mir_parser.py
@@ -4148,15 +4148,18 @@ def _make_v3_compliant(self):
for key, value in mjd_day_dict.items():
if isinstance(value, str):
mjd_day_dict[key] = Time(
datetime.strptime(value, "%b %d, %Y"), scale="tt"
).tt.mjd
datetime.strptime(value, "%b %d, %Y"), scale="utc"
).mjd

mjd_arr = (self.in_data["dhrs"] / 24.0) + np.array(
[mjd_day_dict[idx] for idx in self.in_data["iref_time"]]
)

# Tally the JD dates, since that's used for various helper functions
jd_arr = Time(mjd_arr, format="mjd", scale="utc").utc.jd
jd_arr = Time(mjd_arr, format="mjd", scale="utc").jd

# Also, convert MJD back into the expected TT timescale
mjd_arr = Time(mjd_arr, format="mjd", scale="utc").tt.mjd

# Calculate the LST at the time of obs
lst_arr = (12.0 / np.pi) * uvutils.get_lst_for_time(
@@ -4183,7 +4186,7 @@
# bl_data updates: ant1rx, ant2rx, u, v, w
# First, update the antenna receiver if these values are unfilled (true in some
# earlier tracks, no version demarcation notes it).
if np.all(self.bl_data["ant1rx"] == 0) and np.all(self.bl_data["ant2rx"] == 0):
if np.all(self.bl_data["ant1rx"] == 0) or np.all(self.bl_data["ant2rx"] == 0):
ipol = self.bl_data["ipol"]
irec = self.bl_data["irec"]

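The `_make_v3_compliant` change above is all about astropy time scales: the reference date is now parsed as UTC, the JDs stay in UTC for the LST helper functions, and the stored MJDs are converted back onto the expected TT timescale. A standalone sketch of that round trip, with the date and decimal hours hypothetical:

```python
from datetime import datetime

import numpy as np
from astropy.time import Time

# Parse a MIR-style reference date string as UTC (not TT, as before this fix).
ref_mjd = Time(datetime.strptime("Feb 13, 2024", "%b %d, %Y"), scale="utc").mjd

# Hypothetical decimal hours into the track for two integrations.
dhrs = np.array([4.25, 4.50])
mjd_arr = ref_mjd + dhrs / 24.0

# JDs stay on the UTC scale for the LST helper functions...
jd_arr = Time(mjd_arr, format="mjd", scale="utc").jd

# ...while the stored MJDs are converted onto the expected TT timescale.
mjd_tt = Time(mjd_arr, format="mjd", scale="utc").tt.mjd
print(mjd_tt - mjd_arr)  # ~69 s offset, i.e. about 8e-4 days as of 2024
```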
6 changes: 4 additions & 2 deletions pyuvdata/uvdata/tests/conftest.py
@@ -106,8 +106,10 @@ def sma_mir_main():
testfile = os.path.join(DATA_PATH, "sma_test.mir")
with uvtest.check_warnings(
UserWarning,
match="The lst_array is not self-consistent with the time_array and telescope "
"location. Consider recomputing with the `set_lsts_from_time_array` method.",
match=[
"> 25 ms errors detected reading in LST values from MIR data. ",
"The lst_array is not self-consistent with the time_array and telescope ",
],
):
uv_object.read(testfile, use_future_array_shapes=True)
uv_object.set_lsts_from_time_array()
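`uvtest.check_warnings` accepts a list of `match` patterns when several warnings are expected at once; with plain pytest, a roughly equivalent check records the warnings and inspects each message — a sketch with a hypothetical reader helper:

```python
import pytest

def test_mir_read_warnings(tmp_path):
    # Hypothetical helper that emits both warnings when loading older SMA data.
    with pytest.warns(UserWarning) as record:
        read_older_sma_data(tmp_path / "sma_test.mir")  # hypothetical

    messages = [str(rec.message) for rec in record]
    assert any("> 25 ms errors detected" in msg for msg in messages)
    assert any("lst_array is not self-consistent" in msg for msg in messages)
```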