Skip to content

Commit

Permalink
Merge pull request #49 from SuperDARN/release/1.2
Browse files Browse the repository at this point in the history
Release pyDARNio v1.2.0
  • Loading branch information
carleyjmartin authored Nov 29, 2022
2 parents 48d5941 + 3e17dee commit bd91eb2
Show file tree
Hide file tree
Showing 14 changed files with 1,631 additions and 88 deletions.
29 changes: 20 additions & 9 deletions .zenodo.json
Original file line number Diff line number Diff line change
@@ -1,11 +1,22 @@
{
"creators":[
{
"name": "SuperDARN Data Standards Working Group"
"name": "SuperDARN Data Visualization Working Group"
},
{
"affiliation": "University of Saskatchewan",
"name": "Rohel, R.A."
"name": "Rohel, R.A.",
"orcid": "0000-0003-2208-1553"
},
{
"affiliation": "The University Centre in Svalbard",
"name": "Bland, E.C.",
"orcid": "0000-0002-0252-0400"
},
{
"affiliation": "University of Saskatchewan",
"name": "Martin, C.J.",
"orcid": "0000-0002-8278-9783"
},
{
"affiliation": "Johns Hopkins University",
Expand Down Expand Up @@ -34,15 +45,15 @@
"affiliation": "University of Saskatchewan",
"name": "Kotyk, K."
},
{
"affiliation": "University of Saskatchewan",
"name": "Martin, C.J.",
"orcid": "0000-0002-8278-9783"
},
{
"affiliation": "University of Saskatchewan",
"name": "Schmidt, M.T.",
"orcid": "0000-0002-3265-977X"
}
},
{
"affiliation": "Virginia Tech",
"name": "Shi, X.",
"orcid": "0000-0001-8425-8241"
},
]
}
}
14 changes: 6 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,12 @@ Python data IO library for the Super Dual Auroral Radar Network (SuperDARN).

## Changelog

## Version 1.1.1 - Release!

pyDARNio is released! This is a patch release to address the following issues:
- Bug fix with initializing empty arrays when converting HDF5 files from site- to array-structured
- Bug fixes with converting files from HDF5 to DMAP
- correctly check blanked_samples
- support multiple beams per record
- replacement of far-range lag0 data in rawacf conversion
## Version 1.2.0 - Release!

pyDARNio has a new release! This is a minor release which includes:
- Handling of extra fields in fitacf files coming in the next RST release, including the name of the ACF fitting algorithm used (`algorithm`), the value of TDIFF used to calculate the elevation angles (`tdiff`), and more descriptive field names for the elevation angle fields in FitACF 3.0 (`elv_error` & `elv_fitted`)
- More flexibility around optional fields for all file types
- Updates and speed improvements for Borealis file I/O

## Documentation

Expand Down
1 change: 1 addition & 0 deletions pydarnio/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,3 +43,4 @@
from .borealis.borealis import BorealisRead
from .borealis.borealis import BorealisWrite
from .borealis.borealis_convert import BorealisConvert
from .borealis.borealis_restructure import BorealisRestructure
179 changes: 176 additions & 3 deletions pydarnio/borealis/base_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,8 @@

from collections import OrderedDict
from datetime import datetime
from typing import Callable
from typing import Callable, List
import h5py

from pydarnio import borealis_exceptions

Expand Down Expand Up @@ -173,6 +174,14 @@ class BaseFormat():
array_array_dtypes: dict
fields in the array files that are made of numpy arrays, with their
given data type.
string_fields: list
List of all fields that contain string data.
single_string_fields: list
List of single element fields which are string types.
array_string_fields: list
List of array fields which are string types.
bool_types: list
List of single element fields that have boolean data types.
_site_to_array(data_dict): dict
Convert an OrderedDict of site data to array data using the information
provided for the specific data format.
Expand Down Expand Up @@ -437,6 +446,39 @@ def site_specific_fields_generate(cls):
"""
return {}

@classmethod
def array_specific_fields_iterative_generator(cls):
    """
    Retrieve generator functions for array-only fields that must be
    built by inspecting each record of the file.

    Returns
    -------
    dict
        Maps the name of each array-specific field to a function that
        takes a single site record (site data dictionary) and produces
        that field's value for the array-structured file. Used when
        restructuring from site to array style.

    Notes
    -----
    All possible fields = single_element_types + array_dtypes. Fields
    are classified into four restructuring categories: shared_fields
    (same value in every record), unshared_fields (unique value/array
    per record), array_specific_fields (fields unique to array files,
    mainly where dimensions vary between records so the count must be
    stored), and site_specific_fields (fields unique to site files,
    mainly where flattened-array dimensions must be stored).

    The base format defines no such fields, so an empty dictionary is
    returned; child classes with array-specific fields that depend on
    per-record information should override this method.
    """
    generators = {}
    return generators

# STATIC METHODS THAT VARY BY FORMAT
# i.e. methods used in restructuring that the format to/from site
# structure for interpreting site data. These formats
Expand Down Expand Up @@ -508,6 +550,81 @@ def flatten_site_arrays(records: OrderedDict) -> OrderedDict:
new_records = copy.deepcopy(records)
return new_records

@staticmethod
def site_get_max_dims(filename: str, unshared_parameters: List[str]):
    """
    Scan every record of a site file to find the maximum dimensions of
    each unshared parameter field.

    Used to pre-size arrays for site-to-array conversion so arrays do
    not need to be padded/regrown midway through conversion.

    Parameters
    ----------
    filename: str
        Name of the site file being checked.
    unshared_parameters: List[str]
        Field names that are not shared between records in the site
        file, i.e. fields that may have different dimensions per record.

    Returns
    -------
    fields_max_dims: dict
        Field name (str) -> maximum dimensions (tuple) required to hold
        that field across all records.
    max_num_sequences: int
        Maximum value of the 'num_sequences' attribute over all records.
    max_num_beams: int
        Maximum first-dimension length of 'beam_nums' over all records.

    Raises
    ------
    ValueError
        If the 'pulse_phase_offset' dataset has an unexpected size that
        is neither consistent with num_sequences * num_pulses nor the
        known deepdish empty-array special case.
    """
    # Start every field at an empty shape; scalar (attribute) fields
    # simply keep this empty tuple.
    fields_max_dims = {key: () for key in unshared_parameters}
    max_num_sequences = 0
    max_num_beams = 0

    # Open site file to read with h5py, iterate over all records in the
    # file, and iterate through all fields required to find max dims
    # needed for conversion to array file.
    with h5py.File(filename, 'r') as site_file:
        for rec_idx, record_name in enumerate(site_file):
            # NOTE(review): 'dims' is unused; only the field name is
            # needed inside the loop.
            for field, dims in fields_max_dims.items():
                try:
                    # TypeError on booleans (ie: scan_start_marker)
                    # KeyError if field is dataset instead of attribute
                    field_value = site_file[record_name].attrs[field]
                    if field == 'num_sequences':
                        max_num_sequences = max(max_num_sequences, field_value)
                # NOTE(review): 'e' is bound but unused.
                except (KeyError, TypeError) as e:
                    try:
                        # Raises KeyError if field DNE as dataset
                        field_shape = site_file[record_name][field].shape
                        if field == 'pulse_phase_offset':
                            # Borealis files are written with deepdish, and this field is sometimes written
                            # as an empty array. If read in by h5py, h5py reads the dimensions as the data
                            # so here we check to catch that case.
                            actual_size = site_file[record_name][field].size
                            num_sequences = site_file[record_name]['data_dimensions'][1]
                            num_pulses = site_file[record_name]['pulses'].size
                            if actual_size != num_sequences * num_pulses:
                                if actual_size == 1:  # This is the special case
                                    field_shape = (0,)
                                else:
                                    raise ValueError(f'Unexpected shape of field {field}: {field_shape}')
                    except KeyError:
                        # Field absent from this record entirely; skip it.
                        continue
                    # Initialize shape to first record's field dimensions
                    if rec_idx == 0:
                        fields_max_dims[field] = field_shape
                    # NOTE(review): indentation reconstructed from a scraped
                    # diff — 'beam_nums' appears to be tracked only through
                    # max_num_beams, while all other fields take the
                    # element-wise-max branch below; confirm against the
                    # upstream file.
                    if field == 'beam_nums':
                        max_num_beams = max(field_shape[0], max_num_beams)
                    else:
                        # Update dims to keep largest for all records
                        # (element-wise max of the stored and current shape).
                        new_shape = map(lambda dima, dimb: max(dima, dimb),
                                        fields_max_dims[field],
                                        field_shape)
                        fields_max_dims[field] = tuple(new_shape)
    return fields_max_dims, max_num_sequences, max_num_beams

# CLASS METHODS COMMON ACROSS FORMATS
# i.e. class methods that build off the other class methods so generally
# do not need to be overwritten by the formats.
Expand Down Expand Up @@ -836,6 +953,63 @@ def array_array_dtypes(cls):

return array_array_dtypes

@classmethod
def string_fields(cls):
    """
    Retrieve all fields of the format that hold string data in the
    records.

    Returns
    -------
    list
        Every string-typed field of the format: the single-element
        string fields followed by the array string fields.
    """
    scalar_strings = cls.single_string_fields()
    array_strings = cls.array_string_fields()
    return scalar_strings + array_strings

@classmethod
def single_string_fields(cls):
    """
    Retrieve the fields of the format whose single-element values are
    strings.

    Returns
    -------
    list
        All single-element string fields in records of the format.
    """
    scalar_types = cls.single_element_types()
    return [field for field in scalar_types if scalar_types[field] == np.str_]

@classmethod
def array_string_fields(cls):
    """
    Retrieve the fields of the format that hold arrays of strings in
    the records.

    Returns
    -------
    list
        All fields whose arrays contain strings in records of the
        format.
    """
    dtypes = cls.array_dtypes()
    return list(filter(lambda field: dtypes[field] == np.str_, dtypes))

@classmethod
def bool_types(cls):
    """
    Retrieve the fields of the format that hold boolean data in the
    records.

    Returns
    -------
    list
        All single-element boolean fields in records of the format.
    """
    boolean_fields = []
    for field, dtype in cls.single_element_types().items():
        if dtype == np.bool_:
            boolean_fields.append(field)
    return boolean_fields

@classmethod
def _site_to_array(cls, data_dict: OrderedDict) -> dict:
"""
Expand Down Expand Up @@ -952,7 +1126,7 @@ class methods used inside this method should be specific
# dims with a determined max value
data_buffer = data_dict[k][field]
buffer_shape = data_buffer.shape
index_slice = [slice(0, i) for i in buffer_shape]
index_slice = [slice(0, i) for i in buffer_shape if i != 0]
# insert record index at start of array's slice list
index_slice.insert(0, rec_idx)
index_slice = tuple(index_slice)
Expand Down Expand Up @@ -1162,7 +1336,6 @@ def find_max_field_len(records: OrderedDict) -> int:

return find_max_field_len


@staticmethod
def find_max_pulse_phase_offset(records: OrderedDict) -> int:
"""
Expand Down
2 changes: 2 additions & 0 deletions pydarnio/borealis/borealis_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ def __init__(self, filename: str, borealis_filetype: str):
try:
version = dd.io.load(self.filename,
group='/borealis_git_hash').split('-')[0]
version = '.'.join(version.split('.')[:2]) # vX.Y, ignore patch revision
except ValueError as err:
raise borealis_exceptions.BorealisStructureError(
' {} Could not find the borealis_git_hash required to '
Expand Down Expand Up @@ -361,6 +362,7 @@ def __init__(self, filename: str, borealis_arrays: dict,
# 'vX.X'
try:
version = self._arrays['borealis_git_hash'].split('-')[0]
version = '.'.join(version.split('.')[:2]) # vX.Y, ignore patch revision
except KeyError as err:
raise borealis_exceptions.BorealisStructureError(
' {} Could not find the borealis_git_hash required to '
Expand Down
41 changes: 20 additions & 21 deletions pydarnio/borealis/borealis_convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -363,12 +363,12 @@ def _is_convertible_to_iqdat(self) -> bool:
sample_spacing = int(record['tau_spacing'] /
record['tx_pulse_len'])

# Check to see if tagged version. If not, use 255.255
git_hash = record['borealis_git_hash'].split('-')[0]
major_version, minor_version = git_hash.split('.')
if major_version[0] == 'v':
borealis_major_revision = major_version[1:]
borealis_minor_revision = minor_version
# Borealis git tag version numbers. If not a tagged version,
# then use 255.255
if record['borealis_git_hash'][0] == 'v': # tagged version, non-tagged versions have hexadecimal
version = record['borealis_git_hash'].split('-')[0].split('.')
borealis_major_revision = version[0][1:] # strip off the 'v'
borealis_minor_revision = version[1]
else:
borealis_major_revision = 255
borealis_minor_revision = 255
Expand Down Expand Up @@ -439,12 +439,12 @@ def _is_convertible_to_rawacf(self) -> bool:
sample_spacing = int(record['tau_spacing'] /
record['tx_pulse_len'])

# Check to see if tagged version. If not, use 255.255
git_hash = record['borealis_git_hash'].split('-')[0]
major_version, minor_version = git_hash.split('.')
if major_version[0] == 'v':
borealis_major_revision = major_version[1:]
borealis_minor_revision = minor_version
# Borealis git tag version numbers. If not a tagged version,
# then use 255.255
if record['borealis_git_hash'][0] == 'v': # tagged version, non-tagged versions have hexadecimal
version = record['borealis_git_hash'].split('-')[0].split('.')
borealis_major_revision = version[0][1:] # strip off the 'v'
borealis_minor_revision = version[1]
else:
borealis_major_revision = 255
borealis_minor_revision = 255
Expand Down Expand Up @@ -569,11 +569,10 @@ def __convert_bfiq_record(borealis_slice_id: int,

# Borealis git tag version numbers. If not a tagged version,
# then use 255.255
if record_dict['borealis_git_hash'][0] == 'v' and \
record_dict['borealis_git_hash'][2] == '.':

borealis_major_revision = record_dict['borealis_git_hash'][1]
borealis_minor_revision = record_dict['borealis_git_hash'][3]
if record_dict['borealis_git_hash'][0] == 'v': # tagged version, non-tagged versions have hexadecimal
version = record_dict['borealis_git_hash'].split('-')[0].split('.')
borealis_major_revision = version[0][1:] # strip off the 'v'
borealis_minor_revision = version[1]
else:
borealis_major_revision = 255
borealis_minor_revision = 255
Expand Down Expand Up @@ -841,10 +840,10 @@ def __convert_rawacf_record(borealis_slice_id: int,

# Borealis git tag version numbers. If not a tagged version,
# then use 255.255
if record_dict['borealis_git_hash'][0] == 'v' and \
record_dict['borealis_git_hash'][2] == '.':
borealis_major_revision = record_dict['borealis_git_hash'][1]
borealis_minor_revision = record_dict['borealis_git_hash'][3]
if record_dict['borealis_git_hash'][0] == 'v': # tagged version, non-tagged versions are hexadecimal
version = record_dict['borealis_git_hash'].split('-')[0].split('.')
borealis_major_revision = version[0][1:] # strip off the 'v'
borealis_minor_revision = version[1]
else:
borealis_major_revision = 255
borealis_minor_revision = 255
Expand Down
Loading

0 comments on commit bd91eb2

Please sign in to comment.