load_cdf helper function #537

Merged
38 changes: 33 additions & 5 deletions imap_processing/cdf/utils.py
@@ -2,11 +2,12 @@

import logging
import re
from pathlib import Path

import imap_data_access
import numpy as np
import xarray as xr
from cdflib.xarray import xarray_to_cdf
from cdflib.xarray import cdf_to_xarray, xarray_to_cdf

from imap_processing import launch_time

@@ -41,6 +42,33 @@ def calc_start_time(shcoarse_time: float) -> np.datetime64:
return launch_time + time_delta


def load_cdf(file_path: Path, **kwargs: dict) -> xr.Dataset:
"""Load the contents of a CDF file into an ``xarray`` dataset.

Parameters
----------
file_path : Path
The path to the CDF file
**kwargs : dict, optional
Keyword arguments for ``cdf_to_xarray``

Returns
-------
dataset : xr.Dataset
The ``xarray`` dataset for the CDF file
"""
dataset = cdf_to_xarray(file_path, **kwargs)

# cdf_to_xarray converts single-value attributes to lists
# convert these back to single values where applicable
for attribute in dataset.attrs:
value = dataset.attrs[attribute]
if isinstance(value, list) and len(value) == 1:
dataset.attrs[attribute] = value[0]

return dataset
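
(A minimal usage sketch of the new helper, shown for context rather than as part of the diff; the CDF file name is hypothetical, and keyword arguments such as to_datetime=True are simply forwarded to cdf_to_xarray.)

from pathlib import Path

from imap_processing.cdf.utils import load_cdf

# Hypothetical file, e.g. one produced earlier by write_cdf()
cdf_path = Path("imap_swe_l1_sci_20100101_v001.cdf")

# Keyword arguments are passed through to cdf_to_xarray
dataset = load_cdf(cdf_path, to_datetime=True)
print(dataset.attrs["Logical_source"])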


def write_cdf(dataset: xr.Dataset):
"""Write the contents of "data" to a CDF file using cdflib.xarray_to_cdf.

@@ -53,13 +81,13 @@ def write_cdf(dataset: xr.Dataset):

Parameters
----------
dataset : xarray.Dataset
The dataset object to convert to a CDF
dataset : xarray.Dataset
The dataset object to convert to a CDF

Returns
-------
pathlib.Path
Path to the file created
file_path: pathlib.Path
Path to the file created
"""
# Create the filename from the global attributes
# Logical_source looks like "imap_swe_l2_counts-1min"
5 changes: 2 additions & 3 deletions imap_processing/cli.py
@@ -19,7 +19,6 @@
from urllib.error import HTTPError

import imap_data_access
from cdflib.xarray import cdf_to_xarray

import imap_processing

@@ -31,7 +30,7 @@
# from imap_processing import cdf
# In code:
# call cdf.utils.write_cdf
from imap_processing.cdf.utils import write_cdf
from imap_processing.cdf.utils import load_cdf, write_cdf
from imap_processing.mag.l1a.mag_l1a import mag_l1a
from imap_processing.swe.l1a.swe_l1a import swe_l1a
from imap_processing.swe.l1b.swe_l1b import swe_l1b
@@ -352,7 +351,7 @@ def process(self):

elif self.data_level == "l1b":
# read CDF file
l1a_dataset = cdf_to_xarray(dependencies[0])
l1a_dataset = load_cdf(dependencies[0])
processed_data = swe_l1b(l1a_dataset)
# TODO: Pass in the proper version and descriptor
cdf_file_path = write_cdf(data=processed_data)
75 changes: 66 additions & 9 deletions imap_processing/tests/cdf/test_utils.py
@@ -1,21 +1,26 @@
"""Tests for the ``cdf.utils`` module."""

import imap_data_access
import numpy as np
import pytest
import xarray as xr

from imap_processing import launch_time
from imap_processing.cdf.global_attrs import ConstantCoordinates
from imap_processing.cdf.utils import calc_start_time, write_cdf
from imap_processing.cdf.utils import calc_start_time, load_cdf, write_cdf
from imap_processing.swe.swe_cdf_attrs import swe_l1a_global_attrs


def test_calc_start_time():
assert calc_start_time(0) == launch_time
assert calc_start_time(1) == launch_time + np.timedelta64(1, "s")
@pytest.fixture()
def test_dataset():
Collaborator:

Nice! I like the re-use of this :)

I'm not sure how feasible/hard this would be, but can you do any assertion on round-tripping? You are testing loading/writing separately, but can you say anything about test_dataset == load_cdf(write_cdf(test_dataset))? Maybe test that it has the same variables or something like that if == doesn't work.

Collaborator Author:

I think I follow what you are saying here but just want to be clear. Are you talking about adding a test like this?

def test_written_and_loaded_dataset(test_dataset):

    assert test_dataset == load_cdf(write_cdf(test_dataset))

to make sure that nothing is getting changed in the xarray object when passed through the write_cdf() and load_cdf() functions?

Collaborator:

Yep, exactly! Making sure it is a reversible operation and we are getting the same data in and out.

Collaborator Author:

Gotcha. Unfortunately it is indeed breaking, so good that we checked!

Collaborator:

Uh oh! Hopefully not too big of an issue? Does it need to be fixed in cdflib or on our end... there is hardly anything we are doing here in these functions.

Collaborator Author (@bourque, May 2, 2024):

I think I have identified that the transformation is happening within cdflib:

In [1]: import cdflib
   ...: from cdflib import xarray
   ...: import xarray as xr
   ...: import numpy as np
   ...: from imap_processing.cdf.global_attrs import ConstantCoordinates
   ...: from imap_processing.swe.swe_cdf_attrs import swe_l1a_global_attrs

In [2]: filename = 'imap_swe_l1_sci_20100101_v001.cdf'

In [3]:     dataset = xr.Dataset(
   ...:         {
   ...:             "epoch": (
   ...:                 "epoch",
   ...:                 [
   ...:                     np.datetime64("2010-01-01T00:01:01", "ns"),
   ...:                     np.datetime64("2010-01-01T00:01:02", "ns"),
   ...:                     np.datetime64("2010-01-01T00:01:03", "ns"),
   ...:                 ],
   ...:             ),
   ...:         },
   ...:         attrs=swe_l1a_global_attrs.output()
   ...:         | {"Logical_source": "imap_swe_l1_sci", "Data_version": "001"},
   ...:     )
   ...:     dataset["epoch"].attrs = ConstantCoordinates.EPOCH

In [4]: dataset
Out[4]: 
<xarray.Dataset> Size: 24B
Dimensions:  (epoch: 3)
Coordinates:
  * epoch    (epoch) datetime64[ns] 24B 2010-01-01T00:01:01 ... 2010-01-01T00...
Data variables:
    *empty*
Attributes: (12/15)
    Project:                     STP>Solar-Terrestrial Physics
    Source_name:                 IMAP>Interstellar Mapping and Acceleration P...
    Discipline:                  Solar Physics>Heliospheric Physics
    Mission_group:               IMAP>Interstellar Mapping and Acceleration P...
    PI_name:                     Dr. David J. McComas
    PI_affiliation:              ('Princeton Plasma Physics Laboratory', '100...
    ...                          ...
    TEXT:                        The Solar Wind Electron (SWE) instrument mea...
    Instrument_type:             Particles (space)
    Logical_file_id:             ['FILL ME IN AT FILE CREATION']
    Data_type:                   L1A_SCI>Level-1A Science Data
    Logical_source:              imap_swe_l1_sci
    Logical_source_description:  IMAP Mission SWE Instrument Level-1A Data

In [5]: cdflib.xarray.xarray_to_cdf(dataset, filename)

In [6]: new_dataset = cdflib.xarray.cdf_to_xarray(filename)

In [7]: new_dataset
Out[7]: 
<xarray.Dataset> Size: 24B
Dimensions:  (record0: 3)
Dimensions without coordinates: record0
Data variables:
    epoch    (record0) int64 24B 1262304061000000000 ... 1262304063000000000
Attributes: (12/15)
    Project:                     ['STP>Solar-Terrestrial Physics']
    Source_name:                 ['IMAP>Interstellar Mapping and Acceleration...
    Discipline:                  ['Solar Physics>Heliospheric Physics']
    Mission_group:               ['IMAP>Interstellar Mapping and Acceleration...
    PI_name:                     ['Dr. David J. McComas']
    PI_affiliation:              ['Princeton Plasma Physics Laboratory', '100...
    ...                          ...
    TEXT:                        ['The Solar Wind Electron (SWE) instrument m...
    Instrument_type:             ['Particles (space)']
    Logical_file_id:             ['FILL ME IN AT FILE CREATION']
    Data_type:                   ['L1A_SCI>Level-1A Science Data']
    Logical_source:              ['imap_swe_l1_sci']
    Logical_source_description:  ['IMAP Mission SWE Instrument Level-1A Data']

Collaborator:

ping @bryan-harter I think this is a bug within cdflib, but maybe it isn't so much a bug as a limitation of CDF? Is there any way to keep the coordinates when loading the cdf from disk into an xarray dataset?

Contributor (@bryan-harter, May 2, 2024):

So the way cdf_to_xarray tells what the dimensions are is only through what the other variables in the file are pointing at (i.e. with a DEPEND_0/1/2/3/etc).

If there are no other variables in the file, then cdf_to_xarray doesn't consider "epoch" a dimension. So it's more of a limitation of CDF, I suppose. One way we can get around this is to add a DEPEND_0 to the epoch variable that just points to itself. So if you add the following, the code will identify the dimension correctly:

dataset["epoch"].attrs['DEPEND_0'] = 'epoch'

Alternatively you can just add some dummy variable, like:

dataset = xr.Dataset(
         {
             "epoch": (
                 "epoch",
                 [
                     np.datetime64("2010-01-01T00:01:01", "ns"),
                     np.datetime64("2010-01-01T00:01:02", "ns"),
                     np.datetime64("2010-01-01T00:01:03", "ns"),
                 ],
             ),
             "hello": (
                 "hello",
                 [
                     1,
                     2,
                     3,
                 ],
             ),
         },
         attrs=swe_l1a_global_attrs.output()
         | {"Logical_source": "imap_swe_l1_sci", "Data_version": "001"},
     )
dataset["epoch"].attrs = ConstantCoordinates.EPOCH
dataset["hello"].attrs = swe_metadata_attrs.output()

In this case, since the variable "hello" points to "epoch", then cdf_to_xarray correctly identifies "epoch" as a dimension

Contributor:

I guess on the cdflib side, I can try to make changes so that 1D "support_variables" are always their own dimension? I can try taking a look at that and see if that fundamentally breaks anything. But it handles time variables slightly differently than the other DEPENDs.

Collaborator Author:

@bryan-harter Thanks for the info on that. Your simple fix of adding the DEPEND_0 appears to be working as intended!

One other thing that seems to be changing is that cdflib is writing and/or loading the CDF attributes into lists, i.e.:

Attributes: (12/15)
    Project:                     STP>Solar-Terrestrial Physics
    Source_name:                 IMAP>Interstellar Mapping and Acceleration P...
    Discipline:                  Solar Physics>Heliospheric Physics
    Mission_group:               IMAP>Interstellar Mapping and Acceleration P...
    PI_name:                     Dr. David J. McComas
    PI_affiliation:              ('Princeton Plasma Physics Laboratory', '100...
    ...                          ...
    TEXT:                        The Solar Wind Electron (SWE) instrument mea...
    Instrument_type:             Particles (space)
    Logical_file_id:             ['FILL ME IN AT FILE CREATION']
    Data_type:                   L1A_SCI>Level-1A Science Data
    Logical_source:              imap_swe_l1_sci
    Logical_source_description:  IMAP Mission SWE Instrument Level-1A Data

gets turned into

Attributes: (12/15)
    Project:                     ['STP>Solar-Terrestrial Physics']
    Source_name:                 ['IMAP>Interstellar Mapping and Acceleration...
    Discipline:                  ['Solar Physics>Heliospheric Physics']
    Mission_group:               ['IMAP>Interstellar Mapping and Acceleration...
    PI_name:                     ['Dr. David J. McComas']
    PI_affiliation:              ['Princeton Plasma Physics Laboratory', '100...
    ...                          ...
    TEXT:                        ['The Solar Wind Electron (SWE) instrument m...
    Instrument_type:             ['Particles (space)']
    Logical_file_id:             ['FILL ME IN AT FILE CREATION']
    Data_type:                   ['L1A_SCI>Level-1A Science Data']
    Logical_source:              ['imap_swe_l1_sci']
    Logical_source_description:  ['IMAP Mission SWE Instrument Level-1A Data']

Do you know if this is expected / purposeful?
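
(A small self-contained sketch of the unwrapping being discussed here, mirroring what the new load_cdf above does; the attribute values below are placeholders.)

# Single-element list attributes coming back from cdf_to_xarray can be
# reduced to scalars, while genuinely multi-valued attributes are left alone.
attrs = {
    "Logical_source": ["imap_swe_l1_sci"],                  # placeholder value
    "PI_affiliation": ["affiliation-1", "affiliation-2"],   # placeholder values
}
for name in attrs:
    value = attrs[name]
    if isinstance(value, list) and len(value) == 1:
        attrs[name] = value[0]

assert attrs["Logical_source"] == "imap_swe_l1_sci"
assert isinstance(attrs["PI_affiliation"], list)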

"""Create a simple ``xarray`` dataset to be used in testing

Returns
-------
dataset : xarray.Dataset
The ``xarray`` dataset object
"""

def test_write_cdf():
# Set up a fake dataset
# lots of requirements on attributes, so depend on SWE for now
dataset = xr.Dataset(
{
"epoch": (
@@ -28,11 +33,63 @@
)
)
},
attrs=swe_l1a_global_attrs.output()
| {"Logical_source": "imap_swe_l1_sci", "Data_version": "001"},
| {
"Logical_source": "imap_swe_l1_sci",
"Data_version": "001",
"Logical_file_id": "imap_swe_l1_sci_20100101_v001",
},
)
dataset["epoch"].attrs = ConstantCoordinates.EPOCH
dataset["epoch"].attrs["DEPEND_0"] = "epoch"

return dataset


def test_calc_start_time():
"""Tests the ``calc_start_time`` function"""

assert calc_start_time(0) == launch_time
assert calc_start_time(1) == launch_time + np.timedelta64(1, "s")

file_path = write_cdf(dataset)

def test_load_cdf(test_dataset):
"""Tests the ``load_cdf`` function."""

# Write the dataset to a CDF to be used to test the load function
file_path = write_cdf(test_dataset)

# Load the CDF and ensure the function returns a dataset
dataset = load_cdf(file_path)
assert isinstance(dataset, xr.core.dataset.Dataset)


def test_write_cdf(test_dataset):
"""Tests the ``write_cdf`` function.

Parameters
----------
test_dataset : xarray.Dataset
An ``xarray`` dataset object to test with
"""

file_path = write_cdf(test_dataset)
assert file_path.exists()
assert file_path.name == "imap_swe_l1_sci_20100101_v001.cdf"
assert file_path.relative_to(imap_data_access.config["DATA_DIR"])


def test_written_and_loaded_dataset(test_dataset):
"""Tests that a dataset that is written to CDF and then loaded results in
the original dataset.

Parameters
----------
test_dataset : xarray.Dataset
An ``xarray`` dataset object to test with
"""

new_dataset = load_cdf(write_cdf(test_dataset), to_datetime=True)
new_dataset.attrs["PI_affiliation"] = tuple(
new_dataset.attrs["PI_affiliation"]
) # The PI_affiliation attribute should be a tuple
assert str(test_dataset) == str(new_dataset)
4 changes: 2 additions & 2 deletions imap_processing/tests/codice/test_codice_l1a.py
@@ -2,12 +2,12 @@

from pathlib import Path

import cdflib
import numpy as np
import pytest
import xarray as xr

from imap_processing import imap_module_directory
from imap_processing.cdf.utils import load_cdf
from imap_processing.codice.codice_l0 import decom_packets
from imap_processing.codice.codice_l1a import process_codice_l1a

@@ -140,7 +140,7 @@ def test_l1a_data_array_values(test_l1a_data: xr.Dataset, validation_data: Path)
"""

generated_dataset = test_l1a_data
validation_dataset = cdflib.xarray.cdf_to_xarray(validation_data)
validation_dataset = load_cdf(validation_data)

# Ensure the processed data matches the validation data
for variable in validation_dataset:
7 changes: 2 additions & 5 deletions imap_processing/tests/idex/test_l1_cdfs.py
@@ -5,11 +5,10 @@
import numpy as np
import pytest
import xarray as xr
from cdflib.xarray import cdf_to_xarray
from cdflib.xarray.xarray_to_cdf import ISTPError

from imap_processing import imap_module_directory
from imap_processing.cdf.utils import write_cdf
from imap_processing.cdf.utils import load_cdf, write_cdf
from imap_processing.idex.idex_packet_parser import PacketParser


@@ -74,7 +73,5 @@ def test_idex_tof_high_data_from_cdf(decom_test_data):
data = np.array([int(line.rstrip()) for line in f])

file_name = write_cdf(decom_test_data)
l1_data = cdf_to_xarray(
file_name
) # Read in the data from the CDF file to an xarray object
l1_data = load_cdf(file_name)
assert (l1_data["TOF_High"][13].data == data).all()
7 changes: 3 additions & 4 deletions imap_processing/tests/mag/test_mag_decom.py
@@ -1,10 +1,9 @@
from pathlib import Path

import pandas as pd
from cdflib.xarray import cdf_to_xarray

from imap_processing.cdf import global_attrs
from imap_processing.cdf.utils import write_cdf
from imap_processing.cdf.utils import load_cdf, write_cdf
from imap_processing.mag import mag_cdf_attrs
from imap_processing.mag.l0.decom_mag import decom_packets, generate_dataset

@@ -91,12 +90,12 @@ def test_mag_raw_cdf_generation():
assert output.exists()
assert output.name == "imap_mag_l1a_norm-raw_20231025_v001.cdf"

input_xarray = cdf_to_xarray(output)
input_xarray = load_cdf(output)
assert input_xarray.attrs.keys() == norm_data.attrs.keys()

output = write_cdf(burst_data)
assert output.exists()
assert output.name == "imap_mag_l1a_burst-raw_20231025_v001.cdf"

input_xarray = cdf_to_xarray(output)
input_xarray = load_cdf(output)
assert input_xarray.attrs.keys() == burst_data.attrs.keys()
5 changes: 2 additions & 3 deletions imap_processing/tests/swe/test_swe_l1b.py
@@ -1,9 +1,8 @@
import pandas as pd
import pytest
from cdflib.xarray import cdf_to_xarray

from imap_processing import imap_module_directory
from imap_processing.cdf.utils import write_cdf
from imap_processing.cdf.utils import load_cdf, write_cdf
from imap_processing.swe.l0 import decom_swe
from imap_processing.swe.l1a.swe_l1a import swe_l1a
from imap_processing.swe.l1a.swe_science import swe_science
@@ -132,7 +131,7 @@ def test_cdf_creation():
assert hk_l1a_filepath.name == "imap_swe_l1a_sci_20230927_v001.cdf"

# reads data from CDF file and passes to l1b
l1a_cdf_dataset = cdf_to_xarray(hk_l1a_filepath, to_datetime=True)
l1a_cdf_dataset = load_cdf(hk_l1a_filepath, to_datetime=True)
l1b_dataset = swe_l1b(l1a_cdf_dataset)

hk_l1b_filepath = write_cdf(l1b_dataset)
10 changes: 5 additions & 5 deletions imap_processing/tests/ultra/unit/test_ultra_l1a.py
@@ -1,8 +1,8 @@
import dataclasses

import pytest
from cdflib.xarray import cdf_to_xarray

from imap_processing.cdf.utils import load_cdf
from imap_processing.ultra import ultra_cdf_attrs
from imap_processing.ultra.l0.decom_ultra import decom_ultra_apids
from imap_processing.ultra.l0.ultra_utils import (
@@ -185,7 +185,7 @@ def test_cdf_aux(
assert test_data_path.name == "imap_ultra_l1a_sci_20220530_v001.cdf"

dataset_aux = create_dataset({ULTRA_AUX.apid[0]: decom_ultra_aux})
input_xarray_aux = cdf_to_xarray(test_data_path)
input_xarray_aux = load_cdf(test_data_path)

assert input_xarray_aux.attrs.keys() == dataset_aux.attrs.keys()

@@ -203,7 +203,7 @@ def test_cdf_rates(
assert test_data_path.name == "imap_ultra_l1a_sci_20220530_v001.cdf"

dataset_rates = create_dataset({ULTRA_RATES.apid[0]: decom_ultra_rates})
input_xarray_rates = cdf_to_xarray(test_data_path)
input_xarray_rates = load_cdf(test_data_path)

assert input_xarray_rates.attrs.keys() == dataset_rates.attrs.keys()

@@ -219,7 +219,7 @@ def test_cdf_tof(
assert test_data_path.name == "imap_ultra_l1a_sci_20240124_v001.cdf"

dataset_tof = create_dataset({ULTRA_TOF.apid[0]: decom_ultra_tof})
input_xarray_tof = cdf_to_xarray(test_data_path)
input_xarray_tof = load_cdf(test_data_path)

assert input_xarray_tof.attrs.keys() == dataset_tof.attrs.keys()

@@ -243,6 +243,6 @@ def test_cdf_events(
dataset_events = create_dataset(
{ULTRA_EVENTS.apid[0]: decom_ultra_events, ULTRA_AUX.apid[0]: decom_ultra_aux}
)
input_xarray_events = cdf_to_xarray(test_data_path)
input_xarray_events = load_cdf(test_data_path)

assert input_xarray_events.attrs.keys() == dataset_events.attrs.keys()