MAG CDF updates #349

Merged on Feb 23, 2024 (26 commits)

Commits:
4c3b55d - First pass at MAG XTCE documents (maxinelasp, Oct 5, 2023)
e7d533f - Updating XTCE with dynamic size fields (maxinelasp, Oct 5, 2023)
2e52740 - Updating packet definitions (maxinelasp, Nov 28, 2023)
3d28cbf - Updating packet definitions (maxinelasp, Nov 28, 2023)
cd715da - Updates to code and creating tests (maxinelasp, Dec 1, 2023)
cd739f3 - Updating MAG tests (maxinelasp, Dec 5, 2023)
865a808 - setting up files for MAG CDF (maxinelasp, Dec 6, 2023)
0a63a62 - deleted unneeded files (maxinelasp, Dec 6, 2023)
a88ec06 - Removing print statements (maxinelasp, Dec 6, 2023)
7771aee - First pass on global attributes and main vectors (maxinelasp, Dec 18, 2023)
0b8cfbd - precommit auto fix (maxinelasp, Dec 18, 2023)
f32ba40 - Initial MAG CDF files (maxinelasp, Dec 21, 2023)
d373538 - Updating based on PR comments (maxinelasp, Feb 15, 2024)
6371708 - Updating with example file (maxinelasp, Feb 3, 2024)
e6e3d05 - Update CLI to include MAG file generation (maxinelasp, Feb 5, 2024)
1fed03a - Updating from PR comments (maxinelasp, Feb 20, 2024)
c826d08 - Updating tests (maxinelasp, Feb 20, 2024)
7ea8d6f - updating poetry.lock (maxinelasp, Feb 20, 2024)
cc3738e - Updating (maxinelasp, Feb 21, 2024)
9976ffa - Downgrading cdflib (maxinelasp, Feb 21, 2024)
3a1c166 - Removing some dependency requirements for batch_starter (maxinelasp, Feb 21, 2024)
849d670 - Updating mag CDF generation to generate two files (maxinelasp, Feb 23, 2024)
e08e692 - removing files (maxinelasp, Feb 23, 2024)
4589167 - Removing print (maxinelasp, Feb 23, 2024)
1cf4ce9 - minor updates (maxinelasp, Feb 23, 2024)
c4688ea - PR updates, naming changes (maxinelasp, Feb 23, 2024)
23 changes: 16 additions & 7 deletions imap_processing/cli.py
@@ -11,6 +11,7 @@
"""

import argparse
import logging
import sys
from abc import ABC, abstractmethod
from json import loads
@@ -22,6 +23,8 @@

import imap_processing

__logger__ = logging.getLogger(__name__)
Collaborator: Our pattern has been to set the local logger variable and then call logger.info() later.

Suggested change
__logger__ = logging.getLogger(__name__)
logger = logging.getLogger(__name__)

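For reference, a minimal sketch of the module-level logger pattern the suggestion refers to (process_file is a hypothetical function name, not part of this PR):

import logging

logger = logging.getLogger(__name__)

def process_file(path):
    # The module-level logger inherits whatever handlers the application configures.
    logger.info("Processing %s", path)
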

# TODO: change how we import things and also folder
# structure may?
# From this:
@@ -313,17 +316,23 @@ def process(self):
f"Unexpected dependencies found for MAG L1A:"
f"{file_paths}. Expected only one dependency."
)
filename = imap_data_access.ScienceFilePath.generate_from_inputs(
"mag", "l1a", "raw", self.start_date, self.end_date, self.version
)
mag_l1a(file_paths[0], filename.construct_path())
print(f"Generated file: {filename.construct_path()}")
filename_norm = imap_data_access.ScienceFilePath.generate_from_inputs(
"mag", "l1a", "raw-norm", self.start_date, self.end_date, self.version
).construct_path()
filename_burst = imap_data_access.ScienceFilePath.generate_from_inputs(
"mag", "l1a", "raw-burst", self.start_date, self.end_date, self.version
).construct_path()
mag_l1a(file_paths[0], filename_norm, filename_burst)

if self.upload_to_sdc:
print(f"Uploading file from: {filename.construct_path()}")
# TODO: figure out data_dir, because now this fails.
# Should switch to using IMAP_DATA_DIR env var.
imap_data_access.upload(filename.construct_path())
if filename_norm.exists():
logging.info(f"Uploading file: {filename_norm}")
imap_data_access.upload(filename_norm)
if filename_burst.exists():
logging.info(f"Uploading file: {filename_burst}")
imap_data_access.upload(filename_burst)


class Swapi(ProcessInstrument):
Binary file not shown.
103 changes: 81 additions & 22 deletions imap_processing/mag/l0/decom_mag.py
@@ -71,58 +71,118 @@ def export_to_xarray(l0_data: list[MagL0]):
----------
l0_data: list[MagL0]
A list of MagL0 datapoints

Returns
-------
norm_data : xr.Dataset
xarray dataset for generating norm data CDFs
burst_data : xr.Dataset
xarray dataset for generating burst data CDFs
"""
# TODO split by mago and magi using primary sensor
# TODO split by norm and burst
Collaborator: Remove this TODO now?

norm_data = defaultdict(list)
burst_data = norm_data.copy()
norm_data = []
burst_data = []

for datapoint in l0_data:
Collaborator: Call this packet? I consider a datapoint a measurement, but I think your packet could contain multiple datapoints.

if datapoint.ccsds_header.PKT_APID == Mode.NORMAL:
for key, value in dataclasses.asdict(datapoint).items():
if key != "ccsds_header":
norm_data[key].append(value)
norm_data.append(datapoint)
if datapoint.ccsds_header.PKT_APID == Mode.BURST:
burst_data["SHCOARSE"].append(datapoint.SHCOARSE)
burst_data["raw_vectors"].append(datapoint.VECTORS)
burst_data.append(datapoint)

norm_dataset = None
burst_dataset = None

if len(norm_data) > 0:
norm_dataset = generate_dataset(norm_data)
if len(burst_data) > 0:
burst_dataset = generate_dataset(burst_data)

return norm_dataset, burst_dataset
Comment on lines +92 to +100
Collaborator: Rather than returning these as a two-tuple, would you possibly want to return a single dictionary similar to what you had before?

datasets["norm"] = norm_dataset

That might make it easier to extend later with datasets["outboard"]["norm"].

Author (maxinelasp): Yeah, I like that.

Author (maxinelasp): I actually might come back to this when I add the rest of the L1A processing. I don't usually like returning dictionaries like that if I have a known number of outputs, and I think I will come up with something better when I add in the other kinds of processing.
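A sketch of the dictionary-style return being discussed; this is hypothetical, and the PR keeps the two-tuple for now:

datasets = {}
if len(norm_data) > 0:
    datasets["norm"] = generate_dataset(norm_data)
if len(burst_data) > 0:
    datasets["burst"] = generate_dataset(burst_data)
# Could later nest by sensor, e.g. datasets["outboard"]["norm"].
return datasets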



def generate_dataset(l0_data: list[MagL0]):
"""
Generate a CDF dataset from the sorted L0 MAG data.

Used to create two similar datasets, one for norm and one for burst data.

Parameters
----------
l0_data : list[MagL0]
List of sorted L0 MAG data.

Returns
-------
dataset : xr.Dataset
xarray dataset with proper CDF attributes and shape.
"""
vector_data = np.zeros((len(l0_data), len(l0_data[0].VECTORS)))
Collaborator: Rather than padding below, you might be able to calculate it upfront here with a max.
Suggested change
vector_data = np.zeros((len(l0_data), len(l0_data[0].VECTORS)))
vector_data = np.zeros((len(l0_data), max(len(x.VECTORS) for x in l0_data)))

Author (maxinelasp): That requires looping through all the data twice, whereas what I have loops through once and takes the performance hit on resizing the np array. I think the array is unlikely to be expanded very often, so this is better performance overall (and numpy is generally much faster than plain Python anyway).

Collaborator: 👍 Maybe a question for another time, then: why are we getting resizes at all? Is this expected? I would have figured the packets would all contain X number of vectors.

Author (maxinelasp): No, the vector count can vary based on the settings of the instrument.

Collaborator: Interesting, I would have expected it to only differ between burst and normal mode, and otherwise be measured at the same frequency/cadence.

Another follow-on here, then, is whether 0 is an OK fill value for this data, or do you want np.nan to indicate that the entry didn't have any vector measurements past a certain point?

I guess I'm not entirely clear on what the shifting shape means. I think the shape is (npackets, ndirections) from looking later at the xarray dataset, but is it OK to have a shifting directions variable, or do we need different ndirections variables within the same dataset to distinguish these? A linearly spaced ndirections instead of only at the beginning (padding 0s to the end)?

I might be getting confused by thinking about higher-level data products, though, and just don't understand enough about the lower-level organization to know what to expect here. So just keep these questions in mind; you don't need to address anything now.

Author (maxinelasp): I agree that "direction" is not the correct axis for this particular CDF. It should be something else, with "Direction" reserved for the other data products. I am planning to discuss this with MAG when I get back; they didn't reply with their thoughts before now, so it will have to stay as my first draft 🤷. I asked if 0 was an OK fill value and they said yes. The byte vectors have varying lengths, but you can calculate the number of fixed-length vectors from other data, so the length per packet is known at processing time.
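For comparison, a sketch of the upfront-max allocation from the suggestion above, assuming every MagL0 record carries a VECTORS sequence; it trades a second pass over the data for never resizing:

max_len = max(len(dp.VECTORS) for dp in l0_data)
vector_data = np.zeros((len(l0_data), max_len))
for index, datapoint in enumerate(l0_data):
    # Shorter vector lists stay zero-padded at the end of the row.
    vector_data[index, : len(datapoint.VECTORS)] = datapoint.VECTORS
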

shcoarse_data = np.zeros(len(l0_data))

support_data = defaultdict(list)

for index, datapoint in enumerate(l0_data):
vector_len = len(datapoint.VECTORS)
if vector_len > vector_data.shape[1]:
# If the new vector is longer than the existing shape, first reshape
# vector_data and pad the existing vectors with zeros.
vector_data = np.pad(
Author (maxinelasp): I have no clue why this formatting is so weird... if anyone knows how to make ruff happy here, I'd be happy to hear it.

Collaborator: That is a common complaint about the auto-formatters from the scientific community, and why many people hate using them. I think ruff might have some flags to try to make this better, but I gave another suggestion above to avoid the padding entirely, which might be easier.

vector_data,
(
(
0,
0,
),
(0, vector_len - vector_data.shape[1]),
),
"constant",
constant_values=(0,),
)
vector_data[index, :vector_len] = datapoint.VECTORS

shcoarse_data[index] = calc_start_time(datapoint.SHCOARSE)

# Add remaining pieces to arrays
for key, value in dataclasses.asdict(datapoint).items():
if key not in ("ccsds_header", "VECTORS", "SHCOARSE"):
support_data[key].append(value)

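As an aside to the formatting thread above, naming the pad width first usually keeps auto-formatters from exploding the nested tuples; a hypothetical restructuring of the same call, with identical behavior:

pad_width = vector_len - vector_data.shape[1]
vector_data = np.pad(vector_data, ((0, 0), (0, pad_width)), constant_values=0)
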
# Used in L1A vectors
direction_norm = xr.DataArray(
np.arange(len(norm_data["VECTORS"][0])),
direction = xr.DataArray(
np.arange(vector_data.shape[1]),
name="Direction",
dims=["Direction"],
attrs=mag_cdf_attrs.direction_attrs.output(),
)

norm_epoch_time = xr.DataArray(
[calc_start_time(shcoarse) for shcoarse in norm_data["SHCOARSE"]],
epoch_time = xr.DataArray(
shcoarse_data,
name="Epoch",
dims=["Epoch"],
attrs=ConstantCoordinates.EPOCH,
)

# TODO: raw vectors units
norm_raw_vectors = xr.DataArray(
norm_data["VECTORS"],
raw_vectors = xr.DataArray(
vector_data,
name="Raw_Vectors",
dims=["Epoch", "Direction"],
attrs=mag_cdf_attrs.mag_vector_attrs.output(),
)

# TODO add norm to attrs somehow
norm_dataset = xr.Dataset(
coords={"Epoch": norm_epoch_time, "Direction": direction_norm},
output = xr.Dataset(
coords={"Epoch": epoch_time, "Direction": direction},
attrs=mag_cdf_attrs.mag_l1a_attrs.output(),
)

norm_dataset["RAW_VECTORS"] = norm_raw_vectors

# TODO: retrieve the doc for the CDF description (getattr(MagL0, "__doc__", {}))
output["RAW_VECTORS"] = raw_vectors

for key, value in norm_data.items():
for key, value in support_data.items():
# Time varying values
if key not in ["SHCOARSE", "VECTORS"]:
norm_datarray = xr.DataArray(
output[key] = xr.DataArray(
value,
name=key,
dims=["Epoch"],
Expand All @@ -135,6 +195,5 @@ def export_to_xarray(l0_data: list[MagL0]):
display_type="no_plot",
).output(),
)
norm_dataset[key] = norm_datarray

return norm_dataset
return output
23 changes: 17 additions & 6 deletions imap_processing/mag/l1a/mag_l1a.py
@@ -1,23 +1,34 @@
"""Methods for decomming packets, processing to level 1A, and writing CDFs for MAG."""
import logging
from pathlib import Path

from imap_processing.cdf.utils import write_cdf
from imap_processing.mag.l0 import decom_mag

__logger__ = logging.getLogger(__name__)
Collaborator: nit: logger =


def mag_l1a(packet_filepath, output_filepath):

def mag_l1a(packet_filepath, output_filepath_norm, output_filepath_burst):
"""
Process MAG L0 data into L1A CDF files at cdf_filepath.

Parameters
----------
packet_filepath:
packet_filepath :
Packet files for processing
output_filepath:
Directory for output
output_filepath_norm :
Full directory and filename for raw-norm CDF file
output_filepath_burst :
Full directory and filename for raw-burst CDF file
"""
mag_l0 = decom_mag.decom_packets(packet_filepath)

mag_datasets = decom_mag.export_to_xarray(mag_l0)
mag_norm, mag_burst = decom_mag.export_to_xarray(mag_l0)

if mag_norm is not None:
write_cdf(mag_norm, Path(output_filepath_norm))
logging.info(f"Created CDF file at {output_filepath_norm}")

write_cdf(mag_datasets, Path(output_filepath))
if mag_burst is not None:
write_cdf(mag_burst, Path(output_filepath_burst))
logging.info(f"Created CDF file at {output_filepath_burst}")
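
A minimal usage sketch, with hypothetical file paths (in practice the CLI builds these via imap_data_access.ScienceFilePath):

from imap_processing.mag.l1a.mag_l1a import mag_l1a

mag_l1a(
    "mag_l0_test_data.pkts",
    "mag_l1a_raw-norm_20210101_20210102_v01-01.cdf",
    "mag_l1a_raw-burst_20210101_20210102_v01-01.cdf",
)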
4 changes: 4 additions & 0 deletions imap_processing/mag/mag_cdf_attrs.py
@@ -26,7 +26,9 @@
)

mag_l1a_attrs = GlobalDataLevelAttrs(
# TODO: data type should include "norm" and "burst" L1A-norm>Level-1A-normal-rate
"L1A>Level-1A",
# Should also include data type
logical_source="imap_mag_l1a",
logical_source_desc="IMAP Mission MAG Instrument Level-1A Data.",
instrument_base=mag_base,
@@ -50,6 +52,8 @@

# TODO: display type, catdesc, units, format, label_axis

# TODO: update descriptor to be more accurate for L1A raw
# TODO: does raw value need "counts"
mag_vector_attrs = ScienceAttrs(
validmin=GlobalConstants.INT_FILLVAL,
validmax=GlobalConstants.INT_MAXVAL,
45 changes: 40 additions & 5 deletions imap_processing/tests/mag/test_mag_decom.py
@@ -1,8 +1,10 @@
from pathlib import Path

import pandas as pd
from cdflib.xarray import cdf_to_xarray

from imap_processing.cdf import global_attrs
from imap_processing.cdf.utils import write_cdf
from imap_processing.mag.l0.decom_mag import decom_packets, export_to_xarray


@@ -12,7 +14,6 @@ def test_mag_decom():
l0 = decom_packets(burst_test_file)

expected_output = pd.read_csv(current_directory / "mag_l0_test_output.csv")

for index, test in enumerate(l0):
assert test.ccsds_header.PKT_APID == expected_output["PHAPID"][index]
assert test.ccsds_header.SRC_SEQ_CTR == expected_output["PHSEQCNT"][index]
@@ -35,15 +36,49 @@
assert len(l0) == len(expected_output.index)


def test_mag_raw_cdf():
def test_mag_raw_xarray():
current_directory = Path(__file__).parent
burst_test_file = current_directory / "mag_l0_test_data.pkts"
l0 = decom_packets(str(burst_test_file))

output_data = export_to_xarray(l0)
norm_data, burst_data = export_to_xarray(l0)
required_attrs = list(
global_attrs.GlobalInstrumentAttrs("", "", "").output().keys()
)

assert all([item in list(output_data.attrs.keys()) for item in required_attrs])
assert all([item is not None for _, item in output_data.attrs.items()])
assert all([item in list(norm_data.attrs.keys()) for item in required_attrs])
assert all([item is not None for _, item in norm_data.attrs.items()])

assert all([item in list(burst_data.attrs.keys()) for item in required_attrs])
assert all([item is not None for _, item in burst_data.attrs.items()])

expected_norm_len = 17
print(norm_data.dims["Epoch"])
Collaborator suggested change (remove the leftover debug print):
print(norm_data.dims["Epoch"])

assert norm_data.dims["Epoch"] == expected_norm_len

expected_burst_len = 19
assert burst_data.dims["Epoch"] == expected_burst_len


def test_mag_raw_cdf_generation(tmp_path):
current_directory = Path(__file__).parent
test_file = current_directory / "mag_l0_test_data.pkts"
l0 = decom_packets(str(test_file))

norm_data, burst_data = export_to_xarray(l0)

test_data_path_norm = tmp_path / "mag_l1a_raw-normal_20210101_20210102_v01-01.cdf"

output = write_cdf(norm_data, test_data_path_norm)
assert Path.exists(test_data_path_norm)

input_xarray = cdf_to_xarray(output)
assert input_xarray.attrs.keys() == norm_data.attrs.keys()

test_data_path_burst = tmp_path / "mag_l1a_raw-burst_20210101_20210102_v01-01.cdf"

output = write_cdf(burst_data, test_data_path_burst)
assert Path.exists(test_data_path_burst)
Collaborator: Might want to assert that the file doesn't exist before the function calls as well? (Similarly above this.)

Suggested change
assert Path.exists(test_data_path_burst)
assert test_data_path_burst.exists()
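
For example, a sketch of the pre-check being suggested:

assert not test_data_path_burst.exists()
output = write_cdf(burst_data, test_data_path_burst)
assert test_data_path_burst.exists()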


input_xarray = cdf_to_xarray(output)
assert input_xarray.attrs.keys() == burst_data.attrs.keys()