From 5a8f9b003d766c58c3b563154580882e469a653d Mon Sep 17 00:00:00 2001
From: Steinn Ymir Agustsson
Date: Tue, 21 Nov 2023 16:42:15 +0100
Subject: [PATCH 01/12] expose parquet metadata

---
 sed/loader/flash/loader.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/sed/loader/flash/loader.py b/sed/loader/flash/loader.py
index 8627b762..119c5451 100644
--- a/sed/loader/flash/loader.py
+++ b/sed/loader/flash/loader.py
@@ -52,6 +52,8 @@ def __init__(self, config: dict) -> None:
         self.index_per_pulse: MultiIndex = None
         self.failed_files_error: List[str] = []
 
+        self.prq_metadata = None
+
     def initialize_paths(self) -> Tuple[List[Path], Path]:
         """
         Initializes the paths based on the configuration.
@@ -737,10 +739,10 @@ def buffer_file_handler(
             print("All files converted successfully!")
 
         # read all parquet metadata and schema
-        metadata = [pq.read_metadata(file) for file in parquet_filenames]
+        self.prq_metadata = [pq.read_metadata(file) for file in parquet_filenames]
        schema = [pq.read_schema(file) for file in parquet_filenames]
 
-        return parquet_filenames, metadata, schema
+        return parquet_filenames, self.prq_metadata, schema
 
     def parquet_handler(
         self,
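
Note: the `prq_metadata` attribute exposed above holds one pyarrow `FileMetaData`
object per buffer file. A minimal sketch of what such an object provides (the file
name is hypothetical):

    import pyarrow.parquet as pq

    meta = pq.read_metadata("buffer_file.parquet")  # hypothetical buffer file
    print(meta.num_rows, meta.num_row_groups)       # rows / row groups in this file
    print(meta.schema.names)                        # column names stored in the file
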
From 684eb1923f417d9a545575ba06598f821faa191a Mon Sep 17 00:00:00 2001
From: Steinn Ymir Agustsson
Date: Tue, 21 Nov 2023 21:13:39 +0100
Subject: [PATCH 02/12] implement get_run_info and bugfix

---
 sed/core/processor.py      | 72 +++++++++++++++++++++++++++++++++++---
 sed/loader/flash/loader.py | 10 ++++--
 2 files changed, 76 insertions(+), 6 deletions(-)

diff --git a/sed/core/processor.py b/sed/core/processor.py
index 22c5abf1..3f9ec4e9 100644
--- a/sed/core/processor.py
+++ b/sed/core/processor.py
@@ -16,6 +16,7 @@
 import pandas as pd
 import psutil
 import xarray as xr
+from dask.diagnostics import ProgressBar
 
 from sed.binning import bin_dataframe
 from sed.binning.binning import normalization_histogram_from_timed_dataframe
@@ -25,8 +26,8 @@
 from sed.calibrator import MomentumCorrector
 from sed.core.config import parse_config
 from sed.core.config import save_config
-from sed.core.dfops import apply_filter
 from sed.core.dfops import add_time_stamped_data
+from sed.core.dfops import apply_filter
 from sed.core.dfops import apply_jitter
 from sed.core.metadata import MetaHandler
 from sed.diagnostics import grid_histogram
@@ -164,14 +165,77 @@ def __init__(
         )
 
     def __repr__(self):
         if self._dataframe is None:
             df_str = "Data Frame: No Data loaded"
         else:
-            df_str = self._dataframe.__repr__()
-        attributes_str = f"Metadata: {self._attributes.metadata}"
-        pretty_str = df_str + "\n" + attributes_str
+            info = self.get_run_info()
+            df_str = f"Data Frame: {len(info['dataframe']['columns'])} columns.\n"
+            df_str += f"{' '*11} {info['dataframe']['num_electrons']:,.0f} electrons.\n"
+            df_str += f"{' '*11} {info['dataframe']['num_trains']:,.0f} trains.\n"
+            if "num_pulses" in info["dataframe"]:
+                df_str += f"{' '*11} {info['dataframe']['num_pulses']:,.0f} pulses.\n"
+            df_str += f"{' '*11} {info['dataframe']['timestamp_duration']:,.2f} seconds.\n"
+            df_str += f"{' '*11} {info['dataframe']['duration']}.\n"
+            df_str += (
+                f"{' '*11} {info['dataframe']['start_time']} to {info['dataframe']['end_time']}.\n"
+            )
+
+        # df_str = self._dataframe.__repr__()
+        # attributes_str = f"Metadata: {self._attributes.metadata}"
+        pretty_str = df_str  # + "\n" + attributes_str
         return pretty_str
 
+    def get_run_info(self, compute=False) -> dict:
+        """Return a dict of information about the loaded data.
+
+        TODO: add dtypes from dataframe;
+              add columns per pulse/per electron/per train.
+
+        Args:
+            compute (bool): If True, compute missing counts from the dataframe
+                (triggers dask computation). Defaults to False.
+
+        Returns:
+            dict: Dictionary with information about the loaded data.
+        """
+        info: Dict[str, Any] = {}
+        head = self.dataframe.head(1)
+        tail = self.dataframe.tail(1)
+        info["dataframe"] = {}
+        info["dataframe"]["columns"] = self.dataframe.columns
+        if hasattr(self.loader, "num_electrons"):
+            n_el: int = self.loader.num_electrons
+        else:
+            n_el = None
+        if n_el is None and compute:
+            with ProgressBar():
+                print("computing number of electrons")
+                n_el = len(self.dataframe)
+        info["dataframe"]["num_electrons"] = n_el
+        if hasattr(self.loader, "num_pulses"):
+            n_pulses: int = self.loader.num_pulses
+        else:
+            n_pulses = None
+        if n_pulses is None and compute:
+            with ProgressBar():
+                print("computing number of pulses")
+                n_pulses = len(self.dataframe[self.dataframe["electronId"] == 0])
+        if n_pulses is not None:
+            info["dataframe"]["num_pulses"] = n_pulses
+        train_range: tuple = int(head["trainId"]), int(tail["trainId"])
+        n_trains = train_range[1] - train_range[0]
+        info["dataframe"]["trainId_min"] = train_range[0]
+        info["dataframe"]["trainId_max"] = train_range[1]
+        info["dataframe"]["num_trains"] = n_trains
+        if n_pulses is not None and n_el is not None:
+            info["dataframe"]["electrons_per_pulse"] = n_el / n_pulses
+        if n_el is not None:
+            info["dataframe"]["electrons_per_train"] = n_el / n_trains
+        tsr = float(head["timeStamp"]), float(tail["timeStamp"])
+        info["dataframe"]["timestamp_min"] = tsr[0]
+        info["dataframe"]["timestamp_max"] = tsr[1]
+        info["dataframe"]["timestamp_duration"] = tsr[1] - tsr[0]
+        info["dataframe"]["start_time"] = pd.to_datetime(tsr[0], unit="s")
+        info["dataframe"]["end_time"] = pd.to_datetime(tsr[1], unit="s")
+        info["dataframe"]["duration"] = pd.to_timedelta(tsr[1] - tsr[0], unit="s")
+
+        info["metadata"] = self._attributes.metadata
+        info["config"] = self._config
+        return info
+
     @property
     def dataframe(self) -> Union[pd.DataFrame, ddf.DataFrame]:
         """Accessor to the underlying dataframe.
diff --git a/sed/loader/flash/loader.py b/sed/loader/flash/loader.py
index 119c5451..efb1659c 100644
--- a/sed/loader/flash/loader.py
+++ b/sed/loader/flash/loader.py
@@ -10,6 +10,7 @@
 import time
 from functools import reduce
 from pathlib import Path
+from typing import Any
 from typing import List
 from typing import Sequence
 from typing import Tuple
@@ -52,7 +53,10 @@ def __init__(self, config: dict) -> None:
         self.index_per_pulse: MultiIndex = None
         self.failed_files_error: List[str] = []
 
-        self.prq_metadata = None
+        self.prq_metadata: List[Any] = None
+        self.num_electrons: int = None
+        self.num_electrons_per_part: List[int] = None
+        self.num_pulses: int = None
 
     def initialize_paths(self) -> Tuple[List[Path], Path]:
         """
@@ -727,7 +731,7 @@ def buffer_file_handler(
                 for h5_path, parquet_path in files_to_read
             )
             if any(error):
-                raise RuntimeError(f"Conversion failed for some files. {error}")
+                raise RuntimeError(f"Conversion failed for some files. {error}") from error[0]
 
         # Raise an error if the conversion failed for any files
         # TODO: merge this and the previous error tracking
@@ -740,6 +744,8 @@ def buffer_file_handler(
 
         # read all parquet metadata and schema
         self.prq_metadata = [pq.read_metadata(file) for file in parquet_filenames]
+        self.num_electrons_per_part = [metadata.num_rows for metadata in self.prq_metadata]
+        self.num_electrons = sum(self.num_electrons_per_part)
         schema = [pq.read_schema(file) for file in parquet_filenames]
 
         return parquet_filenames, self.prq_metadata, schema
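
Note: a minimal usage sketch of the new `get_run_info` (the config file and run
number are hypothetical):

    from sed import SedProcessor

    sp = SedProcessor(config="config.yaml")  # hypothetical config file
    sp.load(runs=[44797])                    # hypothetical run number
    info = sp.get_run_info(compute=True)     # compute=True computes missing counts via dask
    print(info["dataframe"]["num_electrons"], info["dataframe"]["duration"])
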
{error}") from error[0] # Raise an error if the conversion failed for any files # TODO: merge this and the previous error trackings @@ -740,6 +744,8 @@ def buffer_file_handler( # read all parquet metadata and schema self.prq_metadata = [pq.read_metadata(file) for file in parquet_filenames] + self.num_electrons_per_part = [metadata.num_rows for metadata in self.prq_metadata] + self.num_electrons = sum(self.num_electrons_per_part) schema = [pq.read_schema(file) for file in parquet_filenames] return parquet_filenames, self.prq_metadata, schema From 1606f06dc6ac3cc3cb072d2c9dd4deaf8233cf74 Mon Sep 17 00:00:00 2001 From: Steinn Ymir Agustsson Date: Wed, 22 Nov 2023 00:17:28 +0100 Subject: [PATCH 03/12] augment info prints --- sed/core/processor.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/sed/core/processor.py b/sed/core/processor.py index 3f9ec4e9..b56da417 100644 --- a/sed/core/processor.py +++ b/sed/core/processor.py @@ -172,9 +172,13 @@ def __repr__(self): df_str = f"Data Frame: {len(info['dataframe']['columns'])} columns.\n" df_str += f"{' '*11} {info['dataframe']['num_electrons']:,.0f} electrons.\n" df_str += f"{' '*11} {info['dataframe']['num_trains']:,.0f} trains.\n" + df_str += f"{' '*11} {info['dataframe']['electrons_per_train']} electrons/train.\n" if "num_pulses" in info["dataframe"]: df_str += f"{' '*11} {info['dataframe']['num_pulses']:,.0f} pulses.\n" - df_str += f"{' '*11} {info['dataframe']['timestamp_duration']:,.2f} seconds.\n" + df_str += ( + f"{' '*11} {info['dataframe']['electrons_per_pulse']} " "electrons/pulse.\n" + ) + df_str += f"{' '*11} {info['dataframe']['timestamp_duration']:,.0f} seconds.\n" df_str += f"{' '*11} {info['dataframe']['duration']}.\n" df_str += ( f"{' '*11} {info['dataframe']['start_time']} to {info['dataframe']['end_time']}.\n" From 8c1ee974b9a6a7a9599bf06a4989a94b1fd8f794 Mon Sep 17 00:00:00 2001 From: Steinn Ymir Agustsson Date: Wed, 22 Nov 2023 00:19:35 +0100 Subject: [PATCH 04/12] augment info prints --- sed/core/processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sed/core/processor.py b/sed/core/processor.py index b56da417..7fce6d41 100644 --- a/sed/core/processor.py +++ b/sed/core/processor.py @@ -172,7 +172,7 @@ def __repr__(self): df_str = f"Data Frame: {len(info['dataframe']['columns'])} columns.\n" df_str += f"{' '*11} {info['dataframe']['num_electrons']:,.0f} electrons.\n" df_str += f"{' '*11} {info['dataframe']['num_trains']:,.0f} trains.\n" - df_str += f"{' '*11} {info['dataframe']['electrons_per_train']} electrons/train.\n" + df_str += f"{' '*11} {info['dataframe']['electrons_per_train']:,.1f} electrons/train.\n" if "num_pulses" in info["dataframe"]: df_str += f"{' '*11} {info['dataframe']['num_pulses']:,.0f} pulses.\n" df_str += ( From e0beeff39a4e28784739f7e081cf964735ec557d Mon Sep 17 00:00:00 2001 From: Steinn Ymir Agustsson Date: Wed, 22 Nov 2023 01:05:10 +0100 Subject: [PATCH 05/12] error catching for empty h5 channel --- sed/loader/flash/loader.py | 34 +++++++++++++++++++++++++++------- 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/sed/loader/flash/loader.py b/sed/loader/flash/loader.py index efb1659c..57190a4f 100644 --- a/sed/loader/flash/loader.py +++ b/sed/loader/flash/loader.py @@ -57,6 +57,7 @@ def __init__(self, config: dict) -> None: self.num_electrons: int = None self.num_electrons_per_part: List[int] = None self.num_pulses: int = None + self.parallel_loader: bool = True def initialize_paths(self) -> Tuple[List[Path], Path]: """ @@ -437,7 +438,15 @@ 
From e0beeff39a4e28784739f7e081cf964735ec557d Mon Sep 17 00:00:00 2001
From: Steinn Ymir Agustsson
Date: Wed, 22 Nov 2023 01:05:10 +0100
Subject: [PATCH 05/12] error catching for empty h5 channel

---
 sed/loader/flash/loader.py | 34 +++++++++++++++++++++++++++-------
 1 file changed, 27 insertions(+), 7 deletions(-)

diff --git a/sed/loader/flash/loader.py b/sed/loader/flash/loader.py
index efb1659c..57190a4f 100644
--- a/sed/loader/flash/loader.py
+++ b/sed/loader/flash/loader.py
@@ -57,6 +57,7 @@ def __init__(self, config: dict) -> None:
         self.num_electrons: int = None
         self.num_electrons_per_part: List[int] = None
         self.num_pulses: int = None
+        self.parallel_loader: bool = True
 
     def initialize_paths(self) -> Tuple[List[Path], Path]:
         """
@@ -437,7 +438,15 @@ def create_dataframe_per_pulse(
 
         # Macrobunch resolved data is exploded to a DataFrame and the MultiIndex is set
         # Creates the index_per_pulse for the given channel
-        self.create_multi_index_per_pulse(train_id, np_array)
+        # if np_array.ndim != 2:
+        #     np_array = np.empty((train_id.size, 0))
+        #     np_array[:, :] = np.nan
+        try:
+            self.create_multi_index_per_pulse(train_id, np_array)
+        except IndexError:
+            raise IndexError(
+                f"IndexError: {channel} seems to be empty.",
+            )
         data = (
             Series((np_array[i] for i in train_id.index), name=channel)
             .explode()
@@ -652,6 +661,7 @@ def buffer_file_handler(
         data_parquet_dir: Path,
         detector: str,
         force_recreate: bool,
+        parallel_loader=True,
     ) -> Tuple[List[Path], List, List]:
         """
         Handles the conversion of buffer files (h5 to parquet) and returns the filenames.
@@ -726,12 +736,20 @@ def buffer_file_handler(
 
         # Convert the remaining h5 files to parquet in parallel if there are any
         if len(files_to_read) > 0:
-            error = Parallel(n_jobs=len(files_to_read), verbose=10)(
-                delayed(self.create_buffer_file)(h5_path, parquet_path)
-                for h5_path, parquet_path in files_to_read
-            )
-            if any(error):
-                raise RuntimeError(f"Conversion failed for some files. {error}") from error[0]
+            if parallel_loader:
+                error = Parallel(n_jobs=len(files_to_read), verbose=10)(
+                    delayed(self.create_buffer_file)(h5_path, parquet_path)
+                    for h5_path, parquet_path in files_to_read
+                )
+                if any(error):
+                    raise RuntimeError(f"Conversion failed for some files. {error}") from error[0]
+            else:
+                for h5_path, parquet_path in files_to_read:
+                    error = self.create_buffer_file(h5_path, parquet_path)
+                    if error:
+                        raise RuntimeError(
+                            f"Conversion failed for file {h5_path}.\n {error}",
+                        ) from error
 
         # Raise an error if the conversion failed for any files
        # TODO: merge this and the previous error tracking
@@ -759,6 +777,7 @@ def parquet_handler(
         load_parquet: bool = False,
         save_parquet: bool = False,
         force_recreate: bool = False,
+        parallel_loader: bool = True,
     ) -> Tuple[dd.DataFrame, dd.DataFrame]:
         """
         Handles loading and saving of parquet files based on the provided parameters.
@@ -809,6 +828,7 @@ def parquet_handler(
             data_parquet_dir,
             detector,
             force_recreate,
+            parallel_loader,
         )
 
         # Read all parquet files into one dataframe using dask
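
Note: a sketch of the error-collection pattern used in `buffer_file_handler`, with a
hypothetical `convert` stand-in for `create_buffer_file` (which returns the exception
on failure and False on success):

    from joblib import Parallel, delayed

    def convert(h5_path, parquet_path):
        try:
            pass  # the h5 -> parquet conversion would happen here
        except Exception as exc:
            return exc  # collected instead of raised, so all jobs finish
        return False

    pairs = [("a.h5", "a.parquet"), ("b.h5", "b.parquet")]  # hypothetical file pairs
    errors = Parallel(n_jobs=len(pairs), verbose=10)(
        delayed(convert)(h5, parquet) for h5, parquet in pairs
    )
    if any(errors):
        raise RuntimeError(f"Conversion failed for some files. {errors}")
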
From 264630cfb98b1a2e7b71a6cb98c2f879883231b2 Mon Sep 17 00:00:00 2001
From: Steinn Ymir Agustsson
Date: Wed, 22 Nov 2023 01:11:47 +0100
Subject: [PATCH 06/12] fix failing load of empty per_pulse channels

---
 sed/loader/flash/loader.py | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/sed/loader/flash/loader.py b/sed/loader/flash/loader.py
index 57190a4f..c641dbd9 100644
--- a/sed/loader/flash/loader.py
+++ b/sed/loader/flash/loader.py
@@ -26,6 +26,7 @@
 from pandas import DataFrame
 from pandas import MultiIndex
 from pandas import Series
+from tqdm.auto import tqdm
 
 from sed.core import dfops
 from sed.loader.base.loader import BaseLoader
@@ -438,9 +439,9 @@ def create_dataframe_per_pulse(
 
         # Macrobunch resolved data is exploded to a DataFrame and the MultiIndex is set
         # Creates the index_per_pulse for the given channel
-        # if np_array.ndim != 2:
-        #     np_array = np.empty((train_id.size, 0))
-        #     np_array[:, :] = np.nan
+        if np_array.ndim != 2:
+            np_array = np.empty((train_id.size, 0))
+            np_array[:, :] = np.nan
         try:
             self.create_multi_index_per_pulse(train_id, np_array)
         except IndexError:
@@ -535,6 +536,8 @@ def create_dataframe_per_channel(
         # Pulse resolved data is treated here
         elif channel_dict["format"] == "per_pulse":
             # Create a DataFrame for pulse-resolved data
+            if np_array.ndim != 2:
+                np_array = np_array.reshape((np_array.size, 1))
             data = self.create_dataframe_per_pulse(
                 np_array,
                 train_id,
@@ -744,7 +747,10 @@ def buffer_file_handler(
                 if any(error):
                     raise RuntimeError(f"Conversion failed for some files. {error}") from error[0]
             else:
-                for h5_path, parquet_path in files_to_read:
+                for h5_path, parquet_path in tqdm(
+                    files_to_read,
+                    desc="Converting h5 files to parquet",
+                ):
                     error = self.create_buffer_file(h5_path, parquet_path)
                     if error:
                         raise RuntimeError(
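
Note: a minimal illustration of the reshape added in `create_dataframe_per_channel`:
a per_pulse channel that comes back from HDF5 as a flat array is coerced to one pulse
entry per train (the values are hypothetical):

    import numpy as np

    np_array = np.array([0.1, 0.2, 0.3])  # per_pulse channel read back as 1-D
    if np_array.ndim != 2:
        np_array = np_array.reshape((np_array.size, 1))
    print(np_array.shape)  # (3, 1): one pulse column per train
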
""" bins, axes, ranges = simplify_binning_arguments(bins, axes, ranges) - + # filter dataframe to use only the columns needed for the binning + df = df[axes] # create the coordinate axes for the xarray output # if provided as array, they are interpreted as bin centers if isinstance(bins[0], np.ndarray): From e8cd06ab9d0496cdf4842ebf7732f6fa701159d7 Mon Sep 17 00:00:00 2001 From: Steinn Ymir Agustsson Date: Wed, 22 Nov 2023 01:43:56 +0100 Subject: [PATCH 08/12] catch numba typing error --- sed/core/processor.py | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/sed/core/processor.py b/sed/core/processor.py index 7fce6d41..5be1d51f 100644 --- a/sed/core/processor.py +++ b/sed/core/processor.py @@ -1999,20 +1999,28 @@ def compute( dataframe = self._dataframe.partitions[df_partitions] else: dataframe = self._dataframe - - self._binned = bin_dataframe( - df=dataframe, - bins=bins, - axes=axes, - ranges=ranges, - hist_mode=hist_mode, - mode=mode, - pbar=pbar, - n_cores=num_cores, - threads_per_worker=threads_per_worker, - threadpool_api=threadpool_api, - **kwds, - ) + try: + self._binned = bin_dataframe( + df=dataframe, + bins=bins, + axes=axes, + ranges=ranges, + hist_mode=hist_mode, + mode=mode, + pbar=pbar, + n_cores=num_cores, + threads_per_worker=threads_per_worker, + threadpool_api=threadpool_api, + **kwds, + ) + except Exception as ex: + if type(ex).__name__ == "TypingError": + raise TypeError( + "Numba TypingError during binning. One of the axes probably has invalid types." + " Could one of the axes be all nans?", + ) from ex + else: + raise ex for dim in self._binned.dims: try: From e776f335bdef66ae25e45230e677a09787ffd977 Mon Sep 17 00:00:00 2001 From: Steinn Ymir Agustsson Date: Thu, 23 Nov 2023 21:21:23 +0100 Subject: [PATCH 09/12] add mono photon energy calculator --- sed/config/flash_example_config.yaml | 64 +++++++++++++++++++++++----- sed/loader/flash/loader.py | 7 +++ sed/loader/utils.py | 53 +++++++++++++++++++++++ 3 files changed, 114 insertions(+), 10 deletions(-) diff --git a/sed/config/flash_example_config.yaml b/sed/config/flash_example_config.yaml index 3c968f0c..320feac7 100644 --- a/sed/config/flash_example_config.yaml +++ b/sed/config/flash_example_config.yaml @@ -87,18 +87,19 @@ dataframe: # slice: if the group contains multidim data, where to slice channels: - # pulse ID is a necessary channel for using the loader. - pulseId: + timeStamp: + format: per_train + group_name: "/uncategorised/FLASH.DIAG/TIMINGINFO/TIME1.BUNCH_FIRST_INDEX.1/" + pulseId: # pulse ID is a necessary channel for using the loader. 
From e776f335bdef66ae25e45230e677a09787ffd977 Mon Sep 17 00:00:00 2001
From: Steinn Ymir Agustsson
Date: Thu, 23 Nov 2023 21:21:23 +0100
Subject: [PATCH 09/12] add mono photon energy calculator

---
 sed/config/flash_example_config.yaml | 64 +++++++++++++++++++++++-----
 sed/loader/flash/loader.py           |  7 +++
 sed/loader/utils.py                  | 53 +++++++++++++++++++++++
 3 files changed, 114 insertions(+), 10 deletions(-)

diff --git a/sed/config/flash_example_config.yaml b/sed/config/flash_example_config.yaml
index 3c968f0c..320feac7 100644
--- a/sed/config/flash_example_config.yaml
+++ b/sed/config/flash_example_config.yaml
@@ -87,18 +87,19 @@ dataframe:
     # slice: if the group contains multidim data, where to slice
 
   channels:
-    # pulse ID is a necessary channel for using the loader.
-    pulseId:
+    timeStamp:
+      format: per_train
+      group_name: "/uncategorised/FLASH.DIAG/TIMINGINFO/TIME1.BUNCH_FIRST_INDEX.1/"
+    pulseId: # pulse ID is a necessary channel for using the loader
      format: per_electron
       group_name: "/uncategorised/FLASH.EXP/HEXTOF.DAQ/DLD1/"
       slice: 2
-
-    dldPosX:
+    # DLD channels
+    dldPosX: # x position on the DLD detector
       format: per_electron
       group_name: "/uncategorised/FLASH.EXP/HEXTOF.DAQ/DLD1/"
       slice: 1
-
-    dldPosY:
+    dldPosY: # y position on the DLD detector
       format: per_electron
       group_name: "/uncategorised/FLASH.EXP/HEXTOF.DAQ/DLD1/"
       slice: 0
@@ -108,10 +109,8 @@ dataframe:
       format: per_electron
       group_name: "/uncategorised/FLASH.EXP/HEXTOF.DAQ/DLD1/"
       slice: 3
-
-    # The auxillary channel has a special structure where the group further contains
-    # a multidim structure so further aliases are defined below
-    dldAux:
+    dldAux: # The auxiliary channel has a special structure where the group further
+            # contains a multidim structure, so further aliases are defined below
       format: per_pulse
       group_name: "/uncategorised/FLASH.EXP/HEXTOF.DAQ/DLD1/"
       slice: 4
       dldAuxChannels:
         sampleBias: 0
         tofVoltage: 1
         extractorVoltage: 2
         extractorCurrent: 3
         cryoTemperature: 4
         sampleTemperature: 5
         dldTimeBinSize: 15
+    # FEL channels
+    gmdBda: # in uJ per pulse
+      format: per_pulse
+      group_name: "/FL1/Photon Diagnostic/GMD/Average energy/energy BDA/"
+      slice: 0
+    gmdPosh: # horizontal position of the FEL
+      format: per_pulse
+      group_name: "/FL1/Photon Diagnostic/GMD/Average energy/energy BDA/"
+      slice: 2
+    gmdPosv: # vertical position of the FEL
+      format: per_pulse
+      group_name: "/FL1/Photon Diagnostic/GMD/Average energy/energy BDA/"
+      slice: 3
+    bam: # Here we use the DBC2 BAM as the "normal" one is broken.
+      format: per_pulse
+      group_name: "/uncategorised/FLASH.SDIAG/BAM.DAQ/FL0.DBC2.ARRIVAL_TIME.ABSOLUTE.SA1.COMP/"
+    monochromatorPhotonEnergy: # single value, to be changed
+      format: per_train
+      group_name: "/FL1/Beamlines/PG/Monochromator/monochromator photon energy/"
+    monoDelta1:
+      format: per_train
+      group_name: "/FL1/Beamlines/PG/Monochromator/ADC.PGM2/"
+      slice: 0
+    monoDelta2:
+      format: per_train
+      group_name: "/FL1/Beamlines/PG/Monochromator/ADC.PGM2/"
+      slice: 1
+    monoMirrorAngle:
+      format: per_train
+      group_name: "/FL1/Beamlines/PG/Monochromator/ADC.PGM2/"
+      slice: 2
+    monoGratingAngle:
+      format: per_train
+      group_name: "/FL1/Beamlines/PG/Monochromator/ADC.PGM2/"
+      slice: 3
+
+    # Optical laser channels
+    delayStage:
+      format: per_train
+      group_name: "/zraw/FLASH.SYNC/LASER.LOCK.EXP/F1.PG.OSC/FMC0.MD22.1.ENCODER_POSITION.RD/dGroup/"
+    opticalDiode:
+      format: per_pulse
+      group_name: "/zraw/FLASH.LASER/FLACPUPGLASER1.PULSEENERGY/PG2_incoupl/dGroup/"
 
   # The prefixes of the stream names for different DAQ systems for parsing filenames
   # (Not to be changed by user)
@@ -139,6 +181,8 @@ dataframe:
   # (Not to be changed by user)
   beamtime_dir:
     pg2: "/asap3/flash/gpfs/pg2/"
+    hextof: "/asap3/fs-flash-o/gpfs/hextof/"
+    wespe: "/asap3/fs-flash-o/gpfs/wespe/"
 
 # metadata collection from scicat
 # metadata:
mono_channels = ["delta1", "delta2"] + if all([channel in df.columns for channel in mono_channels]): + grating_density = self._config["dataframe"].get("gratingDensity", None) + order = self._config["dataframe"].get("order", None) + if grating_density is not None and order is not None: + df = add_monochromator_photon_energy(df) return df def create_buffer_file(self, h5_path: Path, parquet_path: Path) -> Union[bool, Exception]: diff --git a/sed/loader/utils.py b/sed/loader/utils.py index ab3fde3a..f56fe8c6 100644 --- a/sed/loader/utils.py +++ b/sed/loader/utils.py @@ -195,3 +195,56 @@ def split_dld_time_from_sector_id( types=[np.int8, np.int32], ) return df + + +def calculate_monochromator_photon_energy( + delta1: float, + delta2: float, + grating_density: int, + order: int, +) -> float: + """ + Calculates the photon energy of the monochromator using the grating density and order. + + Args: + delta1 (float): The angle of the first crystal in radians. + delta2 (float): The angle of the second crystal in radians. + grating_density (int): The grating density in grooves per mm. + order (int): The order of the diffraction. + + Returns: + float: The photon energy in eV. + """ + alpha = 2.0 * delta1 + 90.0 - delta2 + beta = -1.0 * delta2 - 86.0 + num = 1e6 * (np.sin(beta / 180.0 * np.pi) + np.sin(alpha / 180.0 * np.pi)) + den = order * grating_density + lambda_nm = num / den + hc = 1239.84 + lambda_ev = hc / lambda_nm + return lambda_ev + + +def add_monochromator_photon_energy( + df: Union[pd.DataFrame, dask.dataframe.DataFrame], + config: dict = None, +) -> Union[pd.DataFrame, dask.dataframe.DataFrame]: + mono_channels = ["delta1", "delta2", "gratingDensity", "order"] + if config is None: + raise ValueError("config must be given.") + for channel in mono_channels: + if channel not in config["dataframe"]: + raise ValueError(f"config must contain {channel}.") + if "monoPhotonEnergy" in df.columns: + raise ValueError( + "Column monoPhotonEnergy already in dataframe. 
", + ) + df = df.assign( + monoPhotonEnergy=calculate_monochromator_photon_energy( + df[config["dataframe"]["delta1"]], + df[config["dataframe"]["delta2"]], + config["dataframe"]["gratingDensity"], + config["dataframe"]["order"], + ), + ) + return df From b9e98d2aafbf54ef3702e1eb364294ad72efbd08 Mon Sep 17 00:00:00 2001 From: Steinn Ymir Agustsson Date: Fri, 1 Dec 2023 14:13:57 +0100 Subject: [PATCH 10/12] fix monochromator_photon_energy --- sed/loader/flash/loader.py | 51 +++++++++++++++++++++++++++++--------- sed/loader/utils.py | 4 +-- 2 files changed, 41 insertions(+), 14 deletions(-) diff --git a/sed/loader/flash/loader.py b/sed/loader/flash/loader.py index 155ea34d..1d49dcf4 100644 --- a/sed/loader/flash/loader.py +++ b/sed/loader/flash/loader.py @@ -31,7 +31,7 @@ from sed.core import dfops from sed.loader.base.loader import BaseLoader from sed.loader.flash.metadata import MetadataRetriever -from sed.loader.utils import add_monochromator_photon_energy +from sed.loader.utils import calculate_monochromator_photon_energy from sed.loader.utils import parse_h5_keys from sed.loader.utils import split_dld_time_from_sector_id @@ -350,11 +350,17 @@ def create_numpy_array_per_channel( # Use predefined axis and slice from the json file # to choose correct dimension for necessary channel if "slice" in channel_dict: - np_array = np.take( - np_array, - channel_dict["slice"], - axis=1, - ) + try: + np_array = np.take( + np_array, + channel_dict["slice"], + axis=1, + ) + except np.AxisError: + raise np.AxisError( + f"AxisError: {channel}, looking for slice {channel_dict['slice']}, " + f"when shape is {np_array.shape}", + ) return train_id, np_array def create_dataframe_per_electron( @@ -632,12 +638,33 @@ def create_dataframe_per_file( # correct the 3 bit shift which encodes the detector ID in the 8s time if self._config["dataframe"].get("split_sector_id_from_dld_time", False): df = split_dld_time_from_sector_id(df, config=self._config) - mono_channels = ["delta1", "delta2"] - if all([channel in df.columns for channel in mono_channels]): - grating_density = self._config["dataframe"].get("gratingDensity", None) - order = self._config["dataframe"].get("order", None) - if grating_density is not None and order is not None: - df = add_monochromator_photon_energy(df) + mono_settings = self._config["dataframe"].get("monochromator", None) + if mono_settings is not None: + mono_vals = {} + for k, v in mono_settings.items(): + if k == "channel": + if not isinstance(v, str): + raise ValueError( + f"Invalid channel name {k} in mono_settings. 
From bf79089573b3a4803bac222ae8928a021a5d32a3 Mon Sep 17 00:00:00 2001
From: Steinn Ymir Agustsson
Date: Fri, 1 Dec 2023 14:14:33 +0100
Subject: [PATCH 11/12] linting

---
 sed/loader/flash/loader.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sed/loader/flash/loader.py b/sed/loader/flash/loader.py
index 1d49dcf4..4c3cbce3 100644
--- a/sed/loader/flash/loader.py
+++ b/sed/loader/flash/loader.py
@@ -785,7 +785,7 @@ def buffer_file_handler(
                     files_to_read,
                     desc="Converting h5 files to parquet",
                 ):
-                    error = self.create_buffer_file(h5_path, parquet_path)
+                    error: Exception = self.create_buffer_file(h5_path, parquet_path)
                     if error:
                         raise RuntimeError(
                             f"Conversion failed for file {h5_path}.\n {error}",

From 702203f9e55128b53be484852650aec1fa443697 Mon Sep 17 00:00:00 2001
From: Steinn Ymir Agustsson
Date: Fri, 1 Dec 2023 14:15:28 +0100
Subject: [PATCH 12/12] remove unused function

---
 sed/loader/utils.py | 25 -------------------------
 1 file changed, 25 deletions(-)

diff --git a/sed/loader/utils.py b/sed/loader/utils.py
index 108fa5eb..1a5e0519 100644
--- a/sed/loader/utils.py
+++ b/sed/loader/utils.py
@@ -223,28 +223,3 @@ def calculate_monochromator_photon_energy(
     hc = 1239.84
     energy_ev = hc / lambda_nm
     return energy_ev
-
-
-def add_monochromator_photon_energy(
-    df: Union[pd.DataFrame, dask.dataframe.DataFrame],
-    config: dict = None,
-) -> Union[pd.DataFrame, dask.dataframe.DataFrame]:
-    mono_channels = ["delta1", "delta2", "gratingDensity", "order"]
-    if config is None:
-        raise ValueError("config must be given.")
-    for channel in mono_channels:
-        if channel not in config["dataframe"]:
-            raise ValueError(f"config must contain {channel}.")
-    if "monoPhotonEnergy" in df.columns:
-        raise ValueError(
-            "Column monoPhotonEnergy already in dataframe.",
-        )
-    df = df.assign(
-        monoPhotonEnergy=calculate_monochromator_photon_energy(
-            df[config["dataframe"]["delta1"]],
-            df[config["dataframe"]["delta2"]],
-            config["dataframe"]["gratingDensity"],
-            config["dataframe"]["order"],
-        ),
-    )
-    return df