From fc59e43012096c424aad4d00230d44e71b13f5cc Mon Sep 17 00:00:00 2001 From: C-PROOF Date: Thu, 4 Jul 2024 12:59:18 -0700 Subject: [PATCH] ENH: allow multiple deploymentyaml --- pyglider/ncprocess.py | 8 ++++---- pyglider/seaexplorer.py | 17 ++++++++--------- pyglider/slocum.py | 22 +++++++++++++--------- pyglider/utils.py | 20 ++++++++++++++++++++ 4 files changed, 45 insertions(+), 22 deletions(-) diff --git a/pyglider/ncprocess.py b/pyglider/ncprocess.py index 9c32c34..b0fdb26 100644 --- a/pyglider/ncprocess.py +++ b/pyglider/ncprocess.py @@ -35,8 +35,8 @@ def extract_timeseries_profiles(inname, outdir, deploymentyaml): except FileExistsError: pass - with open(deploymentyaml) as fin: - deployment = yaml.safe_load(fin) + deployment = utils._get_deployment(deploymentyaml) + meta = deployment['metadata'] with xr.open_dataset(inname) as ds: _log.info('Extracting profiles: opening %s', inname) @@ -172,8 +172,8 @@ def make_gridfiles(inname, outdir, deploymentyaml, *, fnamesuffix='', dz=1, star except FileExistsError: pass - with open(deploymentyaml) as fin: - deployment = yaml.safe_load(fin) + deployment = utils._get_deployment(deploymentyaml) + profile_meta = deployment['profile_variables'] ds = xr.open_dataset(inname, decode_times=True) diff --git a/pyglider/seaexplorer.py b/pyglider/seaexplorer.py index 6bf4164..722e1df 100644 --- a/pyglider/seaexplorer.py +++ b/pyglider/seaexplorer.py @@ -128,7 +128,7 @@ def raw_to_rawnc(indir, outdir, deploymentyaml, incremental=True, # Try to read the file with polars. 
If the file is corrupted (rare), file read will fail and file # is appended to badfiles try: - out = pl.read_csv(f, separator=';') + out = pl.read_csv(f, sep=';') except Exception as e: _log.warning(f'Exception reading {f}: {e}') _log.warning(f'Could not read {f}') @@ -137,11 +137,11 @@ def raw_to_rawnc(indir, outdir, deploymentyaml, incremental=True, # Parse the datetime from nav files (called Timestamp) and pld1 files (called PLD_REALTIMECLOCK) if "Timestamp" in out.columns: out = out.with_columns( - pl.col("Timestamp").str.strptime(pl.Datetime, format="%d/%m/%Y %H:%M:%S")) + pl.col("Timestamp").str.strptime(pl.Datetime, fmt="%d/%m/%Y %H:%M:%S")) out = out.rename({"Timestamp": "time"}) else: out = out.with_columns( - pl.col("PLD_REALTIMECLOCK").str.strptime(pl.Datetime, format="%d/%m/%Y %H:%M:%S.%3f")) + pl.col("PLD_REALTIMECLOCK").str.strptime(pl.Datetime, fmt="%d/%m/%Y %H:%M:%S.%3f")) out = out.rename({"PLD_REALTIMECLOCK": "time"}) for col_name in out.columns: if "time" not in col_name.lower(): @@ -150,7 +150,7 @@ def raw_to_rawnc(indir, outdir, deploymentyaml, incremental=True, if 'AD2CP_TIME' in out.columns: # Set datestamps with date 00000 to None out = out.with_columns( - pl.col('AD2CP_TIME').str.strptime(pl.Datetime, format="%m%d%y %H:%M:%S", strict=False)) + pl.col('AD2CP_TIME').str.strptime(pl.Datetime, fmt="%m%d%y %H:%M:%S", strict=False)) # subsetting for heavily oversampled raw data: if rawsub == 'raw' and dropna_subset is not None: @@ -232,8 +232,8 @@ def merge_parquet(indir, outdir, deploymentyaml, incremental=False, kind='raw'): Only add new files.... 
""" - with open(deploymentyaml) as fin: - deployment = yaml.safe_load(fin) + deployment = utils._get_deployment(deploymentyaml) + metadata = deployment['metadata'] id = metadata['glider_name'] outgli = outdir + '/' + id + '-rawgli.parquet' @@ -297,7 +297,6 @@ def _remove_fill_values(df, fill_value=9999): pl.when(pl.col(pl.Float64) == fill_value) .then(None) .otherwise(pl.col(pl.Float64)) - .name.keep() ) return df @@ -309,8 +308,8 @@ def raw_to_timeseries(indir, outdir, deploymentyaml, kind='raw', A little different than above, for the 4-file version of the data set. """ - with open(deploymentyaml) as fin: - deployment = yaml.safe_load(fin) + deployment = utils._get_deployment(deploymentyaml) + metadata = deployment['metadata'] ncvar = deployment['netcdf_variables'] device_data = deployment['glider_devices'] diff --git a/pyglider/slocum.py b/pyglider/slocum.py index 642cab7..a1386b6 100644 --- a/pyglider/slocum.py +++ b/pyglider/slocum.py @@ -16,7 +16,7 @@ import time import xarray as xr import xml.etree.ElementTree as ET -import yaml +from collections.abc import Iterable import pyglider.utils as utils @@ -621,8 +621,8 @@ def merge_rawnc(indir, outdir, deploymentyaml, scisuffix = scisuffix.lower() glidersuffix = glidersuffix.lower() - with open(deploymentyaml) as fin: - deployment = yaml.safe_load(fin) + deployment = utils._get_deployment(deploymentyaml) + metadata = deployment['metadata'] id = metadata['glider_name'] + metadata['glider_serial'] @@ -684,8 +684,7 @@ def raw_to_timeseries(indir, outdir, deploymentyaml, *, name of the new merged netcdf file. """ - with open(deploymentyaml) as fin: - deployment = yaml.safe_load(fin) + deployment = utils._get_deployment(deploymentyaml) metadata = deployment['metadata'] ncvar = deployment['netcdf_variables'] device_data = deployment['glider_devices'] @@ -807,8 +806,13 @@ def binary_to_timeseries(indir, cachedir, outdir, deploymentyaml, *, outdir : string Directory to put the merged timeseries files. 
- deploymentyaml : str - YAML text file with deployment information for this glider. + deploymentyaml : str or list + Name of YAML text file with deployment information for this glider. + + If a list, then the YAML files are read in order, and any top-level dictionaries + in later files overwrite those from the previous YAMLs. The advantage of this is that it allows + metadata that is common to multiple ways of processing the data to come from the + first file, and then subsequent files change "netcdf_variables" if desired. profile_filt_time : float time in seconds over which to smooth the pressure time series for @@ -827,8 +831,8 @@ def binary_to_timeseries(indir, cachedir, outdir, deploymentyaml, *, if not have_dbdreader: raise ImportError('Cannot import dbdreader') - with open(deploymentyaml) as fin: - deployment = yaml.safe_load(fin) + deployment = utils._get_deployment(deploymentyaml) + ncvar = deployment['netcdf_variables'] device_data = deployment['glider_devices'] thenames = list(ncvar.keys()) diff --git a/pyglider/utils.py b/pyglider/utils.py index d466343..076b944 100644 --- a/pyglider/utils.py +++ b/pyglider/utils.py @@ -7,6 +7,8 @@ from scipy.signal import argrelextrema import gsw import logging +import yaml + _log = logging.getLogger(__name__) @@ -674,6 +676,24 @@ def example_gridplot(filename, outname, fig.savefig(outname, dpi=dpi) +def _get_deployment(deploymentyaml): + """ + Take the list of files in *deploymentyaml* and parse them + for deployment information, with subsequent files overwriting + previous files.
+ """ + if isinstance(deploymentyaml, str): + deploymentyaml = [deploymentyaml,] + deployment = {} + for nn, d in enumerate(deploymentyaml): + with open(d) as fin: + deployment_ = yaml.safe_load(fin) + for k in deployment_: + deployment[k] = deployment_[k] + + return deployment + + __all__ = ['get_distance_over_ground', 'get_glider_depth', 'get_profiles_new', 'get_derived_eos_raw', "fill_metadata", "nmea2deg", "gappy_fill_vertical", "oxygen_concentration_correction"]