diff --git a/.circleci/config.yml b/.circleci/config.yml index ce6b01d..17da7d6 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -25,7 +25,6 @@ jobs: - store_artifacts: path: docs/_build/html/ - workflows: main: jobs: diff --git a/.github/workflows/doc-artifacts.yml b/.github/workflows/doc-artifacts.yml index 234278d..c6864b5 100644 --- a/.github/workflows/doc-artifacts.yml +++ b/.github/workflows/doc-artifacts.yml @@ -9,4 +9,4 @@ jobs: with: repo-token: ${{ secrets.GITHUB_TOKEN }} artifact-path: 0/docs/_build/html/index.html - circleci-jobs: build_docs \ No newline at end of file + circleci-jobs: build_docs diff --git a/.github/workflows/publish-to-pypi.yml b/.github/workflows/publish-to-pypi.yml index ba9c31f..db8d02e 100644 --- a/.github/workflows/publish-to-pypi.yml +++ b/.github/workflows/publish-to-pypi.yml @@ -8,90 +8,90 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: "3.x" - - name: Install pypa/build - run: python3 -m pip install build --user - - name: Build a binary wheel and a source tarball - run: python3 -m build - - name: Store the distribution packages - uses: actions/upload-artifact@v4 - with: - name: python-package-distributions - path: dist/ + - uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.x" + - name: Install pypa/build + run: python3 -m pip install build --user + - name: Build a binary wheel and a source tarball + run: python3 -m build + - name: Store the distribution packages + uses: actions/upload-artifact@v4 + with: + name: python-package-distributions + path: dist/ publish-to-pypi: name: >- Publish Python 🐍 distribution 📦 to PyPI - if: startsWith(github.ref, 'refs/tags/') # only publish to PyPI on tag pushes + if: startsWith(github.ref, 'refs/tags/') # only publish to PyPI on tag pushes needs: - - build + - build runs-on: ubuntu-latest environment: name: pypi url: https://pypi.org/p/pyglider permissions: - id-token: write # IMPORTANT: mandatory for trusted publishing + id-token: write # IMPORTANT: mandatory for trusted publishing steps: - - name: Download all the dists - uses: actions/download-artifact@v4 - with: - name: python-package-distributions - path: dist/ - - name: Publish distribution 📦 to PyPI - uses: pypa/gh-action-pypi-publish@release/v1 + - name: Download all the dists + uses: actions/download-artifact@v4 + with: + name: python-package-distributions + path: dist/ + - name: Publish distribution 📦 to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 github-release: name: >- Sign the Python 🐍 distribution 📦 with Sigstore and upload them to GitHub Release needs: - - publish-to-pypi + - publish-to-pypi runs-on: ubuntu-latest permissions: - contents: write # IMPORTANT: mandatory for making GitHub Releases - id-token: write # IMPORTANT: mandatory for sigstore + contents: write # IMPORTANT: mandatory for making GitHub Releases + id-token: write # IMPORTANT: mandatory for sigstore steps: - - name: Download all the dists - uses: actions/download-artifact@v4 - with: - name: python-package-distributions - path: dist/ - - name: Sign the dists with Sigstore - uses: sigstore/gh-action-sigstore-python@v2.1.1 - with: - inputs: >- - ./dist/*.tar.gz - ./dist/*.whl - - name: Create GitHub Release - env: - GITHUB_TOKEN: ${{ github.token }} - run: >- - gh release create - '${{ github.ref_name }}' - --repo '${{ github.repository }}' - --notes "" - - name: Upload artifact signatures to GitHub 
Release - env: - GITHUB_TOKEN: ${{ github.token }} - # Upload to GitHub Release using the `gh` CLI. - # `dist/` contains the built packages, and the - # sigstore-produced signatures and certificates. - run: >- - gh release upload - '${{ github.ref_name }}' dist/** - --repo '${{ github.repository }}' + - name: Download all the dists + uses: actions/download-artifact@v4 + with: + name: python-package-distributions + path: dist/ + - name: Sign the dists with Sigstore + uses: sigstore/gh-action-sigstore-python@v2.1.1 + with: + inputs: >- + ./dist/*.tar.gz + ./dist/*.whl + - name: Create GitHub Release + env: + GITHUB_TOKEN: ${{ github.token }} + run: >- + gh release create + '${{ github.ref_name }}' + --repo '${{ github.repository }}' + --notes "" + - name: Upload artifact signatures to GitHub Release + env: + GITHUB_TOKEN: ${{ github.token }} + # Upload to GitHub Release using the `gh` CLI. + # `dist/` contains the built packages, and the + # sigstore-produced signatures and certificates. + run: >- + gh release upload + '${{ github.ref_name }}' dist/** + --repo '${{ github.repository }}' publish-to-testpypi: name: Publish Python 🐍 distribution 📦 to TestPyPI needs: - - build + - build runs-on: ubuntu-latest environment: @@ -99,15 +99,15 @@ jobs: url: https://test.pypi.org/p/pyglider permissions: - id-token: write # IMPORTANT: mandatory for trusted publishing + id-token: write # IMPORTANT: mandatory for trusted publishing steps: - - name: Download all the dists - uses: actions/download-artifact@v4 - with: - name: python-package-distributions - path: dist/ - - name: Publish distribution 📦 to TestPyPI - uses: pypa/gh-action-pypi-publish@release/v1 - with: - repository-url: https://test.pypi.org/legacy/ \ No newline at end of file + - name: Download all the dists + uses: actions/download-artifact@v4 + with: + name: python-package-distributions + path: dist/ + - name: Publish distribution 📦 to TestPyPI + uses: pypa/gh-action-pypi-publish@release/v1 + with: + repository-url: https://test.pypi.org/legacy/ diff --git a/.gitignore b/.gitignore index 0338b01..195440f 100644 --- a/.gitignore +++ b/.gitignore @@ -5,4 +5,4 @@ docs/pyglider dist *.png -.DS_Store \ No newline at end of file +.DS_Store diff --git a/.readthedocs.yaml b/.readthedocs.yaml index a6835ca..e55e259 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -28,7 +28,6 @@ python: # Install our python package before building the docs - method: pip path: . - # Optionally build your docs in additional formats such as PDF and ePub # formats: # - pdf @@ -39,4 +38,4 @@ python: # See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html # python: # install: -# - requirements: docs/requirements.txt \ No newline at end of file +# - requirements: docs/requirements.txt diff --git a/LICENSE b/LICENSE index 8405e89..37ec93a 100644 --- a/LICENSE +++ b/LICENSE @@ -188,4 +188,4 @@ third-party archives. distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and - limitations under the License. \ No newline at end of file + limitations under the License. diff --git a/README.md b/README.md index c58ff3c..dd9c79a 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ ![](docs/_static/PyGliderHorizontal.svg) -Python tools for interacting with ocean glider data. PyGlider takes data from +Python tools for interacting with ocean glider data. 
PyGlider takes data from Teledyne/Webb Slocum gliders and Alseamar SeaExplorers and creates CF-compliant NetCDF files. @@ -8,4 +8,4 @@ For documentation, please see ### Contact -Get in touch with us using Discussion above or by opening an issue. \ No newline at end of file +Get in touch with us using Discussion above or by opening an issue. diff --git a/codecov.yml b/codecov.yml index fc5b569..7db62aa 100644 --- a/codecov.yml +++ b/codecov.yml @@ -20,10 +20,10 @@ coverage: if_no_uploads: error if_not_found: success if_ci_failed: failure - paths: '!tests/.*' + paths: "!tests/.*" tests: target: auto if_no_uploads: error if_not_found: success if_ci_failed: failure - paths: 'tests/.*' + paths: "tests/.*" diff --git a/docs/Install.md b/docs/Install.md index f8ad89b..b669500 100644 --- a/docs/Install.md +++ b/docs/Install.md @@ -2,8 +2,8 @@ ## conda/pip -PyGlider depends on `dask` and `netcdf4`, both of which can be tricky to install using ``pip``, -hence we recommend these be installed with [``conda``](https://www.anaconda.com/). To install +PyGlider depends on `dask` and `netcdf4`, both of which can be tricky to install using `pip`, +hence we recommend these be installed with [`conda`](https://www.anaconda.com/). To install PyGlider, create an environment, and do ``` @@ -15,13 +15,13 @@ conda install -c conda-forge pyglider ## Editable installation If you want to be able to edit the files in `pyglider/pyglider` then install -the dependencies as above. Fork of PyGlider on github, and then clone it locally: +the dependencies as above. Fork of PyGlider on github, and then clone it locally: ``` git clone https://github.com/yourname/pyglider.git ``` -Navigate to the new ``pyglider`` directory and then do `pip install -e .`. +Navigate to the new `pyglider` directory and then do `pip install -e .`. That will re-install pyglider with links to the local directory, so you -can edit the library files. If you do so, consider making a pull-request +can edit the library files. If you do so, consider making a pull-request with your changes! diff --git a/docs/conf.py b/docs/conf.py index 5994a2b..80cac14 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -32,8 +32,8 @@ # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ - "numpydoc", - "myst_parser", + 'numpydoc', + 'myst_parser', 'sphinx.ext.autodoc', 'sphinx.ext.inheritance_diagram', 'autoapi.sphinx', @@ -42,9 +42,9 @@ extensions.append('sphinx.ext.intersphinx') intersphinx_mapping = { - 'xarray': ('http://xarray.pydata.org/en/stable/', None), - 'python': ('https://docs.python.org/3/', None), - } + 'xarray': ('http://xarray.pydata.org/en/stable/', None), + 'python': ('https://docs.python.org/3/', None), +} autoapi_modules = {'pyglider': None} @@ -62,40 +62,41 @@ # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # -html_theme = "pydata_sphinx_theme" +html_theme = 'pydata_sphinx_theme' # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". 
html_static_path = ['_static'] -html_logo = "_static/PyGliderHorizontal.svg" +html_logo = '_static/PyGliderHorizontal.svg' html_context = { # "github_url": "https://github.com", # or your GitHub Enterprise interprise - "github_user": "c-proof", - "github_repo": "pyglider", - "doc_path": "docs/", + 'github_user': 'c-proof', + 'github_repo': 'pyglider', + 'doc_path': 'docs/', } html_theme_options = { - "icon_links": [ + 'icon_links': [ { # Label for this link - "name": "GitHub", + 'name': 'GitHub', # URL where the link will redirect - "url": "https://github.com/c-proof/pyglider", # required + 'url': 'https://github.com/c-proof/pyglider', # required # Icon class (if "type": "fontawesome"), or path to local image (if "type": "local") - "icon": "fab fa-github-square", + 'icon': 'fab fa-github-square', # The type of image to be used (see below for details) - "type": "fontawesome", + 'type': 'fontawesome', }, { - "name": "PyPI", - "url": "https://pypi.org/project/pyglider", - "icon": "fas fa-box", + 'name': 'PyPI', + 'url': 'https://pypi.org/project/pyglider', + 'icon': 'fas fa-box', }, # { "name": "conda-forge", # "url": "https://anaconda.org/conda-forge/pyglider", # "icon": "fas fa-box" # } - ]} + ] +} diff --git a/docs/getting-started-seaexplorer.md b/docs/getting-started-seaexplorer.md index 87ca7ae..779a33a 100644 --- a/docs/getting-started-seaexplorer.md +++ b/docs/getting-started-seaexplorer.md @@ -1,9 +1,8 @@ # Getting Started: SeaExplorer - ## Gather data -SeaExplorers send back and record two main types of files, glider files (`*.gli.*`) that contain glider navigation information, and payload files (`*.pld1.*`) that contain the science data. These can be subset files, `*.sub.*` that Alseamar decimates for transmission, or they can be full resolution files from the glider (`*.raw.*`), offloaded post mission. The raw or subset files need to be made available in a single directory for `pyglider` to process. +SeaExplorers send back and record two main types of files, glider files (`*.gli.*`) that contain glider navigation information, and payload files (`*.pld1.*`) that contain the science data. These can be subset files, `*.sub.*` that Alseamar decimates for transmission, or they can be full resolution files from the glider (`*.raw.*`), offloaded post mission. The raw or subset files need to be made available in a single directory for `pyglider` to process. You can download and expand example data using `.get_example_data`: @@ -17,25 +16,25 @@ which will add a local directory `example-data` to your current directory. ## Make a deployment configuration file -The processing routines all take a `deployment.yaml` file as an argument, and information from this is used to fill in metadata and to map sensor names to NetCDF variable names. See {ref}`ExDepl`, below. +The processing routines all take a `deployment.yaml` file as an argument, and information from this is used to fill in metadata and to map sensor names to NetCDF variable names. See {ref}`ExDepl`, below. There are four top-levels to the `deployment.yaml` -- `metadata`: The only field that is necessary here is `glider_name`. The rest of the fields will be added to the netcdf files as top-level attributes -- `glider_devices`: This is a list of the glider devices, and any information about them like make, mode, serial number. This is optional, and again is added to the netcdf top-level attributes -- `netcdf_variables`: These are necessary, and map from sensor name (e.g. `source: GPCTD_CONDUCTIVITY`) to a data variable name (e.g. 
`conductivity`). The fields other than `source:` are optional for the processing to run, and are placed in the attributes of the netCDF variable. However, note that many of these attributes are necessary for CF compliance. -- `profile_variables`: This is a mapping for variables that are per-profile, rather than timeseries. They include variables like a mean position and time for the profile, and a mean derived ocean velocities. +- `metadata`: The only field that is necessary here is `glider_name`. The rest of the fields will be added to the netcdf files as top-level attributes +- `glider_devices`: This is a list of the glider devices, and any information about them like make, mode, serial number. This is optional, and again is added to the netcdf top-level attributes +- `netcdf_variables`: These are necessary, and map from sensor name (e.g. `source: GPCTD_CONDUCTIVITY`) to a data variable name (e.g. `conductivity`). The fields other than `source:` are optional for the processing to run, and are placed in the attributes of the netCDF variable. However, note that many of these attributes are necessary for CF compliance. +- `profile_variables`: This is a mapping for variables that are per-profile, rather than timeseries. They include variables like a mean position and time for the profile, and a mean derived ocean velocities. ## Process -The example script is relatively straight forward if there is no intermediate processing. See {ref}`ExProc`, below. - -Data comes from an input directory, and is translated to raw glider-dependent parquet files files and put in a new directory. These files are useful of their own right. Apache Parquet is a columnar oriented format for storing tabular data. Parquet files take up less space than netCDF or csv and are much faster to read and write. These files can be opened with [polars.read_parquet](https://pola-rs.github.io/polars-book/user-guide/howcani/io/parquet.html) or [pandas.read_parquet](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_parquet.html). These files are then merged into a single monolithic parquet file, and this is translated to a CF-compliant timeseries netCDF file. Finally individual profiles are saved and a 2-D 1-m grid in time-depth is saved. +The example script is relatively straight forward if there is no intermediate processing. See {ref}`ExProc`, below. -It is likely that between these steps the user will want to add any screening steps, or adjustments to the calibrations. PyGlider does not provide those steps. +Data comes from an input directory, and is translated to raw glider-dependent parquet files files and put in a new directory. These files are useful of their own right. Apache Parquet is a columnar oriented format for storing tabular data. Parquet files take up less space than netCDF or csv and are much faster to read and write. These files can be opened with [polars.read_parquet](https://pola-rs.github.io/polars-book/user-guide/howcani/io/parquet.html) or [pandas.read_parquet](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_parquet.html). These files are then merged into a single monolithic parquet file, and this is translated to a CF-compliant timeseries netCDF file. Finally individual profiles are saved and a 2-D 1-m grid in time-depth is saved. +It is likely that between these steps the user will want to add any screening steps, or adjustments to the calibrations. PyGlider does not provide those steps. 
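For a quick look at one of these intermediate parquet files, either reader mentioned above works. A minimal sketch, assuming `polars` is installed (the directory and file names are placeholders):

```python
import polars as pl

# Placeholder path: one intermediate parquet file is written per raw SeaExplorer file.
df = pl.read_parquet('realtime_rawnc/sea035.0012.pld1.sub.parquet')
print(df.columns)                # sensor columns plus the parsed "time" column
print(df.select('time').head())  # timestamps parsed to datetimes
```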
(ExDepl)= + ### Example deployment.yaml ```{literalinclude} ../tests/example-data/example-seaexplorer/deploymentRealtime.yml @@ -43,8 +42,9 @@ It is likely that between these steps the user will want to add any screening st ``` (ExProc)= + ### Example processing script -```{literalinclude} ../tests/example-data/example-seaexplorer/process_deploymentRealTime.py +```{literalinclude} ../tests/example-data/example-seaexplorer/process_deploymentRealTime.py :language: python ``` diff --git a/docs/getting-started-slocum.md b/docs/getting-started-slocum.md index e7dfaa6..fd01d30 100644 --- a/docs/getting-started-slocum.md +++ b/docs/getting-started-slocum.md @@ -2,48 +2,47 @@ ## Gather data -Slocum gliders have 4 types of files. For telemetry data there are `*.tbd` files for sensor data, and `*.sbd` for the glider's attitude and position data. These are called `*.ebd` and `*.tbd` respectively, when retrieved from the gliders' payload post deployment. Modern gliders have compressed version of these, eg `*.tcd`, `*.scd` that *pyglider* should be able to parse. These data files need to be made available in a _single_ directory for *pyglider* to process. Note that on the glider they are often separated into `science/logs` and `flight/logs`. +Slocum gliders have 4 types of files. For telemetry data there are `*.tbd` files for sensor data, and `*.sbd` for the glider's attitude and position data. These are called `*.ebd` and `*.tbd` respectively, when retrieved from the gliders' payload post deployment. Modern gliders have compressed version of these, eg `*.tcd`, `*.scd` that _pyglider_ should be able to parse. These data files need to be made available in a _single_ directory for _pyglider_ to process. Note that on the glider they are often separated into `science/logs` and `flight/logs`. -Slocum gliders also have a sensor cache file `*.cac`, all of which have randomized names. These are needed by the processing, and are usually stored in a separate cache directory. +Slocum gliders also have a sensor cache file `*.cac`, all of which have randomized names. These are needed by the processing, and are usually stored in a separate cache directory. You can download example data at which will add a local directory `example-data` to your current directory. - ## Make a deployment configuration file -The processing routines all take a `deployment.yaml` file as an argument, and information from this is used to fill in metadata and to map sensor names to NetCDF variable names. See {ref}`ExDeplSlocum`, below. +The processing routines all take a `deployment.yaml` file as an argument, and information from this is used to fill in metadata and to map sensor names to NetCDF variable names. See {ref}`ExDeplSlocum`, below. There are four top-levels to the `deployment.yaml` -- `metadata`: The only field that is necessary here is `glider_name`. The rest of the fields will be added to the netcdf files as top-level attributes -- `glider_devices`: This is a list of the glider devices, and any information about them like make, mode, serial number. This is optional, and again is added to the netcdf top-level attributes -- `netcdf_variables`: These are necessary, and map from sensor name (e.g. `source: GPCTD_CONDUCTIVITY`) to a data variable name (e.g. `conductivity`). The fields other than `source:` are optional for the processing to run, and are placed in the attributes of the netCDF variable. However, note that many of these attributes are necessary for CF compliance. 
-- `profile_variables`: This is a mapping for variables that are per-profile, rather than timeseries. They include variables like a mean position and time for the profile, and a mean derived ocean velocities. +- `metadata`: The only field that is necessary here is `glider_name`. The rest of the fields will be added to the netcdf files as top-level attributes +- `glider_devices`: This is a list of the glider devices, and any information about them like make, mode, serial number. This is optional, and again is added to the netcdf top-level attributes +- `netcdf_variables`: These are necessary, and map from sensor name (e.g. `source: GPCTD_CONDUCTIVITY`) to a data variable name (e.g. `conductivity`). The fields other than `source:` are optional for the processing to run, and are placed in the attributes of the netCDF variable. However, note that many of these attributes are necessary for CF compliance. +- `profile_variables`: This is a mapping for variables that are per-profile, rather than timeseries. They include variables like a mean position and time for the profile, and a mean derived ocean velocities. ## Process -The example script is relatively straight forward if there is no intermediate processing. See {ref}`ExProcSlocum`, below. +The example script is relatively straight forward if there is no intermediate processing. See {ref}`ExProcSlocum`, below. -Data comes from an input directory, and is translated into a single CF-compliant netCDF timeseries file using the package [dbdreader](https://dbdreader.readthedocs.io/en/latest/). Finally individual profiles are saved and a 2-D 1-m grid in time-depth is saved. +Data comes from an input directory, and is translated into a single CF-compliant netCDF timeseries file using the package [dbdreader](https://dbdreader.readthedocs.io/en/latest/). Finally individual profiles are saved and a 2-D 1-m grid in time-depth is saved. :::{note} -There is a version that does not require `dbdreader` to do the initial conversion from the Dinkum format to netCDF. However it is quite slow, particularly for full-resolution datasets, and less robust. We suggest using the `slocum.raw_to_timeseries`. +There is a version that does not require `dbdreader` to do the initial conversion from the Dinkum format to netCDF. However it is quite slow, particularly for full-resolution datasets, and less robust. We suggest using the `slocum.raw_to_timeseries`. ::: -It is possible that between these steps the user will want to add any screening steps, or adjustments to the calibrations. PyGlider does not provide those steps, but is designed so they are easy to add. - +It is possible that between these steps the user will want to add any screening steps, or adjustments to the calibrations. PyGlider does not provide those steps, but is designed so they are easy to add. 
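A minimal sketch of such a screening step, using `xarray` on the timeseries file before gridding (the file name, variable, and limits are placeholders):

```python
import xarray as xr

# Placeholder screening: mask salinity outside a plausible range, then write a
# screened copy for the gridding step to use instead of the original file.
with xr.open_dataset('L0-timeseries/deployment_timeseries.nc') as ds:
    ds = ds.load()
ds['salinity'] = ds.salinity.where((ds.salinity > 2) & (ds.salinity < 41))
ds.to_netcdf('L0-timeseries/deployment_timeseries_screened.nc')
```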
(ExDeplSlocum)= + ### Example deployment.yaml -```{literalinclude} ../tests/example-data/example-slocum/deploymentRealtime.yml +```{literalinclude} ../tests/example-data/example-slocum/deploymentRealtime.yml :language: yaml ``` (ExProcSlocum)= + ### Example processing script -```{literalinclude} ../tests/example-data/example-slocum/process_deploymentRealTime.py +```{literalinclude} ../tests/example-data/example-slocum/process_deploymentRealTime.py :language: python ``` - diff --git a/docs/index.md b/docs/index.md index dc3de5e..3b4fcaa 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,9 +1,7 @@ - -PyGlider: Convert glider data to NetCDF -======================================= +# PyGlider: Convert glider data to NetCDF PyGlider converts raw glider files to NetCDF timeseries and netcdf depth-time grids, -using python and based on {ref}`xarray`. The NetCDF files should be largely CF compliant. +using python and based on {ref}`xarray`. The NetCDF files should be largely CF compliant. The basic workflow is three glider-dependent steps to make a netcdf timeseries, followed by optional steps to create profiles or depth-time grids. @@ -21,7 +19,7 @@ pgplot.grid_plots(outname2, plottingyaml) Data comes from and is written in directories, and metadata is supplied by a yaml file. -Currently only [Alseamar SeaExplorer](https://www.alseamar-alcen.com/products/underwater-glider/seaexplorer) and [Teledyne/Webb Slocum](http://www.teledynemarine.com/autonomous-underwater-gliders) glider data files are supported, and those with limited configurations. Other gliders will hopefully be added. If you have a glider type or configuration you would like added, [open an issue or pull request!](https://github.com/c-proof/pyglider). +Currently only [Alseamar SeaExplorer](https://www.alseamar-alcen.com/products/underwater-glider/seaexplorer) and [Teledyne/Webb Slocum](http://www.teledynemarine.com/autonomous-underwater-gliders) glider data files are supported, and those with limited configurations. Other gliders will hopefully be added. If you have a glider type or configuration you would like added, [open an issue or pull request!](https://github.com/c-proof/pyglider). ```{toctree} --- @@ -37,9 +35,8 @@ pyglider/pyglider ## Acknowledgements - Slocum binary translator based on - + - Processing steps closely follow the work by SOCIB - + - Rutger's description of the Slocum binary files is very helpful: - The metadata format for NGDAC is here: - diff --git a/environment.yml b/environment.yml index 31db1a8..342ac48 100644 --- a/environment.yml +++ b/environment.yml @@ -14,4 +14,4 @@ dependencies: - bitstring - pooch - pip: - - dbdreader + - dbdreader diff --git a/pyglider/_version.py b/pyglider/_version.py index 0fd1318..2792152 100644 --- a/pyglider/_version.py +++ b/pyglider/_version.py @@ -1 +1 @@ -__version__ = '0.0.7' \ No newline at end of file +__version__ = '0.0.7' diff --git a/pyglider/example_data.py b/pyglider/example_data.py index 67c2ff2..77930da 100644 --- a/pyglider/example_data.py +++ b/pyglider/example_data.py @@ -1,6 +1,8 @@ from zipfile import ZipFile + import pooch + def get_example_data(outdir='./'): """ Get example data sets and configuration files @@ -12,11 +14,14 @@ def get_example_data(outdir='./'): ``outdir/example-data/``. Default is to unpack in the current directory. 
""" - zipfile = pooch.retrieve("https://cproof.uvic.ca/pyglider-example-data/pyglider-example-data.zip", - known_hash='5643a5301530e8dd60060a357cd9ed88eb1e84d761710c2a4013bc3c1817a859') + zipfile = pooch.retrieve( + 'https://cproof.uvic.ca/pyglider-example-data/pyglider-example-data.zip', + known_hash='5643a5301530e8dd60060a357cd9ed88eb1e84d761710c2a4013bc3c1817a859', + ) with ZipFile(zipfile, 'r') as zipObj: # Extract all the contents of zip file in outdir zipObj.extractall(outdir) -__all__ = ['get_example_data'] \ No newline at end of file + +__all__ = ['get_example_data'] diff --git a/pyglider/ncprocess.py b/pyglider/ncprocess.py index 1278b38..332e7f4 100644 --- a/pyglider/ncprocess.py +++ b/pyglider/ncprocess.py @@ -2,14 +2,16 @@ Routines that are used for common processing of netcdf files after they have been converted to standard timeseries. """ + import logging -import xarray as xr -import numpy as np -import pyglider.utils as utils import os -import yaml + import netCDF4 +import numpy as np import scipy.stats as stats +import xarray as xr + +import pyglider.utils as utils _log = logging.getLogger(__name__) @@ -44,8 +46,7 @@ def extract_timeseries_profiles(inname, outdir, deploymentyaml, force=False): with xr.open_dataset(inname) as ds: _log.info('Extracting profiles: opening %s', inname) profiles = np.unique(ds.profile_index) - profiles = [p for p in profiles if (~np.isnan(p) and not (p % 1) - and (p > 0))] + profiles = [p for p in profiles if (~np.isnan(p) and not (p % 1) and (p > 0))] for p in profiles: ind = np.where(ds.profile_index == p)[0] dss = ds.isel(time=ind) @@ -58,7 +59,8 @@ def extract_timeseries_profiles(inname, outdir, deploymentyaml, force=False): dss['trajectory'].attrs['cf_role'] = 'trajectory_id' dss['trajectory'].attrs['comment'] = ( 'A trajectory is a single' - 'deployment of a glider and may span multiple data files.') + 'deployment of a glider and may span multiple data files.' + ) dss['trajectory'].attrs['long_name'] = 'Trajectory/Deployment Name' # profile-averaged variables.... 
@@ -79,13 +81,16 @@ def extract_timeseries_profiles(inname, outdir, deploymentyaml, force=False): dss['u'] = np.nan dss['v'] = np.nan - dss['profile_id'] = np.int32(p) dss['profile_id'].attrs = profile_meta['profile_id'] if '_FillValue' not in dss['profile_id'].attrs: dss['profile_id'].attrs['_FillValue'] = -1 - dss['profile_id'].attrs['valid_min'] = np.int32(dss['profile_id'].attrs['valid_min']) - dss['profile_id'].attrs['valid_max'] = np.int32(dss['profile_id'].attrs['valid_max']) + dss['profile_id'].attrs['valid_min'] = np.int32( + dss['profile_id'].attrs['valid_min'] + ) + dss['profile_id'].attrs['valid_max'] = np.int32( + dss['profile_id'].attrs['valid_max'] + ) dss['profile_time'] = dss.time.mean() dss['profile_time'].attrs = profile_meta['profile_time'] @@ -103,20 +108,20 @@ def extract_timeseries_profiles(inname, outdir, deploymentyaml, force=False): dss['lat'] = dss['latitude'] dss['lon'] = dss['longitude'] dss['platform'] = np.int32(1) - comment = (meta['glider_model'] + ' operated by ' + - meta['institution']) + comment = meta['glider_model'] + ' operated by ' + meta['institution'] dss['platform'].attrs['comment'] = comment dss['platform'].attrs['id'] = ( - meta['glider_name'] + meta['glider_serial']) + meta['glider_name'] + meta['glider_serial'] + ) dss['platform'].attrs['instrument'] = 'instrument_ctd' dss['platform'].attrs['long_name'] = ( - meta['glider_model'] + dss['platform'].attrs['id']) + meta['glider_model'] + dss['platform'].attrs['id'] + ) dss['platform'].attrs['type'] = 'platform' dss['platform'].attrs['wmo_id'] = meta['wmo_id'] if '_FillValue' not in dss['platform'].attrs: dss['platform'].attrs['_FillValue'] = -1 - dss['lat_uv'] = np.nan dss['lat_uv'].attrs = profile_meta['lat_uv'] dss['lon_uv'] = np.nan @@ -134,13 +139,20 @@ def extract_timeseries_profiles(inname, outdir, deploymentyaml, force=False): # ancillary variables: link and create with values of 2. If # we dont' want them all 2, then create these variables in the # time series - to_fill = ['temperature', 'pressure', 'conductivity', - 'salinity', 'density', 'lon', 'lat', 'depth'] + to_fill = [ + 'temperature', + 'pressure', + 'conductivity', + 'salinity', + 'density', + 'lon', + 'lat', + 'depth', + ] for name in to_fill: qcname = name + '_qc' dss[name].attrs['ancillary_variables'] = qcname if qcname not in dss.keys(): - dss[qcname] = ('time', 2 * np.ones(len(dss[name]), np.int8)) dss[qcname].attrs = utils.fill_required_qcattrs({}, name) # 2 is "not eval" @@ -153,22 +165,30 @@ def extract_timeseries_profiles(inname, outdir, deploymentyaml, force=False): del dss.profile_time.attrs['units'] except KeyError: pass - dss.to_netcdf(outname, encoding={'time': {'units': timeunits, - 'calendar': timecalendar, - 'dtype': 'float64'}, - 'profile_time': - {'units': timeunits, - '_FillValue': -99999.0, - 'dtype': 'float64'}, - } - - ) + dss.to_netcdf( + outname, + encoding={ + 'time': { + 'units': timeunits, + 'calendar': timecalendar, + 'dtype': 'float64', + }, + 'profile_time': { + 'units': timeunits, + '_FillValue': -99999.0, + 'dtype': 'float64', + }, + }, + ) # add traj_strlen using bare ntcdf to make IOOS happy with netCDF4.Dataset(outname, 'r+') as nc: nc.renameDimension('string%d' % trajlen, 'traj_strlen') -def make_gridfiles(inname, outdir, deploymentyaml, *, fnamesuffix='', dz=1, starttime='1970-01-01'): + +def make_gridfiles( + inname, outdir, deploymentyaml, *, fnamesuffix='', dz=1, starttime='1970-01-01' +): """ Turn a timeseries netCDF file into a vertically gridded netCDF. 
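A minimal usage sketch for `make_gridfiles`, called from a user processing script with the keyword-only arguments shown above (paths and the yaml name are placeholders):

```python
import pyglider.ncprocess as ncprocess

# Grid the timeseries into 1 m depth bins; the function returns the output path.
gridfile = ncprocess.make_gridfiles(
    'L0-timeseries/deployment_timeseries.nc',  # timeseries netCDF input
    'L0-gridfiles',                            # output directory
    'deployment.yml',                          # deployment metadata yaml
    dz=1,                                      # vertical bin size in metres
)
```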
@@ -211,9 +231,8 @@ def make_gridfiles(inname, outdir, deploymentyaml, *, fnamesuffix='', dz=1, star _log.debug(str(ds.time[-1])) profiles = np.unique(ds.profile_index) - profiles = [p for p in profiles if (~np.isnan(p) and not (p % 1) - and (p > 0))] - profile_bins = np.hstack((np.array(profiles) - 0.5, [profiles[-1]+0.5])) + profiles = [p for p in profiles if (~np.isnan(p) and not (p % 1) and (p > 0))] + profile_bins = np.hstack((np.array(profiles) - 0.5, [profiles[-1] + 0.5])) _log.debug(profile_bins) Nprofiles = len(profiles) _log.info(f'Nprofiles {Nprofiles}') @@ -221,23 +240,27 @@ def make_gridfiles(inname, outdir, deploymentyaml, *, fnamesuffix='', dz=1, star depths = depth_bins[:-1] + 0.5 xdimname = 'time' dsout = xr.Dataset( - coords={'depth': ('depth', depths), - 'profile': (xdimname, profiles)}) - dsout['depth'].attrs = {'units': 'm', - 'long_name': 'Depth', - 'standard_name': 'depth', - 'positive': 'down', - 'coverage_content_type': 'coordinate', - 'comment': 'center of depth bins'} + coords={'depth': ('depth', depths), 'profile': (xdimname, profiles)} + ) + dsout['depth'].attrs = { + 'units': 'm', + 'long_name': 'Depth', + 'standard_name': 'depth', + 'positive': 'down', + 'coverage_content_type': 'coordinate', + 'comment': 'center of depth bins', + } ds['time_1970'] = ds.temperature.copy() ds['time_1970'].values = ds.time.values.astype(np.float64) for td in ('time_1970', 'longitude', 'latitude'): good = np.where(~np.isnan(ds[td]) & (ds['profile_index'] % 1 == 0))[0] dat, xedges, binnumber = stats.binned_statistic( - ds['profile_index'].values[good], - ds[td].values[good], statistic='mean', - bins=[profile_bins]) + ds['profile_index'].values[good], + ds[td].values[good], + statistic='mean', + bins=[profile_bins], + ) if td == 'time_1970': td = 'time' dat = dat.astype('timedelta64[ns]') + np.datetime64('1970-01-01T00:00:00') @@ -246,10 +269,8 @@ def make_gridfiles(inname, outdir, deploymentyaml, *, fnamesuffix='', dz=1, star ds.drop('time_1970') good = np.where(~np.isnan(ds['time']) & (ds['profile_index'] % 1 == 0))[0] _log.info(f'Done times! {len(dat)}') - dsout['profile_time_start'] = ( - (xdimname), dat, profile_meta['profile_time_start']) - dsout['profile_time_end'] = ( - (xdimname), dat, profile_meta['profile_time_end']) + dsout['profile_time_start'] = ((xdimname), dat, profile_meta['profile_time_start']) + dsout['profile_time_end'] = ((xdimname), dat, profile_meta['profile_time_end']) for k in ds.keys(): if k in ['time', 'profile', 'longitude', 'latitude', 'depth'] or 'time' in k: @@ -258,22 +279,26 @@ def make_gridfiles(inname, outdir, deploymentyaml, *, fnamesuffix='', dz=1, star good = np.where(~np.isnan(ds[k]) & (ds['profile_index'] % 1 == 0))[0] if len(good) <= 0: continue - if "average_method" in ds[k].attrs: - average_method = ds[k].attrs["average_method"] - ds[k].attrs["processing"] = ( - f"Using average method {average_method} for " - f"variable {k} following deployment yaml.") - if average_method == "geometric mean": + if 'average_method' in ds[k].attrs: + average_method = ds[k].attrs['average_method'] + ds[k].attrs['processing'] = ( + f'Using average method {average_method} for ' + f'variable {k} following deployment yaml.' 
+ ) + if average_method == 'geometric mean': average_method = stats.gmean - ds[k].attrs["processing"] += (" Using geometric mean implementation " - "scipy.stats.gmean") + ds[k].attrs['processing'] += ( + ' Using geometric mean implementation ' 'scipy.stats.gmean' + ) else: - average_method = "mean" + average_method = 'mean' dat, xedges, yedges, binnumber = stats.binned_statistic_2d( - ds['profile_index'].values[good], - ds['depth'].values[good], - values=ds[k].values[good], statistic=average_method, - bins=[profile_bins, depth_bins]) + ds['profile_index'].values[good], + ds['depth'].values[good], + values=ds[k].values[good], + statistic=average_method, + bins=[profile_bins, depth_bins], + ) _log.debug(f'dat{np.shape(dat)}') dsout[k] = (('depth', xdimname), dat.T, ds[k].attrs) @@ -282,15 +307,13 @@ def make_gridfiles(inname, outdir, deploymentyaml, *, fnamesuffix='', dz=1, star dsout[k].values = utils.gappy_fill_vertical(dsout[k].values) # fix u and v, because they should really not be gridded... - if (('water_velocity_eastward' in dsout.keys()) and - ('u' in profile_meta.keys())): + if ('water_velocity_eastward' in dsout.keys()) and ('u' in profile_meta.keys()): _log.debug(str(ds.water_velocity_eastward)) dsout['u'] = dsout.water_velocity_eastward.mean(axis=0) dsout['u'].attrs = profile_meta['u'] dsout['v'] = dsout.water_velocity_northward.mean(axis=0) dsout['v'].attrs = profile_meta['v'] - dsout = dsout.drop(['water_velocity_eastward', - 'water_velocity_northward']) + dsout = dsout.drop(['water_velocity_eastward', 'water_velocity_northward']) dsout.attrs = ds.attrs dsout.attrs.pop('cdm_data_type') # fix to be ISO parsable: @@ -321,16 +344,20 @@ def make_gridfiles(inname, outdir, deploymentyaml, *, fnamesuffix='', dz=1, star else: dsout[k].attrs['coverage_content_type'] = 'physicalMeasurement' - outname = outdir + '/' + ds.attrs['deployment_name'] + '_grid' + fnamesuffix + '.nc' _log.info('Writing %s', outname) # timeunits = 'nanoseconds since 1970-01-01T00:00:00Z' dsout.to_netcdf( outname, - encoding={'time': {'units': 'seconds since 1970-01-01T00:00:00Z', - '_FillValue': np.nan, - 'calendar': 'gregorian', - 'dtype': 'float64'}}) + encoding={ + 'time': { + 'units': 'seconds since 1970-01-01T00:00:00Z', + '_FillValue': np.nan, + 'calendar': 'gregorian', + 'dtype': 'float64', + } + }, + ) _log.info('Done gridding') return outname diff --git a/pyglider/seaexplorer.py b/pyglider/seaexplorer.py index 7057444..09e9c15 100644 --- a/pyglider/seaexplorer.py +++ b/pyglider/seaexplorer.py @@ -2,15 +2,17 @@ """ SeaExplorer-specific processing routines. """ + +import datetime import glob import logging -import numpy as np import os + +import numpy as np +import polars as pl import xarray as xr -import yaml + import pyglider.utils as utils -import datetime -import polars as pl _log = logging.getLogger(__name__) @@ -31,15 +33,22 @@ def _outputname(f, outdir): def _needsupdating(ftype, fin, fout): if not os.path.isfile(fout): return True - return (os.path.getmtime(fin) >= os.path.getmtime(fout)) + return os.path.getmtime(fin) >= os.path.getmtime(fout) def _sort(ds): return ds.sortby('time') -def raw_to_rawnc(indir, outdir, deploymentyaml, incremental=True, - min_samples_in_file=5, dropna_subset=None, dropna_thresh=1): +def raw_to_rawnc( + indir, + outdir, + deploymentyaml, + incremental=True, + min_samples_in_file=5, + dropna_subset=None, + dropna_thresh=1, +): """ Convert seaexplorer text files to raw parquet pandas files. 
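A minimal usage sketch for `raw_to_rawnc` as called from a user processing script (directory names and the yaml name are placeholders):

```python
import pyglider.seaexplorer as seaexplorer

# Convert the raw *.gli.*/*.pld1.* text files to per-file parquet; returns True
# when at least one file was converted successfully.
ok = seaexplorer.raw_to_rawnc(
    'realtime_raw/',     # directory containing the raw SeaExplorer files
    'realtime_rawnc/',   # output directory for the parquet files
    'deployment.yml',    # deployment metadata yaml
    incremental=True,    # skip files whose parquet output is already up to date
)
```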
@@ -135,49 +144,69 @@ def raw_to_rawnc(indir, outdir, deploymentyaml, incremental=True, badfiles.append(f) continue # Parse the datetime from nav files (called Timestamp) and pld1 files (called PLD_REALTIMECLOCK) - if "Timestamp" in out.columns: + if 'Timestamp' in out.columns: out = out.with_columns( - pl.col("Timestamp").str.strptime(pl.Datetime, format="%d/%m/%Y %H:%M:%S")) - out = out.rename({"Timestamp": "time"}) + pl.col('Timestamp').str.strptime( + pl.Datetime, format='%d/%m/%Y %H:%M:%S' + ) + ) + out = out.rename({'Timestamp': 'time'}) else: out = out.with_columns( - pl.col("PLD_REALTIMECLOCK").str.strptime(pl.Datetime, format="%d/%m/%Y %H:%M:%S.%3f")) - out = out.rename({"PLD_REALTIMECLOCK": "time"}) + pl.col('PLD_REALTIMECLOCK').str.strptime( + pl.Datetime, format='%d/%m/%Y %H:%M:%S.%3f' + ) + ) + out = out.rename({'PLD_REALTIMECLOCK': 'time'}) for col_name in out.columns: - if "time" not in col_name.lower(): + if 'time' not in col_name.lower(): out = out.with_columns(pl.col(col_name).cast(pl.Float64)) # If AD2CP data present, convert timestamps to datetime if 'AD2CP_TIME' in out.columns: # Set datestamps with date 00000 to None out = out.with_columns( - pl.col('AD2CP_TIME').str.strptime(pl.Datetime, format="%m%d%y %H:%M:%S", strict=False)) + pl.col('AD2CP_TIME').str.strptime( + pl.Datetime, format='%m%d%y %H:%M:%S', strict=False + ) + ) # subsetting for heavily oversampled raw data: if rawsub == 'raw' and dropna_subset is not None: # This check is the polars equivalent of pandas dropna. See docstring note on dropna - out = out.with_columns(out.select(pl.col(dropna_subset).is_null().cast(pl.Int64)) - .sum(axis=1).alias("null_count")).filter( - pl.col("null_count") <= dropna_thresh) \ - .drop("null_count") + out = ( + out.with_columns( + out.select( + pl.col(dropna_subset).is_null().cast(pl.Int64) + ) + .sum(axis=1) + .alias('null_count') + ) + .filter(pl.col('null_count') <= dropna_thresh) + .drop('null_count') + ) if ftype == 'gli': - out = out.with_columns([(pl.col("NavState") * 0 + int(filenum)).alias("fnum")]) + out = out.with_columns( + [(pl.col('NavState') * 0 + int(filenum)).alias('fnum')] + ) out.write_parquet(fnout) goodfiles.append(f) else: - if out.select("time").shape[0] > min_samples_in_file: + if out.select('time').shape[0] > min_samples_in_file: out.write_parquet(fnout) goodfiles.append(f) else: - _log.warning('Number of sensor data points' - 'too small. Skipping parquet write') + _log.warning( + 'Number of sensor data points' + 'too small. 
Skipping parquet write' + ) badfiles.append(f) if len(badfiles) > 0: _log.warning('Some files could not be parsed:') for fn in badfiles: _log.warning('%s', fn) if not goodfiles: - _log.warning(f'No valid unprocessed seaexplorer files found in'f'{indir}') + _log.warning(f'No valid unprocessed seaexplorer files found in' f'{indir}') return False _log.info('All raw files converted to parquet') return True @@ -198,13 +227,13 @@ def drop_pre_1971_samples(df): """ dt_1971 = datetime.datetime(1971, 1, 1) # If all dates before or after 1971-01-01, return the dataset - pre_1971 = df.filter(pl.col("time") < dt_1971) + pre_1971 = df.filter(pl.col('time') < dt_1971) if len(pre_1971) == len(df): return pre_1971 - post_1971 = df.filter(pl.col("time") > dt_1971) + post_1971 = df.filter(pl.col('time') > dt_1971) if len(post_1971) == len(df): return post_1971 - return df.filter(pl.col("time") > dt_1971) + return df.filter(pl.col('time') > dt_1971) def merge_parquet(indir, outdir, deploymentyaml, incremental=False, kind='raw'): @@ -267,22 +296,22 @@ def _interp_gli_to_pld(gli, ds, val, indctd): gli_ind = ~np.isnan(val) # switch for if we are comparing two polars dataframes or a polars dataframe and a xarray dataset if type(ds) is pl.DataFrame: - valout = np.interp(ds["time"], - gli.filter(gli_ind)["time"], - val[gli_ind]) + valout = np.interp(ds['time'], gli.filter(gli_ind)['time'], val[gli_ind]) else: - valout = np.interp(ds['time'].astype(int), - np.array(gli.filter(gli_ind)["time"].to_numpy().astype('datetime64[ns]')).astype(int), - val[gli_ind]) + valout = np.interp( + ds['time'].astype(int), + np.array( + gli.filter(gli_ind)['time'].to_numpy().astype('datetime64[ns]') + ).astype(int), + val[gli_ind], + ) return valout def _interp_pld_to_pld(pld, ds, val, indctd): pld_ind = np.where(~np.isnan(val))[0] if len(pld_ind) != len(indctd): - val = np.interp(ds['time'], - pld['time'][pld_ind], - val[pld_ind]) + val = np.interp(ds['time'], pld['time'][pld_ind], val[pld_ind]) else: val = val[indctd] return val @@ -302,9 +331,17 @@ def _remove_fill_values(df, fill_value=9999): return df -def raw_to_timeseries(indir, outdir, deploymentyaml, kind='raw', - profile_filt_time=100, profile_min_time=300, - maxgap=10, interpolate=False, fnamesuffix=''): +def raw_to_timeseries( + indir, + outdir, + deploymentyaml, + kind='raw', + profile_filt_time=100, + profile_min_time=300, + maxgap=10, + interpolate=False, + fnamesuffix='', +): """ A little different than above, for the 4-file version of the data set. """ @@ -333,23 +370,31 @@ def raw_to_timeseries(indir, outdir, deploymentyaml, kind='raw', # If present, use the timebase specified in ncvar: timebase in the # deployment yaml. 
if 'timebase' not in ncvar: - raise ValueError("Must specify timebase:source in netcdf_variables section of deployment yaml") + raise ValueError( + 'Must specify timebase:source in netcdf_variables section of deployment yaml' + ) if ncvar['timebase']['source'] not in sensor.columns: - raise ValueError(f"timebase source: {ncvar['timebase']['source']} not found in pld1 columns") + raise ValueError( + f"timebase source: {ncvar['timebase']['source']} not found in pld1 columns" + ) vals = sensor.select([ncvar['timebase']['source']]).to_numpy()[:, 0] indctd = np.where(~np.isnan(vals))[0] - ds['time'] = (('time'), sensor.select('time').to_numpy()[indctd, 0].astype('datetime64[ns]'), attr) + ds['time'] = ( + ('time'), + sensor.select('time').to_numpy()[indctd, 0].astype('datetime64[ns]'), + attr, + ) thenames = list(ncvar.keys()) # Check yaml to see if interpolate has been set to True - if "interpolate" in thenames: - if ncvar["interpolate"]: + if 'interpolate' in thenames: + if ncvar['interpolate']: interpolate = True for i in ['time', 'timebase', 'keep_variables', 'interpolate']: if i in thenames: thenames.remove(i) for name in thenames: _log.info('interpolating ' + name) - if not ('method' in ncvar[name].keys()): + if 'method' not in ncvar[name].keys(): # variables that are in the data set or can be interpolated from it if 'conversion' in ncvar[name].keys(): convert = getattr(utils, ncvar[name]['conversion']) @@ -363,17 +408,19 @@ def raw_to_timeseries(indir, outdir, deploymentyaml, kind='raw', # coarsen oxygen data as originally perscribed coarsen_time = ncvar[name]['coarsen'] # create a boolean mask of coarsened timesteps. Use this mask to create an array of samples to keep - coarse_ints = np.arange(0, len(sensor) / coarsen_time, 1 / coarsen_time).astype(int) - sensor_sub = sensor.with_columns(pl.lit(coarse_ints).alias("coarse_ints")) + coarse_ints = np.arange( + 0, len(sensor) / coarsen_time, 1 / coarsen_time + ).astype(int) + sensor_sub = sensor.with_columns( + pl.lit(coarse_ints).alias('coarse_ints') + ) # Subsample the variable data keeping only the samples from the coarsened timeseries - sensor_sub_grouped = sensor_sub.with_columns( - pl.col('time').to_physical() - ).group_by( - pl.col('coarse_ints'), - maintain_order=True - ).mean().with_columns( - pl.col('time').cast(pl.Datetime('ms')) - )[:-1, :] + sensor_sub_grouped = ( + sensor_sub.with_columns(pl.col('time').to_physical()) + .group_by(pl.col('coarse_ints'), maintain_order=True) + .mean() + .with_columns(pl.col('time').cast(pl.Datetime('ms')))[:-1, :] + ) val2 = sensor_sub_grouped.select(sensorname).to_numpy()[:, 0] val = _interp_gli_to_pld(sensor_sub_grouped, sensor, val2, indctd) if interpolate and not np.isnan(val).all(): @@ -381,19 +428,33 @@ def raw_to_timeseries(indir, outdir, deploymentyaml, kind='raw', time_var = time_original[np.where(~np.isnan(val))[0]] var_non_nan = val[np.where(~np.isnan(val))[0]] time_timebase = sensor.select('time').to_numpy()[indctd, 0] - if val.dtype == " 0)[0] + 1 - ds['longitude'].values = np.interp(ds.time, ds.time[good], - ds.longitude[good]) - ds['latitude'].values = np.interp(ds.time, ds.time[good], - ds.latitude[good]) + good = ( + np.where(np.abs(np.diff(ds.longitude)) + np.abs(np.diff(ds.latitude)) > 0)[0] + + 1 + ) + ds['longitude'].values = np.interp(ds.time, ds.time[good], ds.longitude[good]) + ds['latitude'].values = np.interp(ds.time, ds.time[good], ds.latitude[good]) # keep only timestamps with data from one of a set of variables if 'keep_variables' in ncvar: @@ -434,27 +495,30 @@ def 
raw_to_timeseries(indir, outdir, deploymentyaml, kind='raw', ds = utils.get_glider_depth(ds) ds = utils.get_distance_over_ground(ds) # ds = utils.get_profiles(ds) - ds = utils.get_profiles_new(ds, filt_time=profile_filt_time, - profile_min_time=profile_min_time) + ds = utils.get_profiles_new( + ds, filt_time=profile_filt_time, profile_min_time=profile_min_time + ) ds = utils.get_derived_eos_raw(ds) # somehow this comes out unsorted: ds = ds.sortby(ds.time) # Drop duplicate timestamps and check how many are removed this way len_before_drop = len(ds.time) - if hasattr(ds, "drop_duplicates"): - ds = ds.drop_duplicates(dim="time") + if hasattr(ds, 'drop_duplicates'): + ds = ds.drop_duplicates(dim='time') else: - time_diffs = (ds.time.astype(int).diff(dim="time") > 1e-6).values + time_diffs = (ds.time.astype(int).diff(dim='time') > 1e-6).values time_diffs_list = list(time_diffs) time_diffs_list.append(True) good = np.array(time_diffs_list) ds = ds.isel(time=good) len_after_drop = len(ds.time) proportion_kept = len_after_drop / len_before_drop - loss_str = f"{100 * (1 - proportion_kept)} % samples removed by timestamp deduplication." + loss_str = ( + f'{100 * (1 - proportion_kept)} % samples removed by timestamp deduplication.' + ) if proportion_kept < 0.5: - raise ValueError(f"{loss_str} Check input data for duplicate timestamps") + raise ValueError(f'{loss_str} Check input data for duplicate timestamps') elif proportion_kept < 0.999: _log.warning(loss_str) else: @@ -465,8 +529,9 @@ def raw_to_timeseries(indir, outdir, deploymentyaml, kind='raw', if 'correct_oxygen' in ncvar['oxygen_concentration'].keys(): ds = utils.oxygen_concentration_correction(ds, ncvar) else: - _log.warning('correct_oxygen not found in oxygen yaml. No' - 'correction applied') + _log.warning( + 'correct_oxygen not found in oxygen yaml. No' 'correction applied' + ) ds = ds.assign_coords(longitude=ds.longitude) ds = ds.assign_coords(latitude=ds.latitude) ds = ds.assign_coords(depth=ds.depth) @@ -494,9 +559,13 @@ def raw_to_timeseries(indir, outdir, deploymentyaml, kind='raw', if 'ad2cp_time' in list(ds): if 'units' in ds.ad2cp_time.attrs.keys(): ds.ad2cp_time.attrs.pop('units') - ds.to_netcdf(outname, 'w', - encoding={'time': {'units': 'seconds since 1970-01-01T00:00:00Z', - 'dtype': 'float64'}}) + ds.to_netcdf( + outname, + 'w', + encoding={ + 'time': {'units': 'seconds since 1970-01-01T00:00:00Z', 'dtype': 'float64'} + }, + ) return outname diff --git a/pyglider/slocum.py b/pyglider/slocum.py index 83dfea3..4d43b81 100644 --- a/pyglider/slocum.py +++ b/pyglider/slocum.py @@ -2,32 +2,42 @@ Routines to convert raw slocum dinkum files to netcdf timeseries. """ -import bitstring + from datetime import datetime + +import bitstring + try: import dbdreader + have_dbdreader = True except ImportError: have_dbdreader = True import glob import logging -import numpy as np import os import re import time -import xarray as xr import xml.etree.ElementTree as ET -from collections.abc import Iterable -import pyglider.utils as utils +import numpy as np +import xarray as xr +import pyglider.utils as utils _log = logging.getLogger(__name__) -def binary_to_rawnc(indir, outdir, cacdir, - sensorlist, deploymentyaml, - incremental=True, scisuffix='EBD', glidersuffix='DBD'): +def binary_to_rawnc( + indir, + outdir, + cacdir, + sensorlist, + deploymentyaml, + incremental=True, + scisuffix='EBD', + glidersuffix='DBD', +): """ Convert slocum binary data (*.ebd/*.dbd) to raw netcdf files. 
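A minimal usage sketch for `binary_to_rawnc` as called from a user processing script (the directories, sensor filter list, and yaml name are placeholders):

```python
import pyglider.slocum as slocum

# Convert each binary *.EBD/*.DBD file to a raw per-file netCDF, using the *.cac
# sensor cache files in cacdir and the sensors named in the filter list.
slocum.binary_to_rawnc(
    'raw/',               # directory with the *.EBD/*.DBD files
    'rawnc/',             # output directory for the per-file netCDFs
    'cac/',               # directory with the *.cac sensor cache files
    'sensor_filter.txt',  # list of sensors to keep
    'deployment.yml',     # deployment metadata yaml
    scisuffix='EBD',
    glidersuffix='DBD',
)
```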
@@ -99,26 +109,31 @@ def binary_to_rawnc(indir, outdir, cacdir, try: fmeta, _ = dbd_get_meta(filen, cachedir=cacdir) path, ext = os.path.splitext(filen) - fncname = (fmeta['the8x3_filename'] + '.' + - fmeta['filename_extension'] + '.nc') + fncname = ( + fmeta['the8x3_filename'] + '.' + fmeta['filename_extension'] + '.nc' + ) fullfncname = outdir + '/' + fncname ncfilesexist = os.path.isfile(fullfncname) if incremental and ncfilesexist: - ncfilesold = (os.path.getmtime(filen) >= - os.path.getmtime(fullfncname)) + ncfilesold = os.path.getmtime(filen) >= os.path.getmtime( + fullfncname + ) else: ncfilesold = True if ncfilesold: - fdata, fmeta = dbd_to_dict( - filen, cacdir, keys=keys) + fdata, fmeta = dbd_to_dict(filen, cacdir, keys=keys) # need a unique index that increases monotonically, and is # different for each file (we can't use time because it is # not necessarily monotonic): - deployment_ind_flight = (int(fmeta['the8x3_filename']) * 1.0e6) + deployment_ind_flight = int(fmeta['the8x3_filename']) * 1.0e6 ds, deployment_ind_flight = datameta_to_nc( - fdata, fmeta, outdir=outdir, name=fncname, - deployment_ind=deployment_ind_flight) + fdata, + fmeta, + outdir=outdir, + name=fncname, + deployment_ind=deployment_ind_flight, + ) except Exception as e: badfiles += [files[ind]] _log.warning('Could not do parsing for %s', files[ind]) @@ -149,13 +164,18 @@ def _decode_sensor_info(dfh, meta): line = dfh.readline().decode('utf-8') if line.split(':')[0] != 's': raise ValueError('Failed to parse sensor info') - splitLine = [string.strip() for string in line.split(' ')[1:] - if string and not string.isspace()] + splitLine = [ + string.strip() + for string in line.split(' ')[1:] + if string and not string.isspace() + ] sensorInfo[splitLine[-2]] = splitLine if splitLine[0] == 'T': activeSensorList[int(splitLine[2])] = { - 'name': splitLine[-2], 'unit': splitLine[-1], - 'bits': splitLine[-3]} + 'name': splitLine[-2], + 'unit': splitLine[-1], + 'bits': splitLine[-3], + } outlines = outlines + [line] bindatafilepos = dfh.tell() # keep this for seeking @@ -171,16 +191,16 @@ def _get_cached_sensorlist(cachedir, meta): dd = glob.glob(cachedir + '/*') found = False for d in dd: - if (os.path.split(d)[1].upper() == - os.path.split(fname0)[1].upper()): + if os.path.split(d)[1].upper() == os.path.split(fname0)[1].upper(): found = True break if not found: raise FileNotFoundError(f'Could not find {fname0}') with open(d, 'rb') as dfh: - activeSensorList, sensorInfo, outlines, bindatafilepos = \ - _decode_sensor_info(dfh, meta) + activeSensorList, sensorInfo, outlines, bindatafilepos = _decode_sensor_info( + dfh, meta + ) return activeSensorList, sensorInfo @@ -226,7 +246,7 @@ def dbd_get_meta(filename, cachedir): with open(filename, 'rb') as dfh: meta['num_ascii_tags'] = 99 # read the first 99 lines - while (len(meta) < int(meta['num_ascii_tags'])): + while len(meta) < int(meta['num_ascii_tags']): line = dfh.readline().decode('utf-8') meta[line.split(':')[0]] = line.split(':')[1].strip() if len(meta) != int(meta['num_ascii_tags']): @@ -235,16 +255,15 @@ def dbd_get_meta(filename, cachedir): localcache = False # if the sensor data is here, we need to read it, even though we # will use the cache - if ('sensor_list_factored' in meta and - not int(meta['sensor_list_factored'])): + if 'sensor_list_factored' in meta and not int(meta['sensor_list_factored']): localcache = True - activeSensorList, sensorInfo, outlines, bindatafilepos = \ + activeSensorList, sensorInfo, outlines, bindatafilepos = ( _decode_sensor_info(dfh, 
meta) + ) # read the cache first. If its not there, try to make one.... try: - activeSensorList, sensorInfo = \ - _get_cached_sensorlist(cachedir, meta) + activeSensorList, sensorInfo = _get_cached_sensorlist(cachedir, meta) except FileNotFoundError: if localcache: _log.info('No cache file found; trying to create one') @@ -255,7 +274,8 @@ def dbd_get_meta(filename, cachedir): f'{meta["sensor_list_crc"]}. These are often found in ', 'offloaddir/Science/STATE/CACHE/ or ', 'offloaddir/Main_board/STATE/CACHE/. ', - f'Copy those locally into {cachedir}') + f'Copy those locally into {cachedir}', + ) meta['activeSensorList'] = activeSensorList # get the file's timestamp... meta['_dbdfiletimestamp'] = os.path.getmtime(filename) @@ -330,19 +350,19 @@ def dbd_to_dict(dinkum_file, cachedir, keys=None): elif data == 13330: endian = 'le' else: - _log.warning("integer failure: %s != 4660", data) - raise ValueError("Diagnostic header check failed.") + _log.warning('integer failure: %s != 4660', data) + raise ValueError('Diagnostic header check failed.') _log.debug('Endianness is %s', endian) data = binaryData.read(f'float{endian}:32') if not np.allclose(data, 123.456): - _log.warning("float32 failure: %s != 123.456", data) - raise ValueError("Diagnostic header check failed.") + _log.warning('float32 failure: %s != 123.456', data) + raise ValueError('Diagnostic header check failed.') data = binaryData.read(f'float{endian}:64') if not np.allclose(data, 123456789.12345): - _log.warning("float64 failure: %s != 123456789.12345", data) - raise ValueError("Diagnostic header check failed.") + _log.warning('float64 failure: %s != 123456789.12345', data) + raise ValueError('Diagnostic header check failed.') _log.debug('Diagnostic check passed. Endian is %s', endian) nsensors = int(meta['sensors_per_cycle']) @@ -353,7 +373,7 @@ def dbd_to_dict(dinkum_file, cachedir, keys=None): # 01 means updated with 'same value', 10 means updated with a new value, # 11 is reserved, 00 is not updated. # This character leads off each byte cycle. - frameCheck = binaryData.read('bytes:1').decode("utf-8") + frameCheck = binaryData.read('bytes:1').decode('utf-8') updatedCode = ['00'] * int(meta['sensors_per_cycle']) # Data cycle begins now. @@ -377,17 +397,18 @@ def dbd_to_dict(dinkum_file, cachedir, keys=None): elif code == '10': # New value. if int(activeSensorList[i]['bits']) in [4, 8]: currentValues[i] = binaryData.read( - f'float{endian}:' + - str(int(activeSensorList[i]['bits']) * 8)) + f'float{endian}:' + str(int(activeSensorList[i]['bits']) * 8) + ) elif int(activeSensorList[i]['bits']) in [1, 2]: currentValues[i] = binaryData.read( - f'uint{endian}:' + - str(int(activeSensorList[i]['bits']) * 8)) + f'uint{endian}:' + str(int(activeSensorList[i]['bits']) * 8) + ) else: raise ValueError('Bad bits') else: - raise ValueError(('Unrecognizable code in data cycle. ', - 'Parsing failed')) + raise ValueError( + ('Unrecognizable code in data cycle. 
', 'Parsing failed') + ) data[ndata] = currentValues binaryData.bytealign() @@ -395,8 +416,9 @@ def dbd_to_dict(dinkum_file, cachedir, keys=None): try: d = binaryData.peek('bytes:1').decode('utf-8') except bitstring.ReadError: - _log.debug('position at end of stream %d', - binaryData.pos + 8 * bindatafilepos) + _log.debug( + 'position at end of stream %d', binaryData.pos + 8 * bindatafilepos + ) _log.warning('End of file reached without termination char') d = 'X' if d == 'd': @@ -405,21 +427,23 @@ def dbd_to_dict(dinkum_file, cachedir, keys=None): if ndata % DINKUMCHUNKSIZE == 0: # need to allocate more data! data = np.concatenate( - (data, np.nan + np.zeros((DINKUMCHUNKSIZE, nsensors))), - axis=0) + (data, np.nan + np.zeros((DINKUMCHUNKSIZE, nsensors))), axis=0 + ) elif d == 'X': # End of file cycle tag. We made it through. # throw out pre-allocated data we didn't use... data = data[:ndata] break else: - raise ValueError(f'Parsing failed at {binaryData.bytepos}. ', - f'Got {d} expected d or X') + raise ValueError( + f'Parsing failed at {binaryData.bytepos}. ', f'Got {d} expected d or X' + ) proctimeend = time.time() - _log.info(('%s lines of data read from %s, data rate of %s rows ' - 'per second') % (len(data), dinkum_file, - len(data) / (proctimeend - proctimestart))) + _log.info( + ('%s lines of data read from %s, data rate of %s rows ' 'per second') + % (len(data), dinkum_file, len(data) / (proctimeend - proctimestart)) + ) dfh.close() _log.info('Putting data into dictionary') @@ -453,14 +477,16 @@ def add_times_flight_sci(fdata, sdata=None): # Basically throw out the leading flight timestamp if its lag threshhold # is too high. - uniqueTimes, uniqueTimeIndices = np.unique(fdata['m_present_time'], - return_index=True) + uniqueTimes, uniqueTimeIndices = np.unique( + fdata['m_present_time'], return_index=True + ) if len(uniqueTimes) != len(fdata['m_present_time']): # Correct the duplicates in the flight timestamps.. _log.warning('Duplicate flight entries detected.') # Fix common problems with science data set. - fdata['sci_m_present_time_fixed'] = (fdata['sci_m_present_time'] + - fdata['m_science_clothesline_lag']) + fdata['sci_m_present_time_fixed'] = ( + fdata['sci_m_present_time'] + fdata['m_science_clothesline_lag'] + ) # There are some nans in the sci_m_present_time_fixed set. # We need to interpolate them. @@ -469,8 +495,10 @@ def add_times_flight_sci(fdata, sdata=None): bad = ~good if not np.all(bad): fdata['sci_m_present_time_fixed'][bad] = np.interp( - fdata['m_present_time'][bad], fdata['m_present_time'][good], - fdata['sci_m_present_time_fixed'][good]) + fdata['m_present_time'][bad], + fdata['m_present_time'][good], + fdata['sci_m_present_time_fixed'][good], + ) if sdata is not None: lag_threshhold = np.nanmax(fdata['u_max_clothesline_lag_for_consci']) @@ -486,16 +514,17 @@ def add_times_flight_sci(fdata, sdata=None): # With the recommended fix from Kerfoot's lab and D. Pingal # Some of the m_present_times are nan-ed... 
good = np.logical_and( - (fdata['m_science_clothesline_lag'][uniqueTimeIndices] < - lag_threshhold), - np.isfinite(fdata['m_science_clothesline_lag'][uniqueTimeIndices])) + (fdata['m_science_clothesline_lag'][uniqueTimeIndices] < lag_threshhold), + np.isfinite(fdata['m_science_clothesline_lag'][uniqueTimeIndices]), + ) if not np.all(~good): tf = fdata['sci_m_present_time_fixed'][uniqueTimeIndices[good]] pt = fdata['m_present_time'][uniqueTimeIndices[good]] sdata['m_present_time_sci'] = np.interp( - sdata['sci_m_present_time'], tf, pt, np.nan, np.nan) + sdata['sci_m_present_time'], tf, pt, np.nan, np.nan + ) else: sdata['m_present_time_sci'] = np.nan * sdata['sci_m_present_time'] @@ -506,7 +535,7 @@ def parse_filter_file(filter_file): keys = [] with open(filter_file) as fin: for li in fin: - if not li[0] in ['#']: + if li[0] not in ['#']: lis = li.split(' ') if len(lis) == 1: key = lis[0].rstrip() @@ -515,8 +544,9 @@ def parse_filter_file(filter_file): return keys -def datameta_to_nc(data, meta, outdir=None, name=None, check_exists=False, - deployment_ind=0): +def datameta_to_nc( + data, meta, outdir=None, name=None, check_exists=False, deployment_ind=0 +): """ Convert a raw dinkum data and meta dict to a netcdf file. @@ -545,10 +575,10 @@ def datameta_to_nc(data, meta, outdir=None, name=None, check_exists=False, outname = outdir + '/' + name if check_exists: - if (os.path.isfile(outname) and - (os.path.getmtime(outname) > meta['_dbdfiletimestamp'])): - _log.info('%s already exists and is newer than timestamp in meta', - outname) + if os.path.isfile(outname) and ( + os.path.getmtime(outname) > meta['_dbdfiletimestamp'] + ): + _log.info('%s already exists and is newer than timestamp in meta', outname) return None ds = xr.Dataset() @@ -596,8 +626,7 @@ def datameta_to_nc(data, meta, outdir=None, name=None, check_exists=False, return ds, deployment_ind -def merge_rawnc(indir, outdir, deploymentyaml, - scisuffix='EBD', glidersuffix='DBD'): +def merge_rawnc(indir, outdir, deploymentyaml, scisuffix='EBD', glidersuffix='DBD'): """ Merge all the raw netcdf files in indir. These are meant to be the raw flight and science files from the slocum. 
@@ -640,11 +669,11 @@ def merge_rawnc(indir, outdir, deploymentyaml, else: bad = False if bad: - os.rename(f, f+'.singleton') + os.rename(f, f + '.singleton') fglider = f try: fglider = fglider.replace(scisuffix, glidersuffix) - os.rename(fglider, fglider+'.singleton') + os.rename(fglider, fglider + '.singleton') except FileNotFoundError: pass @@ -664,8 +693,9 @@ def merge_rawnc(indir, outdir, deploymentyaml, ds.to_netcdf(outnebd, 'w') -def raw_to_timeseries(indir, outdir, deploymentyaml, *, - profile_filt_time=100, profile_min_time=300): +def raw_to_timeseries( + indir, outdir, deploymentyaml, *, profile_filt_time=100, profile_min_time=300 +): """ Parameters ---------- @@ -732,7 +762,7 @@ def raw_to_timeseries(indir, outdir, deploymentyaml, *, _log.debug('EBD sensorname %s', sensorname) val = ebd[sensorname] val = utils._zero_screen(val) - # val[val==0] = np.nan + # val[val==0] = np.nan val = convert(val) else: _log.debug('DBD sensorname %s', sensorname) @@ -758,8 +788,9 @@ def raw_to_timeseries(indir, outdir, deploymentyaml, *, ds = ds.assign_coords(latitude=ds.latitude) ds = ds.assign_coords(depth=ds.depth) - ds['time'] = (ds.time.values.astype('timedelta64[s]') + - np.datetime64('1970-01-01T00:00:00')).astype('datetime64[ns]') + ds['time'] = ( + ds.time.values.astype('timedelta64[s]') + np.datetime64('1970-01-01T00:00:00') + ).astype('datetime64[ns]') ds = utils.fill_metadata(ds, deployment['metadata'], device_data) start = ds['time'].values[0] end = ds['time'].values[-1] @@ -769,7 +800,8 @@ def raw_to_timeseries(indir, outdir, deploymentyaml, *, _log.debug(ds.depth.values[:100]) _log.debug(ds.depth.values[2000:2100]) ds = utils.get_profiles_new( - ds, filt_time=profile_filt_time, profile_min_time=profile_min_time) + ds, filt_time=profile_filt_time, profile_min_time=profile_min_time + ) _log.debug(ds.depth.values[:100]) _log.debug(ds.depth.values[2000:2100]) @@ -777,21 +809,31 @@ def raw_to_timeseries(indir, outdir, deploymentyaml, *, os.mkdir(outdir) except: pass - outname = (outdir + '/' + ds.attrs['deployment_name'] + '.nc') + outname = outdir + '/' + ds.attrs['deployment_name'] + '.nc' _log.info('writing %s', outname) - ds.to_netcdf(outname, 'w', - encoding={'time': {'units': 'seconds since 1970-01-01T00:00:00Z'}}) + ds.to_netcdf( + outname, 'w', encoding={'time': {'units': 'seconds since 1970-01-01T00:00:00Z'}} + ) if id0 is None: id0 = ds.attrs['deployment_name'] return outname -def binary_to_timeseries(indir, cachedir, outdir, deploymentyaml, *, - search='*.[D|E]BD', fnamesuffix='', - time_base='sci_water_temp', profile_filt_time=100, - profile_min_time=300, maxgap=300, - replace_attrs=None): +def binary_to_timeseries( + indir, + cachedir, + outdir, + deploymentyaml, + *, + search='*.[D|E]BD', + fnamesuffix='', + time_base='sci_water_temp', + profile_filt_time=100, + profile_min_time=300, + maxgap=300, + replace_attrs=None, +): """ Convert directly from binary files to netcdf timeseries file. Requires dbdreader to be installed. 
@@ -854,8 +896,7 @@ def binary_to_timeseries(indir, cachedir, outdir, deploymentyaml, *, # get the dbd file _log.info(f'{indir}/{search}') - dbd = dbdreader.MultiDBD(pattern=f'{indir}/{search}', - cacheDir=cachedir) + dbd = dbdreader.MultiDBD(pattern=f'{indir}/{search}', cacheDir=cachedir) # build a new data set based on info in `deployment.` # We will use ebd.m_present_time as the interpolant if the @@ -927,7 +968,6 @@ def binary_to_timeseries(indir, cachedir, outdir, deploymentyaml, *, attrs = utils.fill_required_attrs(attrs) ds[name] = (('time'), val, attrs) - _log.info(f'Getting glider depths, {ds}') _log.debug(f'HERE, {ds.pressure[0:100]}') @@ -937,9 +977,9 @@ def binary_to_timeseries(indir, cachedir, outdir, deploymentyaml, *, ds = utils.get_derived_eos_raw(ds) # screen out-of-range times; these won't convert: - ds['time'] = ds.time.where((ds.time>0) & (ds.time<6.4e9), np.nan) + ds['time'] = ds.time.where((ds.time > 0) & (ds.time < 6.4e9), np.nan) # convert time to datetime64: - ds['time'] = (ds.time*1e9).astype('datetime64[ns]') + ds['time'] = (ds.time * 1e9).astype('datetime64[ns]') ds['time'].attrs = attr ds = utils.fill_metadata(ds, deployment['metadata'], device_data) @@ -956,7 +996,8 @@ def binary_to_timeseries(indir, cachedir, outdir, deploymentyaml, *, if (profile_filt_time is not None) and (profile_min_time is not None): ds = utils.get_profiles_new( - ds, filt_time=profile_filt_time, profile_min_time=profile_min_time) + ds, filt_time=profile_filt_time, profile_min_time=profile_min_time + ) _log.debug(ds.depth.values[:100]) _log.debug(ds.depth.values[2000:2100]) @@ -964,14 +1005,21 @@ def binary_to_timeseries(indir, cachedir, outdir, deploymentyaml, *, os.mkdir(outdir) except: pass - outname = (outdir + '/' + ds.attrs['deployment_name'] + fnamesuffix + '.nc') + outname = outdir + '/' + ds.attrs['deployment_name'] + fnamesuffix + '.nc' _log.info('writing %s', outname) # convert time back to float64 seconds for ERDDAP etc happiness, as they won't take ns # as a unit: - ds.to_netcdf(outname, 'w', - encoding={'time': {'units': 'seconds since 1970-01-01T00:00:00Z', - '_FillValue': np.nan, - 'dtype': 'float64'}}) + ds.to_netcdf( + outname, + 'w', + encoding={ + 'time': { + 'units': 'seconds since 1970-01-01T00:00:00Z', + '_FillValue': np.nan, + 'dtype': 'float64', + } + }, + ) return outname @@ -980,8 +1028,7 @@ def binary_to_timeseries(indir, cachedir, outdir, deploymentyaml, *, raw_to_L1timeseries = raw_to_L0timeseries = raw_to_timeseries -def timeseries_get_profiles(inname, profile_filt_time=100, - profile_min_time=400): +def timeseries_get_profiles(inname, profile_filt_time=100, profile_min_time=400): """ Parameters ---------- @@ -993,7 +1040,8 @@ def timeseries_get_profiles(inname, profile_filt_time=100, """ with xr.open_dataset(inname) as ds: ds = utils.get_profiles_new( - ds, filt_time=profile_filt_time, profile_min_time=profile_min_time) + ds, filt_time=profile_filt_time, profile_min_time=profile_min_time + ) ds.to_netcdf(inname, mode='a') return inname @@ -1008,7 +1056,8 @@ def _dbd2ebd(dbd, ds, val): if (len(goodt) > 1) and (len(good) > 1): _log.debug(f'GOOD, {goodt}, {good}') vout[goodt] = np.interp( - ds.time[goodt].values, dbd.m_present_time.values[good], val[good].values) + ds.time[goodt].values, dbd.m_present_time.values[good], val[good].values + ) return vout @@ -1027,10 +1076,11 @@ def parse_gliderState(fname): xarray with fields time, lon, lat """ - data = {'lon': ('report', np.zeros(10000)), - 'lat': ('report', np.zeros(10000)), - 'time': ('report', 
np.zeros(10000, - dtype='datetime64[s]'))} + data = { + 'lon': ('report', np.zeros(10000)), + 'lat': ('report', np.zeros(10000)), + 'time': ('report', np.zeros(10000, dtype='datetime64[s]')), + } dat = xr.Dataset(data_vars=data) @@ -1086,16 +1136,16 @@ def parse_logfiles(files): found_time = True if found_time and 'GPS Location' in ll: gps[ntimes - 1] = ll - if found_time and "sensor:m_coulomb_amphr_total" in ll: - amph[ntimes-1] = ll + if found_time and 'sensor:m_coulomb_amphr_total' in ll: + amph[ntimes - 1] = ll if ll.startswith(' sensor:m_lithium_battery_relative_charge'): - pattern = r'=(\d+\.\d+)' - match = re.search(pattern, ll) - relcharge[ntimes-1] = float(match.group(1)) + pattern = r'=(\d+\.\d+)' + match = re.search(pattern, ll) + relcharge[ntimes - 1] = float(match.group(1)) if ll.startswith(' sensor:m_battery(volts)='): - pattern = r'=(\d+\.\d+)' - match = re.search(pattern, ll) - volts[ntimes-1] = float(match.group(1)) + pattern = r'=(\d+\.\d+)' + match = re.search(pattern, ll) + volts[ntimes - 1] = float(match.group(1)) amph = amph[:ntimes] gps = gps[:ntimes] times = times[:ntimes] @@ -1103,7 +1153,8 @@ def parse_logfiles(files): relcharge = relcharge[:ntimes] # now parse them out = xr.Dataset( - coords={'time': ('surfacing', np.zeros(ntimes, dtype='datetime64[ns]'))}) + coords={'time': ('surfacing', np.zeros(ntimes, dtype='datetime64[ns]'))} + ) out['ampH'] = ('surfacing', np.zeros(ntimes) * np.nan) out['lon'] = ('surfacing', np.zeros(ntimes) * np.nan) out['lat'] = ('surfacing', np.zeros(ntimes) * np.nan) @@ -1115,10 +1166,11 @@ def parse_logfiles(files): timestring = times[i][11:-13] try: out['time'][i] = np.datetime64( - datetime.strptime(timestring, '%a %b %d %H:%M:%S %Y'), 'ns') + datetime.strptime(timestring, '%a %b %d %H:%M:%S %Y'), 'ns' + ) st = amph[i].index('=') en = amph[i][st:].index(' ') + st - out['ampH'][i] = float(amph[i][(st+1):en]) + out['ampH'][i] = float(amph[i][(st + 1) : en]) sp = gps[i].split(' ') out['lat'][i] = utils.nmea2deg(float(sp[3])) out['lon'][i] = utils.nmea2deg(float(sp[5])) @@ -1161,18 +1213,18 @@ def parse_logfiles_maybe(files): if 'Curr Time:' in l: times[ntimes] = l ntimes += 1 - found_time=True + found_time = True elif found_time and 'GPS Location' in l: gps[ntimes - 1] = l - elif found_time and "sensor:m_coulomb_amphr_total" in l: - amph[ntimes-1] = l - elif found_time and "Because:" in l: + elif found_time and 'sensor:m_coulomb_amphr_total' in l: + amph[ntimes - 1] = l + elif found_time and 'Because:' in l: surfacereason = l - elif found_time and "MissionNum" in l: - missionnum[ntimes-1] = l - elif found_time and "abort segment:" in l: + elif found_time and 'MissionNum' in l: + missionnum[ntimes - 1] = l + elif found_time and 'abort segment:' in l: abortsegment = l - elif found_time and "abort cause:" in l: + elif found_time and 'abort cause:' in l: abortcause = l amph = amph[:ntimes] @@ -1181,7 +1233,9 @@ def parse_logfiles_maybe(files): missionnum = missionnum[:ntimes] # now parse them - out = xr.Dataset(coords={'time': ('surfacing', np.zeros(ntimes, dtype='datetime64[ns]'))}) + out = xr.Dataset( + coords={'time': ('surfacing', np.zeros(ntimes, dtype='datetime64[ns]'))} + ) out['ampH'] = ('surfacing', np.zeros(ntimes) * np.nan) out['lon'] = ('surfacing', np.zeros(ntimes) * np.nan) out['lat'] = ('surfacing', np.zeros(ntimes) * np.nan) @@ -1193,12 +1247,14 @@ def parse_logfiles_maybe(files): for i in range(ntimes): timestring = times[i][11:-13] - out['time'][i] = np.datetime64(datetime.strptime(timestring, '%a %b %d %H:%M:%S %Y'), 
'ns') + out['time'][i] = np.datetime64( + datetime.strptime(timestring, '%a %b %d %H:%M:%S %Y'), 'ns' + ) try: if '=' in amph[i]: st = amph[i].index('=') en = amph[i][st:].index(' ') + st - out['ampH'][i] = float(amph[i][(st+1):en]) + out['ampH'][i] = float(amph[i][(st + 1) : en]) # GPS Location: 4912.737 N -12357.253 E measured 110.757 secs ago sp = gps[i].split() @@ -1214,8 +1270,10 @@ def parse_logfiles_maybe(files): return out - - - -__all__ = ['binary_to_rawnc', 'merge_rawnc', 'raw_to_timeseries', - 'parse_gliderState', 'parse_logfiles'] +__all__ = [ + 'binary_to_rawnc', + 'merge_rawnc', + 'raw_to_timeseries', + 'parse_gliderState', + 'parse_logfiles', +] diff --git a/pyglider/utils.py b/pyglider/utils.py index e55350c..fe5db06 100644 --- a/pyglider/utils.py +++ b/pyglider/utils.py @@ -1,15 +1,16 @@ """ Utilities that are used for processing scripts. """ + import collections -import xarray as xr -import numpy as np -from scipy.signal import argrelextrema -import gsw import logging from pathlib import Path -import yaml +import gsw +import numpy as np +import xarray as xr +import yaml +from scipy.signal import argrelextrema _log = logging.getLogger(__name__) @@ -32,16 +33,18 @@ def get_distance_over_ground(ds): good = ~np.isnan(ds.latitude + ds.longitude) if np.any(good): - dist = gsw.distance(ds.longitude[good].values, ds.latitude[good].values)/1000 + dist = gsw.distance(ds.longitude[good].values, ds.latitude[good].values) / 1000 dist = np.roll(np.append(dist, 0), 1) dist = np.cumsum(dist) dist = np.interp(ds.time, ds.time[good], dist) else: dist = 0 * ds.latitude.values - attr = {'long_name': 'distance over ground flown since mission start', - 'method': 'get_distance_over_ground', - 'units': 'km', - 'sources': 'latitude longitude'} + attr = { + 'long_name': 'distance over ground flown since mission start', + 'method': 'get_distance_over_ground', + 'units': 'km', + 'sources': 'latitude longitude', + } ds['distance_over_ground'] = (('time'), dist, attr) return ds @@ -66,31 +69,39 @@ def get_glider_depth(ds): ds['depth'] = ds.pressure try: meanlat = ds.latitude.mean(skipna=True) - ds['depth'].values = -gsw.z_from_p(ds.pressure.values, - ds.latitude.fillna(meanlat).values) + ds['depth'].values = -gsw.z_from_p( + ds.pressure.values, ds.latitude.fillna(meanlat).values + ) except AttributeError: pass # now we really want to know where it is, so interpolate: if len(good) > 0: ds['depth'].values = np.interp( - np.arange(len(ds.depth)), good, ds['depth'].values[good]) - - attr = {'source': 'pressure', 'long_name': 'glider depth', - 'standard_name': 'depth', 'units': 'm', - 'comment': 'from science pressure and interpolated', - 'instrument': 'instrument_ctd', - 'observation_type': 'calulated', - 'accuracy': 1.0, - 'precision': 2.0, 'resolution': 0.02, - 'platform': 'platform', - 'valid_min': 0.0, 'valid_max': 2000.0, - 'reference_datum': 'surface', 'positive': 'down'} + np.arange(len(ds.depth)), good, ds['depth'].values[good] + ) + + attr = { + 'source': 'pressure', + 'long_name': 'glider depth', + 'standard_name': 'depth', + 'units': 'm', + 'comment': 'from science pressure and interpolated', + 'instrument': 'instrument_ctd', + 'observation_type': 'calulated', + 'accuracy': 1.0, + 'precision': 2.0, + 'resolution': 0.02, + 'platform': 'platform', + 'valid_min': 0.0, + 'valid_max': 2000.0, + 'reference_datum': 'surface', + 'positive': 'down', + } ds['depth'].attrs = attr return ds -def get_profiles(ds, min_dp=10.0, inversion=3., filt_length=7, - min_nsamples=14): +def get_profiles(ds, 
min_dp=10.0, inversion=3.0, filt_length=7, min_nsamples=14): """ Not currently used... @@ -98,7 +109,9 @@ def get_profiles(ds, min_dp=10.0, inversion=3., filt_length=7, is good for lots of data. Less good for sparse data """ if 'pressure' not in ds: - _log.warning('No "pressure" variable in the data set; not searching for profiles') + _log.warning( + 'No "pressure" variable in the data set; not searching for profiles' + ) return ds profile = ds.pressure.values * np.nan direction = ds.pressure.values * np.nan @@ -106,42 +119,47 @@ def get_profiles(ds, min_dp=10.0, inversion=3., filt_length=7, lastpronum = 0 good = np.where(~np.isnan(ds.pressure))[0] - p = np.convolve(ds.pressure.values[good], - np.ones(filt_length) / filt_length, 'same') + p = np.convolve( + ds.pressure.values[good], np.ones(filt_length) / filt_length, 'same' + ) dpall = np.diff(p) inflect = np.where(dpall[:-1] * dpall[1:] < 0)[0] for n, i in enumerate(inflect[:-1]): - nprofile = inflect[n+1] - inflect[n] - inds = np.arange(good[inflect[n]], good[inflect[n+1]]+1) + 1 + nprofile = inflect[n + 1] - inflect[n] + inds = np.arange(good[inflect[n]], good[inflect[n + 1]] + 1) + 1 dp = np.diff(ds.pressure[inds[[-1, 0]]]) - if ((nprofile >= min_nsamples) and (np.abs(dp) > 10)): + if (nprofile >= min_nsamples) and (np.abs(dp) > 10): _log.debug('Good') direction[inds] = np.sign(dp) profile[inds] = pronum lastpronum = pronum pronum += 1 else: - profile[good[inflect[n]]:good[inflect[n+1]]] = lastpronum + 0.5 - - attrs = collections.OrderedDict([ - ('long_name', 'profile index'), - ('units', '1'), - ('comment', - 'N = inside profile N, N + 0.5 = between profiles N and N + 1'), - ('sources', 'time pressure'), - ('method', 'get_profiles'), - ('min_dp', min_dp), - ('filt_length', filt_length), - ('min_nsamples', min_nsamples)]) + profile[good[inflect[n]] : good[inflect[n + 1]]] = lastpronum + 0.5 + + attrs = collections.OrderedDict( + [ + ('long_name', 'profile index'), + ('units', '1'), + ('comment', 'N = inside profile N, N + 0.5 = between profiles N and N + 1'), + ('sources', 'time pressure'), + ('method', 'get_profiles'), + ('min_dp', min_dp), + ('filt_length', filt_length), + ('min_nsamples', min_nsamples), + ] + ) ds['profile_index'] = (('time'), profile, attrs) - attrs = collections.OrderedDict([ - ('long_name', 'glider vertical speed direction'), - ('units', '1'), - ('comment', - '-1 = ascending, 0 = inflecting or stalled, 1 = descending'), - ('sources', 'time pressure'), - ('method', 'get_profiles')]) + attrs = collections.OrderedDict( + [ + ('long_name', 'glider vertical speed direction'), + ('units', '1'), + ('comment', '-1 = ascending, 0 = inflecting or stalled, 1 = descending'), + ('sources', 'time pressure'), + ('method', 'get_profiles'), + ] + ) ds['profile_direction'] = (('time'), direction, attrs) return ds @@ -166,7 +184,9 @@ def get_profiles_new(ds, min_dp=10.0, filt_time=100, profile_min_time=300): """ if 'pressure' not in ds: - _log.warning('No "pressure" variable in the data set; not searching for profiles') + _log.warning( + 'No "pressure" variable in the data set; not searching for profiles' + ) return ds profile = ds.pressure.values * 0 @@ -174,16 +194,18 @@ def get_profiles_new(ds, min_dp=10.0, filt_time=100, profile_min_time=300): pronum = 1 good = np.where(np.isfinite(ds.pressure))[0] - dt = float(np.median( - np.diff(ds.time.values[good[:200000]]).astype(np.float64)) * 1e-9) + dt = float( + np.median(np.diff(ds.time.values[good[:200000]]).astype(np.float64)) * 1e-9 + ) _log.info(f'dt, {dt}') filt_length = 
int(filt_time / dt) min_nsamples = int(profile_min_time / dt) _log.info('Filt Len %d, dt %f, min_n %d', filt_length, dt, min_nsamples) if filt_length > 1: - p = np.convolve(ds.pressure.values[good], - np.ones(filt_length) / filt_length, 'same') + p = np.convolve( + ds.pressure.values[good], np.ones(filt_length) / filt_length, 'same' + ) else: p = ds.pressure.values[good] decim = int(filt_length / 3) @@ -214,11 +236,12 @@ def get_profiles_new(ds, min_dp=10.0, filt_time=100, profile_min_time=300): else: break _log.debug(nmax) - ins = range(int(mins[nmin]), int(maxs[nmax]+1)) + ins = range(int(mins[nmin]), int(maxs[nmax] + 1)) _log.debug(f'{pronum}, {ins}, {len(p)}, {mins[nmin]}, {maxs[nmax]}') _log.debug(f'Down, {ins}, {p[ins[0]].values},{p[ins[-1]].values}') - if ((len(ins) > min_nsamples) and - (np.nanmax(p[ins]) - np.nanmin(p[ins]) > min_dp)): + if (len(ins) > min_nsamples) and ( + np.nanmax(p[ins]) - np.nanmin(p[ins]) > min_dp + ): profile[ins] = pronum direction[ins] = +1 pronum += 1 @@ -230,32 +253,37 @@ def get_profiles_new(ds, min_dp=10.0, filt_time=100, profile_min_time=300): ins = range(maxs[nmax], mins[nmin]) _log.debug(f'{pronum}, {ins}, {len(p)}, {mins[nmin]}, {maxs[nmax]}') _log.debug(f'Up, {ins}, {p[ins[0]].values}, {p[ins[-1]].values}') - if ((len(ins) > min_nsamples) and - (np.nanmax(p[ins]) - np.nanmin(p[ins]) > min_dp)): + if (len(ins) > min_nsamples) and ( + np.nanmax(p[ins]) - np.nanmin(p[ins]) > min_dp + ): # up profile[ins] = pronum direction[ins] = -1 pronum += 1 - attrs = collections.OrderedDict([ - ('long_name', 'profile index'), - ('units', '1'), - ('comment', - 'N = inside profile N, N + 0.5 = between profiles N and N + 1'), - ('sources', 'time pressure'), - ('method', 'get_profiles_new'), - ('min_dp', min_dp), - ('filt_length', filt_length), - ('min_nsamples', min_nsamples)]) + attrs = collections.OrderedDict( + [ + ('long_name', 'profile index'), + ('units', '1'), + ('comment', 'N = inside profile N, N + 0.5 = between profiles N and N + 1'), + ('sources', 'time pressure'), + ('method', 'get_profiles_new'), + ('min_dp', min_dp), + ('filt_length', filt_length), + ('min_nsamples', min_nsamples), + ] + ) ds['profile_index'] = (('time'), profile, attrs) - attrs = collections.OrderedDict([ - ('long_name', 'glider vertical speed direction'), - ('units', '1'), - ('comment', - '-1 = ascending, 0 = inflecting or stalled, 1 = descending'), - ('sources', 'time pressure'), - ('method', 'get_profiles_new')]) + attrs = collections.OrderedDict( + [ + ('long_name', 'glider vertical speed direction'), + ('units', '1'), + ('comment', '-1 = ascending, 0 = inflecting or stalled, 1 = descending'), + ('sources', 'time pressure'), + ('method', 'get_profiles_new'), + ] + ) ds['profile_direction'] = (('time'), direction, attrs) return ds @@ -304,25 +332,32 @@ def get_derived_eos_raw(ds): elif 'mS cm' in ds.conductivity.units: r = ds.conductivity else: - raise ValueError("Could not parse conductivity units in yaml. " - "Expected 'S m-1' or 'mS cm-1'. " - "Check yaml entry netcdf_variables: conductivity: units") + raise ValueError( + 'Could not parse conductivity units in yaml. ' + "Expected 'S m-1' or 'mS cm-1'. 
" + 'Check yaml entry netcdf_variables: conductivity: units' + ) ds['salinity'] = ( - ('time'), gsw.conversions.SP_from_C(r, ds.temperature, ds.pressure).values) - attrs = collections.OrderedDict([ - ('long_name', 'water salinity'), - ('standard_name', 'sea_water_practical_salinity'), - ('units', '1e-3'), - ('comment', 'raw, uncorrected salinity'), - ('sources', 'conductivity temperature pressure'), - ('method', 'get_derived_eos_raw'), - ('observation_type', 'calulated'), - ('instrument', 'instrument_ctd'), - ('valid_max', 40.0), - ('valid_min', 0.0), - ('accuracy', 0.01), - ('precision', 0.01), - ('resolution', 0.001)]) + ('time'), + gsw.conversions.SP_from_C(r, ds.temperature, ds.pressure).values, + ) + attrs = collections.OrderedDict( + [ + ('long_name', 'water salinity'), + ('standard_name', 'sea_water_practical_salinity'), + ('units', '1e-3'), + ('comment', 'raw, uncorrected salinity'), + ('sources', 'conductivity temperature pressure'), + ('method', 'get_derived_eos_raw'), + ('observation_type', 'calulated'), + ('instrument', 'instrument_ctd'), + ('valid_max', 40.0), + ('valid_min', 0.0), + ('accuracy', 0.01), + ('precision', 0.01), + ('resolution', 0.001), + ] + ) attrs = fill_required_attrs(attrs) ds['salinity'].attrs = attrs long = ds.longitude.fillna(ds.longitude.mean(skipna=True)) @@ -330,56 +365,66 @@ def get_derived_eos_raw(ds): sa = gsw.SA_from_SP(ds['salinity'], ds['pressure'], long, lat) ct = gsw.CT_from_t(sa, ds['temperature'], ds['pressure']) ds['potential_density'] = (('time'), 1000 + gsw.density.sigma0(sa, ct).values) - attrs = collections.OrderedDict([ - ('long_name', 'water potential density'), - ('standard_name', 'sea_water_potential_density'), - ('units', 'kg m-3'), - ('comment', 'raw, uncorrected salinity'), - ('sources', 'salinity temperature pressure'), - ('method', 'get_derived_eos_raw'), - ('observation_type', 'calulated'), - ('instrument', 'instrument_ctd'), - ('accuracy', 0.01), - ('precision', 0.01), - ('resolution', 0.001) - ]) + attrs = collections.OrderedDict( + [ + ('long_name', 'water potential density'), + ('standard_name', 'sea_water_potential_density'), + ('units', 'kg m-3'), + ('comment', 'raw, uncorrected salinity'), + ('sources', 'salinity temperature pressure'), + ('method', 'get_derived_eos_raw'), + ('observation_type', 'calulated'), + ('instrument', 'instrument_ctd'), + ('accuracy', 0.01), + ('precision', 0.01), + ('resolution', 0.001), + ] + ) attrs = fill_required_attrs(attrs) ds['potential_density'].attrs = attrs - ds['density'] = (('time'), gsw.density.rho( - ds.salinity, ds.temperature, ds.pressure).values) - attrs = collections.OrderedDict([ - ('long_name', 'Density'), - ('standard_name', 'sea_water_density'), - ('units', 'kg m-3'), - ('comment', 'raw, uncorrected salinity'), - ('observation_type', 'calulated'), - ('sources', 'salinity temperature pressure'), - ('instrument', 'instrument_ctd'), - ('method', 'get_derived_eos_raw'), - ('valid_min', 990.0), - ('valid_max', 1040.0), - ('accuracy', 0.01), - ('precision', 0.01), - ('resolution', 0.001) - ]) + ds['density'] = ( + ('time'), + gsw.density.rho(ds.salinity, ds.temperature, ds.pressure).values, + ) + attrs = collections.OrderedDict( + [ + ('long_name', 'Density'), + ('standard_name', 'sea_water_density'), + ('units', 'kg m-3'), + ('comment', 'raw, uncorrected salinity'), + ('observation_type', 'calulated'), + ('sources', 'salinity temperature pressure'), + ('instrument', 'instrument_ctd'), + ('method', 'get_derived_eos_raw'), + ('valid_min', 990.0), + ('valid_max', 1040.0), + 
('accuracy', 0.01), + ('precision', 0.01), + ('resolution', 0.001), + ] + ) attrs = fill_required_attrs(attrs) ds['density'].attrs = attrs - ds['potential_temperature'] = (('time'), gsw.conversions.pt0_from_t( - ds.salinity, ds.temperature, ds.pressure).values) - attrs = collections.OrderedDict([ - ('long_name', 'water potential temperature'), - ('standard_name', 'sea_water_potential_temperature'), - ('units', 'Celsius'), - ('comment', 'raw, uncorrected salinity'), - ('sources', 'salinity temperature pressure'), - ('observation_type', 'calulated'), - ('method', 'get_derived_eos_raw'), - ('instrument', 'instrument_ctd'), - ('accuracy', 0.002), - ('precision', 0.001), - ('resolution', 0.0001) - ]) + ds['potential_temperature'] = ( + ('time'), + gsw.conversions.pt0_from_t(ds.salinity, ds.temperature, ds.pressure).values, + ) + attrs = collections.OrderedDict( + [ + ('long_name', 'water potential temperature'), + ('standard_name', 'sea_water_potential_temperature'), + ('units', 'Celsius'), + ('comment', 'raw, uncorrected salinity'), + ('sources', 'salinity temperature pressure'), + ('observation_type', 'calulated'), + ('method', 'get_derived_eos_raw'), + ('instrument', 'instrument_ctd'), + ('accuracy', 0.002), + ('precision', 0.001), + ('resolution', 0.0001), + ] + ) attrs = fill_required_attrs(attrs) ds['potential_temperature'].attrs = attrs @@ -391,36 +436,36 @@ def _time_to_datetime64(time): Pass in a glider undecoded time (seconds since 1970-01-01), and get a np.datetime64[s] object back. """ - return (time.astype('timedelta64[s]') + - np.datetime64('1970-01-01')) + return time.astype('timedelta64[s]') + np.datetime64('1970-01-01') def fill_required_attrs(attrs): required = { - 'comment': " ", - 'accuracy': " ", - 'precision': " ", - 'platform': "platform", - 'resolution': " ", - 'ancillary_variables': " "} + 'comment': ' ', + 'accuracy': ' ', + 'precision': ' ', + 'platform': 'platform', + 'resolution': ' ', + 'ancillary_variables': ' ', + } for k in required.keys(): - if not (k in attrs.keys()): + if k not in attrs.keys(): attrs[k] = required[k] return attrs def fill_required_qcattrs(attrs, varname): required = { - "units": "1", - "flag_values": np.array([1, 2, 3, 4, 9], dtype=np.int8), - "valid_min": np.int8(1), - "valid_max": np.int8(9), - "flag_meanings": "PASS NOT_EVALUATED SUSPECT FAIL MISSING", - "standard_name": "quality_flag", - "long_name": "Initial flag for {varname}" + 'units': '1', + 'flag_values': np.array([1, 2, 3, 4, 9], dtype=np.int8), + 'valid_min': np.int8(1), + 'valid_max': np.int8(9), + 'flag_meanings': 'PASS NOT_EVALUATED SUSPECT FAIL MISSING', + 'standard_name': 'quality_flag', + 'long_name': 'Initial flag for {varname}', } for k in required.keys(): - if not (k in attrs.keys()): + if k not in attrs.keys(): attrs[k] = required[k] return attrs @@ -448,8 +493,12 @@ def get_file_id(ds): else: dt = ds.time.values[0].astype('datetime64[s]') _log.debug(f'dt, {dt}') - id = (ds.attrs['glider_name'] + ds.attrs['glider_serial'] + '-' + - dt.item().strftime('%Y%m%dT%H%M')) + id = ( + ds.attrs['glider_name'] + + ds.attrs['glider_serial'] + + '-' + + dt.item().strftime('%Y%m%dT%H%M') + ) return id @@ -498,7 +547,7 @@ def fill_metadata(ds, metadata, sensor_data): ds.attrs['standard_name_vocabulary'] = 'CF STandard Name Table v72' ds.attrs['date_created'] = str(np.datetime64('now')) + 'Z' ds.attrs['date_issued'] = str(np.datetime64('now')) + 'Z' - ds.attrs['date_modified'] = " " + ds.attrs['date_modified'] = ' ' ds.attrs['id'] = get_file_id(ds) ds.attrs['deployment_name'] = 
metadata['deployment_name'] ds.attrs['title'] = ds.attrs['id'] @@ -507,8 +556,9 @@ def fill_metadata(ds, metadata, sensor_data): ds.attrs['time_coverage_start'] = '%s' % dt[0] ds.attrs['time_coverage_end'] = '%s' % dt[-1] - ds.attrs['processing_level'] = ('Level 0 (L0) processed data timeseries; ' - 'no corrections or data screening') + ds.attrs['processing_level'] = ( + 'Level 0 (L0) processed data timeseries; ' 'no corrections or data screening' + ) for k, v in sensor_data.items(): ds.attrs[k] = str(v) @@ -526,8 +576,7 @@ def nmea2deg(nmea): """ Convert a NMEA float to a decimal degree float. e.g. -12640.3232 = -126.6721 """ - deg = (np.fix(nmea / 100) + - np.sign(nmea) * np.remainder(np.abs(nmea), 100) / 60) + deg = np.fix(nmea / 100) + np.sign(nmea) * np.remainder(np.abs(nmea), 100) / 60 return deg @@ -553,8 +602,10 @@ def oxygen_concentration_correction(ds, ncvar): oxy_yaml = ncvar['oxygen_concentration'] if 'reference_salinity' not in oxy_yaml.keys(): - _log.warning('No reference_salinity found in oxygen deployment yaml. ' - 'Assuming reference salinity of 0 psu') + _log.warning( + 'No reference_salinity found in oxygen deployment yaml. ' + 'Assuming reference salinity of 0 psu' + ) ref_sal = 0 else: ref_sal = float(oxy_yaml['reference_salinity']) @@ -562,18 +613,22 @@ def oxygen_concentration_correction(ds, ncvar): ds_oxy = ds.oxygen_concentration[~np.isnan(ds.oxygen_concentration)] # Match the nearest temperature and salinity values from their timestamps ds_temp = ds.potential_temperature[~np.isnan(ds.potential_temperature)].reindex( - time=ds_oxy.time, method="nearest") + time=ds_oxy.time, method='nearest' + ) ds_sal = ds.salinity[~np.isnan(ds.salinity)].reindex( - time=ds_oxy.time, method="nearest") + time=ds_oxy.time, method='nearest' + ) o2_sol = gsw.O2sol_SP_pt(ds_sal, ds_temp) - o2_sat = ds_oxy / gsw.O2sol_SP_pt(ds_sal*0 + ref_sal, ds_temp) + o2_sat = ds_oxy / gsw.O2sol_SP_pt(ds_sal * 0 + ref_sal, ds_temp) ds['oxygen_concentration'].values[~np.isnan(ds.oxygen_concentration)] = ( - o2_sat * o2_sol) + o2_sat * o2_sol + ) ds['oxygen_concentration'].attrs['oxygen_concentration_QC:RTQC_methodology'] = ( f'oxygen concentration corrected for salinity using gsw.O2sol_SP_pt with' f'salinity and potential temperature from dataset. Original oxygen ' f'concentration assumed to have been calculated using salinity = ' - f'{ref_sal} PSU') + f'{ref_sal} PSU' + ) return ds @@ -605,10 +660,13 @@ def gappy_fill_vertical(data): m, n = np.shape(data) for j in range(n): ind = np.where(~np.isnan(data[:, j]))[0] - if (len(ind) > 0 and len(ind) < (ind[-1] - ind[0]) - and len(ind) > (ind[-1] - ind[0]) * 0.05): + if ( + len(ind) > 0 + and len(ind) < (ind[-1] - ind[0]) + and len(ind) > (ind[-1] - ind[0]) * 0.05 + ): int = np.arange(ind[0], ind[-1]) - data[:, j][ind[0]:ind[-1]] = np.interp(int, ind, data[ind, j]) + data[:, j][ind[0] : ind[-1]] = np.interp(int, ind, data[ind, j]) return data @@ -619,7 +677,7 @@ def find_gaps(sample_time, timebase, maxgap): """ # figure out which sample each time in time base belongs to: time_index = np.searchsorted(sample_time, timebase, side='right') - time_index = np.clip(time_index, 0, len(sample_time)-1) + time_index = np.clip(time_index, 0, len(sample_time) - 1) # figure out the space between sample pairs dt = np.concatenate(([0], np.diff(sample_time))) @@ -628,7 +686,7 @@ def find_gaps(sample_time, timebase, maxgap): # get the indices of timebase that are too large and account for the # degenerate case when a timebase point falls directly on a sample time. 
- index = ~np.logical_or((ddt <= maxgap), (np.isin(timebase,sample_time))) + index = ~np.logical_or((ddt <= maxgap), (np.isin(timebase, sample_time))) return index @@ -642,8 +700,9 @@ def _parse_gliderxml_pos(fname): xmln = fname from bs4 import BeautifulSoup + with open(xmln, 'r') as fin: - y = BeautifulSoup(fin, features="xml") + y = BeautifulSoup(fin, features='xml') time = None lon = None lat = None @@ -675,8 +734,9 @@ def _parse_gliderxml_surfacedatetime(fname): xmln = fname from bs4 import BeautifulSoup + with open(xmln, 'r') as fin: - y = BeautifulSoup(fin, features="xml") + y = BeautifulSoup(fin, features='xml') time = None for a in y.find_all('report'): _log.debug(a) @@ -688,10 +748,14 @@ def _parse_gliderxml_surfacedatetime(fname): return time -def example_gridplot(filename, outname, - toplot=['potential_temperature', 'salinity', - 'oxygen_concentration'], - pdenlevels=np.arange(10, 30, 0.5), dpi=200, ylim=None): +def example_gridplot( + filename, + outname, + toplot=['potential_temperature', 'salinity', 'oxygen_concentration'], + pdenlevels=np.arange(10, 30, 0.5), + dpi=200, + ylim=None, +): """ Smoketest for plotting the netcdf files. """ @@ -700,12 +764,16 @@ def example_gridplot(filename, outname, ntoplot = len(toplot) with xr.open_dataset(filename) as ds: - fig, axs = plt.subplots(nrows=ntoplot, constrained_layout=True, - figsize=(7, 3*ntoplot), - sharex=True, sharey=True) + fig, axs = plt.subplots( + nrows=ntoplot, + constrained_layout=True, + figsize=(7, 3 * ntoplot), + sharex=True, + sharey=True, + ) for ax, vname in zip(axs, toplot): ds[vname].plot.pcolormesh(ax=ax) - (ds['potential_density']-1000).plot.contour(ax=ax, levels=pdenlevels) + (ds['potential_density'] - 1000).plot.contour(ax=ax, levels=pdenlevels) if ylim: ax.set_ylim(ylim) fig.savefig(outname, dpi=dpi) @@ -718,7 +786,9 @@ def _get_deployment(deploymentyaml): previous files. 
""" if isinstance(deploymentyaml, str): - deploymentyaml = [deploymentyaml,] + deploymentyaml = [ + deploymentyaml, + ] deployment = {} for nn, d in enumerate(deploymentyaml): with open(d) as fin: @@ -763,6 +833,13 @@ def _get_glider_name_slocum(current_directory): return glider, mission, slocum_glider -__all__ = ['get_distance_over_ground', 'get_glider_depth', 'get_profiles_new', - 'get_derived_eos_raw', "fill_metadata", "nmea2deg", - "gappy_fill_vertical", "oxygen_concentration_correction"] +__all__ = [ + 'get_distance_over_ground', + 'get_glider_depth', + 'get_profiles_new', + 'get_derived_eos_raw', + 'fill_metadata', + 'nmea2deg', + 'gappy_fill_vertical', + 'oxygen_concentration_correction', +] diff --git a/setup.py b/setup.py index 6f7ee00..32ea70f 100644 --- a/setup.py +++ b/setup.py @@ -1,32 +1,33 @@ -from setuptools import setup, find_packages +from setuptools import find_packages, setup + from pyglider._version import __version__ -setup(name="pyglider", - version=__version__, - description="Glider data to netCDF translation in python", - author="Jody Klymak", - author_email="jklymak@gmail.com", - url="https://pyglider.readthedocs.io", - packages=find_packages(exclude=['tests']), - python_requires='>=3.6', - install_requires=[ - "xarray", - "dask", - "netcdf4", - "gsw", - "scipy", - "bitstring", - "pooch", - "polars" - ], - license='Apache', - extras_require={ - "code_style": ["flake8<3.8.0,>=3.7.0", "black", "pre-commit==1.17.0"], - "testing": ["pytest", "pytest-cov"], - "docs": ["pydata-sphinx-theme", "numpydoc", "autoapi", "myst-parser", - "sphinx"] - }, - zip_safe=True, - long_description=open('README.md').read(), - long_description_content_type='text/markdown', - ) +setup( + name='pyglider', + version=__version__, + description='Glider data to netCDF translation in python', + author='Jody Klymak', + author_email='jklymak@gmail.com', + url='https://pyglider.readthedocs.io', + packages=find_packages(exclude=['tests']), + python_requires='>=3.6', + install_requires=[ + 'xarray', + 'dask', + 'netcdf4', + 'gsw', + 'scipy', + 'bitstring', + 'pooch', + 'polars', + ], + license='Apache', + extras_require={ + 'code_style': ['flake8<3.8.0,>=3.7.0', 'black', 'pre-commit==1.17.0'], + 'testing': ['pytest', 'pytest-cov'], + 'docs': ['pydata-sphinx-theme', 'numpydoc', 'autoapi', 'myst-parser', 'sphinx'], + }, + zip_safe=True, + long_description=open('README.md').read(), + long_description_content_type='text/markdown', +) diff --git a/tests-requirements.txt b/tests-requirements.txt index c31eb3c..161fdbb 100644 --- a/tests-requirements.txt +++ b/tests-requirements.txt @@ -8,4 +8,4 @@ netcdf4 gsw scipy bitstring -pooch \ No newline at end of file +pooch diff --git a/tests/_copyresultstoexpected.py b/tests/_copyresultstoexpected.py index 7dd1b77..02a1de2 100644 --- a/tests/_copyresultstoexpected.py +++ b/tests/_copyresultstoexpected.py @@ -1,17 +1,15 @@ """ If we run the tests and decide we want all the test results to be copied over, this is faster... 
""" -from pathlib import Path -from shutil import copy -todo = {'example-data/example-seaexplorer/L0-timeseries-test/dfo-eva035-20190718.nc': - 'expected/example-seaexplorer/L0-timeseries', - 'example-data/example-seaexplorer-raw/L0-timeseries-test/dfo-bb046-20200908.nc': 'expected/example-seaexplorer-raw/L0-timeseries', - 'example-data/example-slocum/L0-timeseries/dfo-rosie713-20190615.nc': 'expected/example-slocum/L0-timeseries', - 'example-data/example-slocum-littleendian/L0-timeseries-test/dfo-maria997-20220614.nc': 'expected/example-slocum-littleendian/L0-timeseries' +from shutil import copy - } +todo = { + 'example-data/example-seaexplorer/L0-timeseries-test/dfo-eva035-20190718.nc': 'expected/example-seaexplorer/L0-timeseries', + 'example-data/example-seaexplorer-raw/L0-timeseries-test/dfo-bb046-20200908.nc': 'expected/example-seaexplorer-raw/L0-timeseries', + 'example-data/example-slocum/L0-timeseries/dfo-rosie713-20190615.nc': 'expected/example-slocum/L0-timeseries', + 'example-data/example-slocum-littleendian/L0-timeseries-test/dfo-maria997-20220614.nc': 'expected/example-slocum-littleendian/L0-timeseries', +} for td in todo: copy(td, todo[td]) - diff --git a/tests/conftest.py b/tests/conftest.py index 0e9932c..83dd2f8 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -2,6 +2,7 @@ # # library_dir = Path(__file__).parent.parent.absolute() + def pytest_sessionstart(session): pass # os.mkdir(library_dir / '_example_data') @@ -9,4 +10,4 @@ def pytest_sessionstart(session): def pytest_sessionfinish(session, exitstatus): - pass \ No newline at end of file + pass diff --git a/tests/environment.yml b/tests/environment.yml index c3e117b..cde877e 100644 --- a/tests/environment.yml +++ b/tests/environment.yml @@ -19,4 +19,4 @@ dependencies: - compliance-checker - cc-plugin-glider - pip: - - dbdreader + - dbdreader diff --git a/tests/test_pyglider.py b/tests/test_pyglider.py index 0c003bd..f552d12 100644 --- a/tests/test_pyglider.py +++ b/tests/test_pyglider.py @@ -1,12 +1,11 @@ -import xarray as xr from pathlib import Path -import pytest + import numpy as np +import pytest +import xarray as xr import yaml -import pyglider.ncprocess as ncprocess import pyglider.seaexplorer as seaexplorer -import pyglider.slocum as slocum library_dir = Path(__file__).parent.parent.absolute() example_dir = library_dir / 'tests/example-data/' @@ -19,13 +18,13 @@ l0tsdir = str(example_dir / 'example-seaexplorer/L0-timeseries-test/') + '/' seaexplorer.raw_to_rawnc(rawdir, rawncdir, deploymentyaml) seaexplorer.merge_parquet(rawncdir, rawncdir, deploymentyaml, kind='sub') -outname = seaexplorer.raw_to_L0timeseries(rawncdir, l0tsdir, - deploymentyaml, kind='sub') +outname = seaexplorer.raw_to_L0timeseries(rawncdir, l0tsdir, deploymentyaml, kind='sub') output = xr.open_dataset(outname) # Open test data file test_data = xr.open_dataset( - library_dir / - 'tests/expected/example-seaexplorer/L0-timeseries/dfo-eva035-20190718.nc') + library_dir + / 'tests/expected/example-seaexplorer/L0-timeseries/dfo-eva035-20190718.nc' +) variables = list(output.variables) @@ -36,7 +35,7 @@ def test_variables_seaexplorer(): assert variables == test_variables -@pytest.mark.parametrize("var", variables) +@pytest.mark.parametrize('var', variables) def test_example_seaexplorer(var): # Test that each variable and its coordinates match assert output[var].attrs == test_data[var].attrs @@ -46,8 +45,8 @@ def test_example_seaexplorer(var): dt0 = output[var].values - np.datetime64('2000-01-01') dt1 = test_data[var].values - 
np.datetime64('2000-01-01') assert np.allclose( - np.array(dt0, dtype='float64'), - np.array(dt1, dtype='float64')) + np.array(dt0, dtype='float64'), np.array(dt1, dtype='float64') + ) def test_example_seaexplorer_metadata(): @@ -63,25 +62,32 @@ def test_example_seaexplorer_metadata(): with open(deploymentyaml) as fin: deployment = yaml.safe_load(fin) interp_yaml = str(example_dir / 'example-seaexplorer/deploymentRealtimeInterp.yml') -deployment['netcdf_variables']["interpolate"] = True -with open(interp_yaml, "w") as fout: +deployment['netcdf_variables']['interpolate'] = True +with open(interp_yaml, 'w') as fout: yaml.dump(deployment, fout) -l0tsdir_interp = str(example_dir / 'example-seaexplorer/L0-timeseries-test-interp/') + '/' +l0tsdir_interp = ( + str(example_dir / 'example-seaexplorer/L0-timeseries-test-interp/') + '/' +) -outname_interp = seaexplorer.raw_to_L0timeseries(rawncdir, l0tsdir_interp, interp_yaml, kind='sub') +outname_interp = seaexplorer.raw_to_L0timeseries( + rawncdir, l0tsdir_interp, interp_yaml, kind='sub' +) output_interp = xr.open_dataset(outname_interp) -@pytest.mark.parametrize("var", variables) + +@pytest.mark.parametrize('var', variables) def test_example_seaexplorer_interp_nrt(var): assert output_interp[var].attrs == test_data[var].attrs if var not in ['time']: - np.testing.assert_allclose(output_interp[var].values, test_data[var].values, rtol=1e-5) + np.testing.assert_allclose( + output_interp[var].values, test_data[var].values, rtol=1e-5 + ) else: dt0 = output_interp[var].values - np.datetime64('2000-01-01') dt1 = test_data[var].values - np.datetime64('2000-01-01') assert np.allclose( - np.array(dt0, dtype='float64'), - np.array(dt1, dtype='float64')) + np.array(dt0, dtype='float64'), np.array(dt1, dtype='float64') + ) # Test raw (full resolution) seaexplorer data. 
@@ -91,26 +97,31 @@ def test_example_seaexplorer_interp_nrt(var): l0tsdir = str(example_dir / 'example-seaexplorer-raw/L0-timeseries-test/') + '/' seaexplorer.raw_to_rawnc(rawdir, rawncdir, deploymentyaml_raw) seaexplorer.merge_parquet(rawncdir, rawncdir, deploymentyaml_raw, kind='raw') -outname_raw = seaexplorer.raw_to_L0timeseries(rawncdir, l0tsdir, - deploymentyaml_raw, kind='raw') +outname_raw = seaexplorer.raw_to_L0timeseries( + rawncdir, l0tsdir, deploymentyaml_raw, kind='raw' +) output_raw = xr.open_dataset(outname_raw) # Open test data file test_data_raw = xr.open_dataset( - library_dir / - 'tests/expected/example-seaexplorer-raw/L0-timeseries/dfo-bb046-20200908.nc') + library_dir + / 'tests/expected/example-seaexplorer-raw/L0-timeseries/dfo-bb046-20200908.nc' +) -@pytest.mark.parametrize("var", variables) + +@pytest.mark.parametrize('var', variables) def test_example_seaexplorer_raw(var): # Test that each variable and its coordinates match assert output_raw[var].attrs == test_data_raw[var].attrs if var not in ['time']: - np.testing.assert_allclose(output_raw[var].values, test_data_raw[var].values, rtol=1e-5) + np.testing.assert_allclose( + output_raw[var].values, test_data_raw[var].values, rtol=1e-5 + ) else: dt0 = output_raw[var].values - np.datetime64('2000-01-01') dt1 = test_data_raw[var].values - np.datetime64('2000-01-01') assert np.allclose( - np.array(dt0, dtype='float64'), - np.array(dt1, dtype='float64')) + np.array(dt0, dtype='float64'), np.array(dt1, dtype='float64') + ) def test_example_seaexplorer_metadata_raw(): @@ -128,23 +139,29 @@ def test_example_seaexplorer_metadata_raw(): with open(deploymentyaml_raw) as fin: deployment_raw = yaml.safe_load(fin) interp_yaml = str(example_dir / 'example-seaexplorer-raw/deploymentDelayedInterp.yml') -deployment_raw['netcdf_variables']["interpolate"] = True -with open(interp_yaml, "w") as fout: +deployment_raw['netcdf_variables']['interpolate'] = True +with open(interp_yaml, 'w') as fout: yaml.dump(deployment_raw, fout) -l0tsdir_interp_raw = str(example_dir / 'example-seaexplorer-raw/L0-timeseries-test-interp/') + '/' +l0tsdir_interp_raw = ( + str(example_dir / 'example-seaexplorer-raw/L0-timeseries-test-interp/') + '/' +) -outname_interp_raw = seaexplorer.raw_to_L0timeseries(rawncdir, l0tsdir_interp_raw, interp_yaml, kind='raw') +outname_interp_raw = seaexplorer.raw_to_L0timeseries( + rawncdir, l0tsdir_interp_raw, interp_yaml, kind='raw' +) output_interp_raw = xr.open_dataset(outname_interp_raw) -@pytest.mark.parametrize("var", variables) + +@pytest.mark.parametrize('var', variables) def test_example_seaexplorer_interp_raw(var): assert output_interp_raw[var].attrs == test_data_raw[var].attrs if var not in ['time']: - assert np.count_nonzero(~np.isnan(output_interp_raw[var].values)) >= \ - np.count_nonzero(~np.isnan(test_data_raw[var].values)) + assert np.count_nonzero( + ~np.isnan(output_interp_raw[var].values) + ) >= np.count_nonzero(~np.isnan(test_data_raw[var].values)) else: dt0 = output_interp_raw[var].values - np.datetime64('2000-01-01') dt1 = test_data_raw[var].values - np.datetime64('2000-01-01') assert np.allclose( - np.array(dt0, dtype='float64'), - np.array(dt1, dtype='float64')) + np.array(dt0, dtype='float64'), np.array(dt1, dtype='float64') + ) diff --git a/tests/test_seaexplorer.py b/tests/test_seaexplorer.py index 4eb3d94..c4b09f5 100644 --- a/tests/test_seaexplorer.py +++ b/tests/test_seaexplorer.py @@ -1,9 +1,11 @@ -import polars as pl +import os +from pathlib import Path + import numpy as np +import polars as pl 
import pytest -from pathlib import Path -import os import yaml + os.system('rm tests/data/realtime_rawnc/*') library_dir = Path(__file__).parent.parent.absolute() example_dir = library_dir / 'tests/example-data/' @@ -12,35 +14,42 @@ def test__outputname(): - fnout, filenum = seaexplorer._outputname('tests/data/realtime_raw/sea035.12.pld1.sub.36', - 'tests/data/realtime_rawnc/') + fnout, filenum = seaexplorer._outputname( + 'tests/data/realtime_raw/sea035.12.pld1.sub.36', 'tests/data/realtime_rawnc/' + ) assert fnout == 'tests/data/realtime_rawnc/sea035.0012.pld1.sub.0036.parquet' assert filenum == 36 def test_raw_to_rawnc(): # Default settings on a clean folder - result_default = seaexplorer.raw_to_rawnc('tests/data/realtime_raw/', - 'tests/data/realtime_rawnc/', None) + result_default = seaexplorer.raw_to_rawnc( + 'tests/data/realtime_raw/', 'tests/data/realtime_rawnc/', None + ) assert result_default is True # Test the reprocess flag works - result_reprocess = seaexplorer.raw_to_rawnc('tests/data/realtime_raw/', - 'tests/data/realtime_rawnc/', - None, incremental=False) + result_reprocess = seaexplorer.raw_to_rawnc( + 'tests/data/realtime_raw/', + 'tests/data/realtime_rawnc/', + None, + incremental=False, + ) assert result_reprocess is True # Check that reprocessing not preformed by default - result_no_new_files = seaexplorer.raw_to_rawnc('tests/data/realtime_raw/', - 'tests/data/realtime_rawnc/', - None) + result_no_new_files = seaexplorer.raw_to_rawnc( + 'tests/data/realtime_raw/', 'tests/data/realtime_rawnc/', None + ) assert result_no_new_files is False # Reject all payload files with fewer than 10000 lines - result_strict = seaexplorer.raw_to_rawnc('tests/data/realtime_raw/', - 'tests/data/realtime_rawnc/', - None, - incremental=False, - min_samples_in_file=10000) + result_strict = seaexplorer.raw_to_rawnc( + 'tests/data/realtime_raw/', + 'tests/data/realtime_rawnc/', + None, + incremental=False, + min_samples_in_file=10000, + ) assert result_strict is False @@ -56,32 +65,38 @@ def test__needsupdating(): def test_merge_rawnc(): result_default = seaexplorer.merge_parquet( - 'tests/data/realtime_rawnc/', - 'tests/data/realtime_rawnc/', - str(example_dir / 'example-seaexplorer/deploymentRealtime.yml')) + 'tests/data/realtime_rawnc/', + 'tests/data/realtime_rawnc/', + str(example_dir / 'example-seaexplorer/deploymentRealtime.yml'), + ) result_sub = seaexplorer.merge_parquet( - 'tests/data/realtime_rawnc/', - 'tests/data/realtime_rawnc/', - str(example_dir / 'example-seaexplorer/deploymentRealtime.yml'), - kind='sub') + 'tests/data/realtime_rawnc/', + 'tests/data/realtime_rawnc/', + str(example_dir / 'example-seaexplorer/deploymentRealtime.yml'), + kind='sub', + ) assert result_default is False assert result_sub is True def test__remove_fill_values(): # This should convert values equallling 9999 in the original df to nan - df_in = pl.read_parquet('tests/data/realtime_rawnc/sea035.0012.pld1.sub.0036.parquet') + df_in = pl.read_parquet( + 'tests/data/realtime_rawnc/sea035.0012.pld1.sub.0036.parquet' + ) df_out = seaexplorer._remove_fill_values(df_in) - assert (df_in.select("GPCTD_DOF").to_numpy()[:, 0] == 9999).all() - assert np.isnan(df_out.select("GPCTD_DOF").to_numpy()[:, 0]).all() + assert (df_in.select('GPCTD_DOF').to_numpy()[:, 0] == 9999).all() + assert np.isnan(df_out.select('GPCTD_DOF').to_numpy()[:, 0]).all() def test__interp_gli_to_pld(): # function should interpolate values from the glider dataset to sampling frequency of payload dataset - glider = 
pl.read_parquet('tests/data/realtime_rawnc/sea035.0012.gli.sub.0036.parquet') + glider = pl.read_parquet( + 'tests/data/realtime_rawnc/sea035.0012.gli.sub.0036.parquet' + ) ds = pl.read_parquet('tests/data/realtime_rawnc/sea035.0012.pld1.sub.0036.parquet') - val = glider.select("Pitch").to_numpy()[:, 0] + val = glider.select('Pitch').to_numpy()[:, 0] pitch_interp = seaexplorer._interp_gli_to_pld(glider, ds, val, None) assert len(pitch_interp) == ds.shape[0] @@ -89,14 +104,17 @@ def test__interp_gli_to_pld(): def test_raw_to_timeseries(): # Test default, will fail as we have sub data, not raw data with pytest.raises(FileNotFoundError) as missing_file_exc: - result_default = seaexplorer.raw_to_timeseries('tests/data/realtime_rawnc/', - 'tests/data/l0-profiles/', - str(example_dir / 'example-seaexplorer/deploymentRealtime.yml'), - ) - result_sub = seaexplorer.raw_to_timeseries('tests/data/realtime_rawnc/', - 'tests/data/l0-profiles/', - str(example_dir / 'example-seaexplorer/deploymentRealtime.yml'), - kind='sub') + result_default = seaexplorer.raw_to_timeseries( + 'tests/data/realtime_rawnc/', + 'tests/data/l0-profiles/', + str(example_dir / 'example-seaexplorer/deploymentRealtime.yml'), + ) + result_sub = seaexplorer.raw_to_timeseries( + 'tests/data/realtime_rawnc/', + 'tests/data/l0-profiles/', + str(example_dir / 'example-seaexplorer/deploymentRealtime.yml'), + kind='sub', + ) assert 'No such file or directory' in str(missing_file_exc) assert result_sub == 'tests/data/l0-profiles/dfo-eva035-20190718.nc' @@ -105,21 +123,25 @@ def test_missing_bad_timebase(): # Prepare yaml files with bad timebase and no timebase with open(example_dir / 'example-seaexplorer/deploymentRealtime.yml') as fin: deployment = yaml.safe_load(fin) - deployment['netcdf_variables']['timebase']['source'] = "non existing sensor" - with open(example_dir / 'example-seaexplorer/bad_timebase.yml', "w") as fin: + deployment['netcdf_variables']['timebase']['source'] = 'non existing sensor' + with open(example_dir / 'example-seaexplorer/bad_timebase.yml', 'w') as fin: yaml.dump(deployment, fin) deployment['netcdf_variables'].pop('timebase') - with open(example_dir / 'example-seaexplorer/no_timebase.yml', "w") as fin: + with open(example_dir / 'example-seaexplorer/no_timebase.yml', 'w') as fin: yaml.dump(deployment, fin) with pytest.raises(ValueError) as bad_timebase_exc: - result_bad_timebase = seaexplorer.raw_to_timeseries('tests/data/realtime_rawnc/', - 'tests/data/l0-profiles/', - str(example_dir / 'example-seaexplorer/bad_timebase.yml'), - kind='sub') + result_bad_timebase = seaexplorer.raw_to_timeseries( + 'tests/data/realtime_rawnc/', + 'tests/data/l0-profiles/', + str(example_dir / 'example-seaexplorer/bad_timebase.yml'), + kind='sub', + ) with pytest.raises(ValueError) as no_timebase_exc: - result_no_timebase = seaexplorer.raw_to_timeseries('tests/data/realtime_rawnc/', - 'tests/data/l0-profiles/', - str(example_dir / 'example-seaexplorer/no_timebase.yml'), - kind='sub') - assert "sensor not found in pld1 columns" in str(bad_timebase_exc) - assert "Must specify timebase" in str(no_timebase_exc) + result_no_timebase = seaexplorer.raw_to_timeseries( + 'tests/data/realtime_rawnc/', + 'tests/data/l0-profiles/', + str(example_dir / 'example-seaexplorer/no_timebase.yml'), + kind='sub', + ) + assert 'sensor not found in pld1 columns' in str(bad_timebase_exc) + assert 'Must specify timebase' in str(no_timebase_exc) diff --git a/tests/test_slocum.py b/tests/test_slocum.py index bc6beaa..565667a 100644 --- a/tests/test_slocum.py 
+++ b/tests/test_slocum.py
@@ -1,18 +1,14 @@
-import xarray as xr
-
-from compliance_checker.runner import ComplianceChecker, CheckSuite
 import json
 from pathlib import Path
-import pytest
+
 import numpy as np
-import yaml
+import pytest
+import xarray as xr
+from compliance_checker.runner import CheckSuite, ComplianceChecker
 
 import pyglider.ncprocess as ncprocess
-import pyglider.seaexplorer as seaexplorer
 import pyglider.slocum as slocum
-
-
 
 library_dir = Path(__file__).parent.parent.absolute()
 example_dir = library_dir / 'tests/example-data/'
@@ -26,24 +22,31 @@ tsdir = str(example_dir / 'example-slocum/L0-timeseries/') + '/'
 scisuffix = 'tbd'
 glidersuffix = 'sbd'
-profiledir = str(example_dir / 'example-slocum/L0-profiles/') 
+profiledir = str(example_dir / 'example-slocum/L0-profiles/')
 do_direct = True
 
 # This needs to get run every time the tests are run, so do at top level:
 # turn *.sbd and *.tbd into timeseries netcdf files
-outname_slocum = slocum.binary_to_timeseries(binarydir, cacdir, tsdir, deploymentyaml_slocum,
-                                             search='*.[s|t]bd', profile_filt_time=20,
-                                             profile_min_time=20)
+outname_slocum = slocum.binary_to_timeseries(
+    binarydir,
+    cacdir,
+    tsdir,
+    deploymentyaml_slocum,
+    search='*.[s|t]bd',
+    profile_filt_time=20,
+    profile_min_time=20,
+)
 # make profiles...
-ncprocess.extract_timeseries_profiles(outname_slocum, profiledir, deploymentyaml_slocum,
-                                      force=True)
+ncprocess.extract_timeseries_profiles(
+    outname_slocum, profiledir, deploymentyaml_slocum, force=True
+)
 
 output_slocum = xr.open_dataset(outname_slocum)
 
 # Open test data file
 test_data_slocum = xr.open_dataset(
-    library_dir /
-    'tests/expected/example-slocum/L0-timeseries/dfo-rosie713-20190615.nc')
+    library_dir / 'tests/expected/example-slocum/L0-timeseries/dfo-rosie713-20190615.nc'
+)
 
 variables_slocum = list(output_slocum.variables)
@@ -54,19 +57,20 @@ def test_variables_slocum():
     assert variables_slocum == test_variables
 
 
-@pytest.mark.parametrize("var", variables_slocum)
+@pytest.mark.parametrize('var', variables_slocum)
 def test_example_slocum(var):
     # Test that variables and coordinates match
     assert output_slocum[var].attrs == test_data_slocum[var].attrs
     if var not in ['time']:
-        np.testing.assert_allclose(output_slocum[var].values,
-                                   test_data_slocum[var].values, rtol=1e-6)
+        np.testing.assert_allclose(
+            output_slocum[var].values, test_data_slocum[var].values, rtol=1e-6
+        )
     else:
         dt0 = output_slocum[var].values - np.datetime64('2000-01-01')
         dt1 = test_data_slocum[var].values - np.datetime64('2000-01-01')
         assert np.allclose(
-            np.array(dt0, dtype='float64'),
-            np.array(dt1, dtype='float64'))
+            np.array(dt0, dtype='float64'), np.array(dt1, dtype='float64')
+        )
 
 
 def test_example_slocum_metadata():
@@ -81,6 +85,7 @@ def test_example_slocum_metadata():
 
 
 # test the profiles with compliance_checker...
+
 def test_profiles_compliant():
     # Load all available checker classes
     check_suite = CheckSuite()
@@ -104,12 +109,14 @@ def test_profiles_compliant():
     @returns If the tests failed (based on the criteria)
     """
-    return_value, errors = ComplianceChecker.run_checker(path,
-                                                         checker_names,
-                                                         verbose,
-                                                         criteria,
-                                                         output_filename=output_filename,
-                                                         output_format=output_format)
+    return_value, errors = ComplianceChecker.run_checker(
+        path,
+        checker_names,
+        verbose,
+        criteria,
+        output_filename=output_filename,
+        output_format=output_format,
+    )
     # Open the JSON output and get the compliance scores
     with open(output_filename, 'r') as fp:
         cc_data = json.load(fp)
@@ -146,12 +153,14 @@ def test_timeseries_compliant():
     @returns If the tests failed (based on the criteria)
     """
-    return_value, errors = ComplianceChecker.run_checker(path,
-                                                         checker_names,
-                                                         verbose,
-                                                         criteria,
-                                                         output_filename=output_filename,
-                                                         output_format=output_format)
+    return_value, errors = ComplianceChecker.run_checker(
+        path,
+        checker_names,
+        verbose,
+        criteria,
+        output_filename=output_filename,
+        output_format=output_format,
+    )
     # Open the JSON output and get the compliance scores
     with open(output_filename, 'r') as fp:
         cc_data = json.load(fp)
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 69439e4..0d3e935 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -1,17 +1,22 @@
-import xarray as xr
+from pathlib import Path
+
 import numpy as np
-import yaml
 import pytest
-from pathlib import Path
-import sys
+import xarray as xr
+import yaml
 
 import pyglider.utils as utils
 
 library_dir = Path(__file__).parent.parent.absolute()
 example_dir = library_dir / 'tests/example-data/'
 
-test_data = xr.open_dataset(library_dir / 'tests/expected/example-seaexplorer/L0-timeseries/dfo-eva035-20190718.nc')
-deploymentyaml = example_dir / 'example-seaexplorer-legato-flntu-arod-ad2cp/deploymentRealtime.yml'
+test_data = xr.open_dataset(
+    library_dir
+    / 'tests/expected/example-seaexplorer/L0-timeseries/dfo-eva035-20190718.nc'
+)
+deploymentyaml = (
+    example_dir / 'example-seaexplorer-legato-flntu-arod-ad2cp/deploymentRealtime.yml'
+)
 with open(deploymentyaml) as fin:
     deployment = yaml.safe_load(fin)
 metadata = deployment['metadata']
@@ -23,13 +28,14 @@ vars.remove('oxygen_concentration')
 
 
-@pytest.mark.parametrize("var", vars)
+@pytest.mark.parametrize('var', vars)
 def test_oxygen_concentration_correction_variables_unchanged(var):
     # Test that each variable and its coordinates match
     assert ds_out[var].equals(ds_unchanged[var])
 
 
 def test_oxygen_concentration_correction():
-    oxygen_difference = np.nanmean(ds_out['oxygen_concentration'].values) \
-        - np.nanmean(ds_unchanged['oxygen_concentration'].values)
+    oxygen_difference = np.nanmean(ds_out['oxygen_concentration'].values) - np.nanmean(
+        ds_unchanged['oxygen_concentration'].values
+    )
     assert np.abs(oxygen_difference + 32.328) < 1e-2