diff --git a/.zenodo.json b/.zenodo.json index b3adb6ae..eaf889dd 100644 --- a/.zenodo.json +++ b/.zenodo.json @@ -13,6 +13,9 @@ "name": "Rondeau-Genesse, Gabriel", "affiliation": "Ouranos, Montréal, Québec, Canada", "orcid": "0000-0003-3389-9406" + }, + { + "name": "Langlois, Sébastien" } ], "keywords": [ diff --git a/AUTHORS.rst b/AUTHORS.rst index 9acf256d..0e578c14 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -12,6 +12,7 @@ Co-Developers * Trevor James Smith `@Zeitsperre <https://github.com/Zeitsperre>`_ * Gabriel Rondeau-Genesse `@RondeauG <https://github.com/RondeauG>`_ +* Sébastien Langlois `@sebastienlanglois <https://github.com/sebastienlanglois>`_ Contributors ------------ diff --git a/HISTORY.rst b/HISTORY.rst index 2cb26211..3763f62a 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -4,17 +4,20 @@ History v0.2.0 (unreleased) ------------------- -Contributors to this version: Trevor James Smith (:user:`Zeitsperre`), Gabriel Rondeau-Genesse (:user:`RondeauG`). +Contributors to this version: Trevor James Smith (:user:`Zeitsperre`), Gabriel Rondeau-Genesse (:user:`RondeauG`), Thomas-Charles Fortier Filion (:user:`TC-FF`), Sébastien Langlois (:user:`sebastienlanglois`). Announcements ^^^^^^^^^^^^^ * Support for Python3.8 and lower has been dropped. (:pull:`11`). * `xHydro` now hosts its documentation on `Read the Docs <https://xhydro.readthedocs.io/>`_. (:issue:`22`, :pull:`26`). +* Local frequency analysis functions have been added under a new module `xhydro.frequency_analysis`. (:pull:`20`, :pull:`27`). New features and enhancements ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ * GitHub Workflows for automated testing using `tox` have been added. (:pull:`11`). * Support for various `xscen` functions has been added to compute indicators and various climate change metrics. (:pull:`21`). +* New function `xh.indicators.compute_volume` to convert streamflow data to volumes. (:pull:`20`, :pull:`27`). +* New function `xh.indicators.get_yearly_op` to compute block operations (e.g. block maxima or minima). (:pull:`20`, :pull:`27`). Breaking changes ^^^^^^^^^^^^^^^^ @@ -39,6 +42,7 @@ Internal changes * Deployments to TestPyPI and PyPI are now run using GitHub Workflow Environments as a safeguarding mechanism. (:pull:`28`). * Various cleanups of the environment files. (:issue:`23`, :pull:`30`). * `xhydro` now uses the trusted publishing mechanism for PyPI and TestPyPI deployment. (:pull:`32`). +* Added tests for the new `indicators` and `frequency_analysis` modules. (:pull:`27`). 0.1.2 (2023-05-10) ------------------ diff --git a/MANIFEST.in b/MANIFEST.in index 988f455d..3d696347 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -6,6 +6,7 @@ include README.rst include .zenodo.json recursive-include docs *.rst conf.py Makefile make.bat *.jpg *.png *.gif +recursive-include docs *.ipynb recursive-include tests * recursive-exclude * __pycache__ recursive-exclude * *.py[co] diff --git a/docs/index.rst b/docs/index.rst index 3f2923f5..a440446c 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -4,17 +4,19 @@ Welcome to xHydro's documentation! .. toctree:: :maxdepth: 2 :caption: Contents: + :hidden: readme installation usage + notebooks/local_frequency_analysis apidoc/modules contributing authors history -Indices and tables -================== -* :ref:`genindex` -* :ref:`modindex` -* :ref:`search` +.. Indices and tables +.. ================== +.. * :ref:`genindex` +.. * :ref:`modindex` +.. 
* :ref:`search` diff --git a/docs/installation.rst b/docs/installation.rst index 02b7458b..13bdfb09 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -15,7 +15,7 @@ To create a working environment and install xHydro, copy the `environment.yml` f This is the preferred method to install `xHydro`, as it will always install the most recent stable release. -If for some reason you wish to install the `PyPI` version of `xscen` into an existing Anaconda environment (*not recommended*), only run the last command above. +If for some reason you wish to install the `PyPI` version of `xhydro` into an existing Anaconda environment (*not recommended if requirements are not met*), only run the last command above. If you don't have `pip`_ installed, this `Python installation guide`_ can guide you through the process. diff --git a/docs/notebooks/local_frequency_analysis.ipynb b/docs/notebooks/local_frequency_analysis.ipynb new file mode 100644 index 00000000..e26ca457 --- /dev/null +++ b/docs/notebooks/local_frequency_analysis.ipynb @@ -0,0 +1,347 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Frequency analysis module" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Basic imports\n", + "import xarray as xr\n", + "import hvplot.xarray\n", + "import numpy as np\n", + "import xdatasets as xd\n", + "\n", + "import xhydro as xh\n", + "import xhydro.frequency_analysis as xhfa" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Extracting and preparing the data\n", + "For this example, we'll conduct a frequency analysis using historical time series from various sites. We begin by obtaining a dataset comprising hydrological information. Here, we use the [xdatasets](https://hydrologie.github.io/xdatasets/notebooks/getting_started.html) library to acquire hydrological data from the [Ministère de l'Environnement, de la Lutte contre les changements climatiques, de la Faune et des Parcs](https://www.cehq.gouv.qc.ca/atlas-hydroclimatique/stations-hydrometriques/index.htm) in Quebec, Canada. Specifically, our query targets stations whose IDs begin with `020`, whose flow regime is natural, and requests streamflow data only.\n", + "\n", + "Users may prefer to generate their own `xarray.DataArray` from their own data. At a minimum, the `xarray.DataArray` used for frequency analysis must follow these principles, as illustrated in the sketch below:\n", + "\n", + "- The dataset needs a `time` dimension.\n", + "- If there is a spatial dimension, such as `id` in the example below, it needs an attribute `cf_role` with `timeseries_id` as its value.\n", + "- The variable will at the very least need a `units` attribute, although other attributes such as `long_name` and `cell_methods` are also expected by `xclim` (which is called at various points during the frequency analysis) and warnings will be generated if they are missing.",
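+    "\n",
+    "As a minimal sketch (hypothetical station IDs and random values, for illustration only), such a DataArray could be built like this:\n",
+    "\n",
+    "```python\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import xarray as xr\n",
+    "\n",
+    "streamflow = xr.DataArray(\n",
+    "    np.random.default_rng(0).gamma(shape=2.0, scale=50.0, size=(2, 365)),\n",
+    "    dims=(\"id\", \"time\"),\n",
+    "    coords={\n",
+    "        \"id\": [\"fake_station_1\", \"fake_station_2\"],\n",
+    "        \"time\": pd.date_range(\"2000-01-01\", periods=365, freq=\"D\"),\n",
+    "    },\n",
+    "    attrs={\"long_name\": \"Streamflow\", \"units\": \"m3 s-1\", \"cell_methods\": \"time: mean\"},\n",
+    ")\n",
+    "streamflow[\"id\"].attrs[\"cf_role\"] = \"timeseries_id\"\n",
+    "```"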
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ds = xd.Query(\n", + " **{\n", + " \"datasets\":{\n", + " \"deh\":{\n", + " \"id\" :[\"020*\"],\n", + " \"regulated\":[\"Natural\"],\n", + " \"variables\":[\"streamflow\"],\n", + " }\n", + " }, \"time\":{\"start\": \"1970-01-01\", \n", + " \"minimum_duration\":(15*365, 'd')},\n", + "\n", + " }\n", + ").data.squeeze().load()\n", + "\n", + "# This dataset lacks some of the aforementioned attributes, so we need to add them.\n", + "ds[\"id\"].attrs[\"cf_role\"] = \"timeseries_id\"\n", + "ds[\"streamflow\"].attrs = {\"long_name\": \"Streamflow\", \"units\": \"m3 s-1\", \"standard_name\": \"water_volume_transport_in_river_channel\", \"cell_methods\": \"time: mean\"}\n", + "\n", + "ds" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ds.streamflow.dropna('time', 'all').hvplot(x='time', grid=True, widget_location='bottom', groupby='id')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Customizing the analysis settings\n", + "\n", + "### a) Defining seasons\n", + "We can define seasons using indexers that are compatible with `xclim.core.calendar.select_time`. There are currently four accepted types of indexers:\n", + "\n", + "- `month`, followed by a sequence of month numbers.\n", + "- `season`, followed by one or more of 'DJF', 'MAM', 'JJA' and 'SON'.\n", + "- `doy_bounds`, followed by a sequence representing the inclusive bounds of the period to be considered (start, end).\n", + "- `date_bounds`, which is the same as above, but using a month-day (%m-%d) format.\n", + "\n", + "For the purpose of getting block maxima through `xhydro.indicators.get_yearly_op`, the indexers need to be grouped within a dictionary, with the key being the label to be given to the requested period of the year. A second key, `freq`, can be used to specify the resampling frequency, for example to wrap around the year for winter." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Some examples\n", + "timeargs = {\n", + " \"spring\": {\"date_bounds\": [\"02-11\", \"06-19\"]},\n", + " \"summer\": {\"doy_bounds\": [152, 243]},\n", + " \"fall\": {\"month\": [9, 10, 11]},\n", + " \"winter\": {\"season\": ['DJF'], \"freq\": \"AS-DEC\"}, # To correctly wrap around the year, we need to specify the resampling frequency.\n", + " \"august\": {\"month\": [8]},\n", + " \"annual\": {}\n", + " }" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### b) Getting block maxima\n", + "Once the desired seasons have been defined, we can extract block maxima time series for every station using `xhydro.indicators.get_yearly_op`. The main arguments are:\n", + "\n", + "- `op`: the operation to compute. One of \"max\", \"min\", \"mean\", \"sum\".\n", + "- `input_var`: the name of the variable. Defaults to \"streamflow\".\n", + "- `window`: the size of the rolling window. A \"mean\" is performed on the rolling window prior to the `op` operation.\n", + "- `timeargs`: as defined previously. Leave at `None` to get the annual maxima.\n", + "- `missing` and `missing_options`: to define tolerances for missing data. See [this page](https://xclim.readthedocs.io/en/stable/checks.html#missing-values-identification) for more information.\n", + "- `interpolate_na`: whether to interpolate missing data prior to the `op` operation. Only used for `sum`."
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The function returns an `xarray.Dataset` with one variable per indexer." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Here, we hide years with more than 15% of missing data.\n", + "ds_4fa = xh.indicators.get_yearly_op(ds, op=\"max\", timeargs=timeargs, missing=\"pct\", missing_options={\"tolerance\": 0.15})\n", + "\n", + "ds_4fa" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "ds_4fa.streamflow_max_summer.dropna('time', 'all').hvplot(x='time', grid=True, widget_location='bottom', groupby='id')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### c) Using custom seasons per year or per station\n", + "\n", + "Using individualized date ranges for each year or each catchment is not explicitly supported, so users should instead mask their data prior to calling `get_yearly_op`. Note that when doing this, `missing` should be adjusted accordingly." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Create a mask beforehand\n", + "nyears = np.unique(ds.time.dt.year).size\n", + "dom_start = xr.DataArray(np.random.randint(1, 30, size=(nyears, )).astype(\"str\"), dims=(\"year\"), coords={\"year\": np.unique(ds.time.dt.year)})\n", + "dom_end = xr.DataArray(np.random.randint(1, 30, size=(nyears, )).astype(\"str\"), dims=(\"year\"), coords={\"year\": np.unique(ds.time.dt.year)})\n", + "\n", + "mask = xr.zeros_like(ds[\"streamflow\"])\n", + "for y in dom_start.year.values:\n", + " # Random mask of dates per year, between April and June.\n", + " mask.loc[{\"time\": slice(str(y)+\"-04-\"+str(dom_start.sel(year=y).item()), str(y)+\"-06-\"+str(dom_end.sel(year=y).item()))}] = 1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "mask.hvplot(x='time', grid=True, widget_location='bottom', groupby='id')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# The name of the indexer will be used to identify the variable created here\n", + "timeargs_custom = {\"custom\": {}}\n", + "\n", + "# We use where() to mask the data that we want to ignore\n", + "masked = ds.where(mask==1)\n", + "# Since we masked almost all of the year, our tolerance for missing data should be changed accordingly\n", + "missing = \"at_least_n\"\n", + "missing_options = {\"n\": 45}\n", + "\n", + "# We use xr.merge() to combine the results with the previous dataset.\n", + "ds_4fa = xr.merge([ds_4fa, xh.indicators.get_yearly_op(masked, op=\"max\", timeargs=timeargs_custom, missing=missing, missing_options=missing_options)])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ds_4fa.streamflow_max_custom.dropna('time', 'all').hvplot(x='time', grid=True, widget_location='bottom', groupby='id')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### d) Computing volumes\n", + "\n", + "The frequency analysis can also be performed on volumes, using a similar workflow. The main difference is that if we're starting from streamflow, we'll first need to convert it into volumes using `xhydro.indicators.compute_volume`.
Also, if required, `get_yearly_op` has an argument `interpolate_na` that can be used to interpolate missing data prior to the sum." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Get a daily volume from a daily streamflow\n", + "ds[\"volume\"] = xh.indicators.compute_volume(ds[\"streamflow\"], out_units=\"hm3\")\n", + "\n", + "# We'll take slightly different indexers\n", + "timeargs_vol = {\n", + " \"spring\": {\"date_bounds\": [\"04-30\", \"06-15\"]},\n", + " \"annual\": {}\n", + " }\n", + "\n", + "# The operation that we want here is the sum, not the max.\n", + "ds_4fa = xr.merge([ds_4fa, xh.indicators.get_yearly_op(ds, op=\"sum\", input_var=\"volume\", timeargs=timeargs_vol, missing=\"pct\", missing_options={\"tolerance\": 0.15}, interpolate_na=True)])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ds_4fa.volume_sum_spring.dropna('time', 'all').hvplot(x='time', grid=True, widget_location='bottom', groupby='id')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Local frequency analysis\n", + "\n", + "Once we have our yearly maxima (or minima/volumes), the first step in a local frequency analysis is to call `xhfa.local.fit` to obtain distribution parameters. The options are:\n", + "\n", + "- `distributions`: list of [SciPy distributions](https://docs.scipy.org/doc/scipy/reference/stats.html#continuous-distributions). Defaults to [\"expon\", \"gamma\", \"genextreme\", \"genpareto\", \"gumbel_r\", \"pearson3\", \"weibull_min\"].\n", + "- `min_years`: minimum number of years required to fit the data.\n", + "- `method`: fitting method. Defaults to maximum likelihood (\"ML\")." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# To speed up the Notebook, we'll only perform the analysis on a subset of variables\n", + "params = xhfa.local.fit(ds_4fa[[\"streamflow_max_spring\", \"volume_sum_spring\"]], min_years=15)\n", + "\n", + "params" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Information criteria such as the AIC, BIC, and AICC are useful to determine which statistical distribution is best suited to a given location. These three criteria can be computed using `xhfa.local.criteria`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "criteria = xhfa.local.criteria(ds_4fa[[\"streamflow_max_spring\", \"volume_sum_spring\"]], params)\n", + "\n", + "criteria" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, return periods can be obtained using `xhfa.local.parametric_quantiles`. The options are:\n", + "\n", + "- `t`: the return period(s) in years.\n", + "- `mode`: whether the return period is the probability of exceedance (max) or non-exceedance (min). Defaults to `max`. For example, with `max`, a 100-year return period corresponds to the 0.99 quantile (`1 - 1/100`) of the fitted distribution."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "rp = xhfa.local.parametric_quantiles(params, t=[20, 100])\n", + "\n", + "rp" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + }, + "vscode": { + "interpreter": { + "hash": "e28391989cdb8b31df72dd917935faad186df3329a743c813090fc6af977a1ca" + } + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/environment-dev.yml b/environment-dev.yml index 58903a8e..9511ef47 100644 --- a/environment-dev.yml +++ b/environment-dev.yml @@ -5,6 +5,10 @@ dependencies: - python >=3.9,<3.12 # Don't forget to sync the changes here with environment.yml! # Main packages + - numpy + - scipy + - statsmodels + - xarray - xclim >=0.45.0 - xscen >=0.7.1 # Dev @@ -33,3 +37,7 @@ dependencies: # Packaging - build - wheel + # Notebooks + - hvplot + - pip: + - xdatasets diff --git a/environment.yml b/environment.yml index 09a03595..98b2726c 100644 --- a/environment.yml +++ b/environment.yml @@ -3,5 +3,9 @@ channels: - conda-forge dependencies: - python >=3.9,<3.12 + - numpy + - scipy + - statsmodels + - xarray - xclim >=0.45.0 - xscen >=0.7.1 diff --git a/setup.py b/setup.py index a013b625..0ac84413 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ with open("README.rst") as readme_file: readme = readme_file.read() -requirements = ["xscen", "xclim>=0.45.0"] +requirements = ["numpy", "scipy", "statsmodels", "xarray", "xclim>=0.45.0", "xscen"] dev_requirements = ["pytest", "pytest-cov"] diff --git a/tests/__init__.py b/tests/__init__.py deleted file mode 100644 index f30ee4f2..00000000 --- a/tests/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Unit test package for xhydro.""" diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 00000000..94451556 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1 @@ +# noqa: D100 diff --git a/tests/test_cc.py b/tests/test_cc.py new file mode 100644 index 00000000..41918422 --- /dev/null +++ b/tests/test_cc.py @@ -0,0 +1,14 @@ +import xhydro as xh + + +# Smoke test for xscen functions that are imported into xhydro +def test_xscen_imported(): + assert all( + callable(getattr(xh.cc, f)) + for f in [ + "climatological_op", + "compute_deltas", + "ensemble_stats", + "produce_horizon", + ] + ) diff --git a/tests/test_indicators.py b/tests/test_indicators.py new file mode 100644 index 00000000..c40419ca --- /dev/null +++ b/tests/test_indicators.py @@ -0,0 +1,249 @@ +from copy import deepcopy + +import numpy as np +import pytest +from xclim.testing.helpers import test_timeseries as timeseries + +import xhydro as xh + + +# Smoke test for xscen functions that are imported into xhydro +def test_xscen_imported(): + assert callable(getattr(xh.indicators, "compute_indicators")) + + +class TestComputeVolume: + @pytest.mark.parametrize("freq", ["D", "YS"]) + def test_compute_volume(self, freq): + tile = 365 if freq == "D" else 1 + da = timeseries( + np.tile(np.arange(1, tile + 1), 3), + variable="streamflow", + start="2001-01-01", + freq=freq, + ) + + out = xh.indicators.compute_volume(da, attrs={"long_name": "Foo"}) + mult = 86400 if freq == "D" else 86400 * 365 + np.testing.assert_array_equal(out, da * mult) + assert 
out.attrs["long_name"] == "Foo" + assert out.attrs["units"] == "m^3" + assert out.attrs["cell_methods"] == "time: sum" + assert out.attrs["description"] == "Volume of water" + + def test_units(self): + da = timeseries( + np.tile(np.arange(1, 366), 3), + variable="streamflow", + start="2001-01-01", + freq="D", + ) + + out_m3 = xh.indicators.compute_volume(da) + assert out_m3.attrs["units"] == "m^3" + out_hm3 = xh.indicators.compute_volume(da, out_units="hm3") + assert out_hm3.attrs["units"] == "hm^3" + np.testing.assert_array_equal(out_m3 * 1e-6, out_hm3) + + +class TestGetYearlyOp: + ds = timeseries( + np.arange(1, 365 * 3 + 1), + variable="streamflow", + start="2001-01-01", + freq="D", + as_dataset=True, + ) + + @pytest.mark.parametrize("op", ["max", "min"]) + def test_get_yearly_op(self, op): + timeargs = { + "annual": {}, + "winterdate": {"date_bounds": ["12-01", "02-28"], "freq": "AS-DEC"}, + "winterdoy": {"doy_bounds": [335, 59], "freq": "AS-DEC"}, + "winterdjf": {"season": ["DJF"], "freq": "AS-DEC"}, + "summer": {"doy_bounds": [200, 300]}, + } + + out = xh.indicators.get_yearly_op(self.ds, op=op, timeargs=timeargs) + assert all(["streamflow" in v for v in out.data_vars]) + assert len(out.data_vars) == len(timeargs) + + if op == "max": + np.testing.assert_array_equal( + out.streamflow_max_annual, + np.add(np.tile(365, 3), np.array([0, 365, 365 * 2])), + ) + np.testing.assert_array_equal( + out.streamflow_max_summer, + np.add(np.tile(300, 3), np.array([0, 365, 365 * 2])), + ) + np.testing.assert_array_equal( + out.streamflow_max_winterdate, + np.add( + np.array([365 + 59, 365 + 59, 365]), np.array([0, 365, 365 * 2]) + ), + ) + np.testing.assert_array_equal( + out.streamflow_max_winterdoy, out.streamflow_max_winterdate + ) + np.testing.assert_array_equal( + out.streamflow_max_winterdjf, out.streamflow_max_winterdate + ) + elif op == "min": + np.testing.assert_array_equal( + out.streamflow_min_annual, + np.add(np.tile(1, 3), np.array([0, 365, 365 * 2])), + ) + np.testing.assert_array_equal( + out.streamflow_min_summer, + np.add(np.tile(200, 3), np.array([0, 365, 365 * 2])), + ) + np.testing.assert_array_equal( + out.streamflow_min_winterdate, + np.add(np.tile(335, 3), np.array([0, 365, 365 * 2])), + ) + np.testing.assert_array_equal( + out.streamflow_min_winterdoy, out.streamflow_min_winterdate + ) + np.testing.assert_array_equal( + out.streamflow_min_winterdjf, out.streamflow_min_winterdate + ) + + def test_missing(self): + timeargs = {"winterdate": {"date_bounds": ["12-01", "02-28"], "freq": "AS-DEC"}} + out = xh.indicators.get_yearly_op( + self.ds, + op="max", + timeargs=timeargs, + missing="pct", + missing_options={"tolerance": 0.1}, + ) + + np.testing.assert_array_equal( + out.streamflow_max_winterdate, + np.add(np.array([365 + 59, 365 + 59, np.nan]), np.array([0, 365, 365 * 2])), + ) + + def test_window(self): + out = xh.indicators.get_yearly_op(self.ds, op="max", window=2) + + assert all(["streamflow2" in v for v in out.data_vars]) + np.testing.assert_array_equal( + out.streamflow2_max_annual, np.array([364.5, 729.5, 1094.5]) + ) + + def test_sum(self): + ds = timeseries( + np.arange(1, 365 * 3 + 1), + variable="streamflow", + start="2001-01-01", + freq="D", + as_dataset=True, + ) + ds["volume"] = xh.indicators.compute_volume(ds.streamflow) + ds["volume"] = ds["volume"].where( + ~((ds.time.dt.month == 1) & (ds.time.dt.day == 3)) + ) + + timeargs = { + "annual": {}, + "winterdate": {"date_bounds": ["12-01", "02-28"], "freq": "AS-DEC"}, + "summer": {"doy_bounds": [200, 300]}, + } + 
out_sum = xh.indicators.get_yearly_op( + ds, input_var="volume", op="sum", timeargs=timeargs + ) + out_interp = xh.indicators.get_yearly_op( + ds, input_var="volume", op="sum", timeargs=timeargs, interpolate_na=True + ) + + ans = { + "annual": np.array( + [ + np.sum(np.arange(1, 365 + 1)) * 86400, + np.sum(np.arange(1 + 365, 365 + 365 + 1)) * 86400, + np.sum(np.arange(1 + 730, 365 + 730 + 1)) * 86400, + ] + ), + "summer": np.array( + [ + np.sum(np.arange(200, 300 + 1)) * 86400, + np.sum(np.arange(200 + 365, 300 + 365 + 1)) * 86400, + np.sum(np.arange(200 + 730, 300 + 730 + 1)) * 86400, + ] + ), + "winterdate": np.array( + [ + np.sum( + np.concatenate( + (np.arange(335, 365 + 1), np.arange(1 + 365, 59 + 365 + 1)) + ) + ) + * 86400, + np.sum( + np.concatenate( + ( + np.arange(335 + 365, 365 + 365 + 1), + np.arange(1 + 730, 59 + 730 + 1), + ) + ) + ) + * 86400, + np.sum(np.arange(335 + 730, 365 + 730 + 1)) * 86400, + ] + ), + } + + assert all(["volume" in v for v in out_interp.data_vars]) + np.testing.assert_array_equal( + out_interp.volume_sum_summer, out_sum.volume_sum_summer + ) + np.testing.assert_array_equal(out_interp.volume_sum_annual, ans["annual"]) + np.testing.assert_array_equal(out_interp.volume_sum_summer, ans["summer"]) + np.testing.assert_array_equal( + out_interp.volume_sum_winterdate, ans["winterdate"] + ) + + np.testing.assert_array_equal( + out_sum.volume_sum_annual, (ans["annual"] - np.array([3, 368, 733]) * 86400) + ) + np.testing.assert_array_equal(out_sum.volume_sum_summer, ans["summer"]) + np.testing.assert_array_equal( + out_sum.volume_sum_winterdate, + ans["winterdate"] - np.array([368, 733, 0]) * 86400, + ) + + def test_errors(self): + with pytest.raises(ValueError, match="Operation foo is not supported."): + xh.indicators.get_yearly_op(self.ds, op="foo") + with pytest.raises(ValueError, match="Cannot use a rolling window"): + xh.indicators.get_yearly_op(self.ds, op="sum", window=2) + with pytest.raises(ValueError, match="Frequency D is not supported"): + xh.indicators.get_yearly_op( + self.ds, op="max", timeargs={"annual": {"freq": "D"}} + ) + with pytest.raises(ValueError, match="Only one indexer"): + xh.indicators.get_yearly_op( + self.ds, + op="max", + timeargs={"annual": {"season": ["DJF"], "doy_bounds": [200, 300]}}, + ) + with pytest.warns(UserWarning, match="The frequency is not AS-DEC"): + xh.indicators.get_yearly_op( + self.ds, op="max", timeargs={"annual": {"season": ["DJF"]}} + ) + with pytest.warns(UserWarning, match="The bounds wrap around the year"): + xh.indicators.get_yearly_op( + self.ds, + op="max", + timeargs={"annual": {"date_bounds": ["06-15", "06-14"]}}, + ) + with pytest.warns(UserWarning, match="but the bounds"): + xh.indicators.get_yearly_op( + self.ds, + op="max", + timeargs={ + "annual": {"date_bounds": ["06-01", "04-30"], "freq": "AS-DEC"} + }, + ) diff --git a/tests/test_local.py b/tests/test_local.py new file mode 100644 index 00000000..ac4db5d3 --- /dev/null +++ b/tests/test_local.py @@ -0,0 +1,153 @@ +from copy import deepcopy + +import numpy as np +import pytest +from xclim.testing.helpers import test_timeseries as timeseries + +import xhydro.frequency_analysis as xhfa + + +class TestFit: + def test_fit(self): + ds = timeseries( + np.array([50, 65, 80, 95, 110, 125, 140, 155, 170, 185, 200]), + variable="streamflow", + start="2001-01-01", + freq="YS", + as_dataset=True, + ) + params = xhfa.local.fit(ds, distributions=["gamma", "pearson3"]) + + np.testing.assert_array_equal(params.dparams, ["a", "skew", "loc", "scale"]) + 
np.testing.assert_array_equal(params.scipy_dist, ["gamma", "pearson3"]) + assert params.streamflow.attrs["scipy_dist"] == ["gamma", "pearson3"] + assert params.streamflow.attrs["estimator"] == "Maximum likelihood" + assert params.streamflow.attrs["long_name"] == "Distribution parameters" + assert ( + params.streamflow.attrs["description"] == "Parameters of the distributions" + ) + + np.testing.assert_array_almost_equal( + params.streamflow, + [ + [9.95357815e00, np.nan, -3.07846650e01, 1.56498193e01], + [np.nan, -2.25674044e-05, 1.25012261e02, 4.74238877e01], + ], + ) + + def test_default(self): + ds = timeseries( + np.array([50, 65, 80, 95, 110, 125, 140, 155, 170, 185, 200]), + variable="streamflow", + start="2001-01-01", + freq="YS", + as_dataset=True, + ) + params = xhfa.local.fit(ds) + np.testing.assert_array_equal( + params.dparams, ["a", "c", "skew", "loc", "scale"] + ) + np.testing.assert_array_equal( + params.scipy_dist, + [ + "expon", + "gamma", + "genextreme", + "genpareto", + "gumbel_r", + "pearson3", + "weibull_min", + ], + ) + + @pytest.mark.parametrize("miny", [10, 15]) + def test_min_years(self, miny): + ds = timeseries( + np.array([50, 65, 80, 95, 110, 125, 140, 155, 170, 185, 200]), + variable="streamflow", + start="2001-01-01", + freq="YS", + as_dataset=True, + ) + params = xhfa.local.fit(ds, distributions=["gamma"], min_years=miny) + np.testing.assert_array_almost_equal( + params.streamflow, + [[9.95357815e00, -3.07846650e01, 1.56498193e01]] + if miny == 10 + else [[np.nan, np.nan, np.nan]], + ) + + +@pytest.mark.parametrize("mode", ["max", "min", "foo"]) +def test_quantiles(mode): + ds = timeseries( + np.array([50, 65, 80, 95, 110, 125, 140, 155, 170, 185, 200]), + variable="streamflow", + start="2001-01-01", + freq="YS", + as_dataset=True, + ) + params = xhfa.local.fit(ds, distributions=["gamma", "pearson3"]) + + if mode == "foo": + with pytest.raises(ValueError): + xhfa.local.parametric_quantiles(params, [10, 20], mode=mode) + else: + rp = xhfa.local.parametric_quantiles(params, [10, 20], mode=mode) + + np.testing.assert_array_equal( + rp.return_period, [0.9, 0.95] if mode == "max" else [0.1, 0.05] + ) + np.testing.assert_array_equal(rp.scipy_dist, ["gamma", "pearson3"]) + assert rp.streamflow.attrs["long_name"] == "Distribution quantiles" + assert ( + rp.streamflow.attrs["description"] + == "Quantiles estimated by statistic distributions" + ) + assert rp.streamflow.attrs["cell_methods"] == "dparams: ppf" + + ans = ( + [[190.66041057, 214.08102761], [185.78830382, 203.01731036]] + if mode == "max" + else [[66.00067153, 53.58658639], [64.23598869, 47.00660287]] + ) + np.testing.assert_array_almost_equal(rp.streamflow, ans) + + +def test_criteria(): + ds = timeseries( + np.array([50, 65, 80, 95, 110, 125, 140, 155, 170, 185, 200]), + variable="streamflow", + start="2001-01-01", + freq="YS", + as_dataset=True, + ) + params = xhfa.local.fit(ds, distributions=["gamma", "pearson3"]) + crit = xhfa.local.criteria(ds, params) + crit_with_otherdim = xhfa.local.criteria( + ds.expand_dims("otherdim"), params.expand_dims("otherdim") + ) + np.testing.assert_array_almost_equal( + crit.streamflow, crit_with_otherdim.streamflow.squeeze("otherdim") + ) + + np.testing.assert_array_equal(crit.scipy_dist, ["gamma", "pearson3"]) + np.testing.assert_array_equal(crit.criterion, ["aic", "bic", "aicc"]) + assert crit.streamflow.attrs["long_name"] == "Information criteria" + assert ( + crit.streamflow.attrs["description"] + == "Information criteria for the distribution parameters." 
+ ) + assert crit.streamflow.attrs["scipy_dist"] == ["gamma", "pearson3"] + assert all( + attr not in crit.streamflow.attrs + for attr in ["estimator", "method", "min_years"] + ) + + np.testing.assert_array_almost_equal( + crit.streamflow, + [ + [118.19066549, 118.58856076, 118.63510993], + [118.12140939, 118.51930466, 118.56585383], + ], + ) diff --git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 00000000..52e62450 --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,6 @@ +import xhydro as xh + + +# Smoke test for xscen functions that are imported into xhydro +def test_xscen_imported(): + assert callable(getattr(xh.utils, "health_checks")) diff --git a/tests/test_xhydro.py b/tests/test_xhydro.py index ccf0d111..138c06b7 100644 --- a/tests/test_xhydro.py +++ b/tests/test_xhydro.py @@ -7,8 +7,6 @@ import pytest -import xhydro - @pytest.fixture def response(): diff --git a/xhydro/__init__.py b/xhydro/__init__.py index 1f718d19..79a36766 100644 --- a/xhydro/__init__.py +++ b/xhydro/__init__.py @@ -1,7 +1,8 @@ """Top-level package for xHydro.""" +from . import cc, indicators, utils + # Import top-level functions -from .cc import * -from .indicators import * +# TODO: Decide which functions to import at the top level __author__ = """Thomas-Charles Fortier Filion""" __email__ = "tcff_hydro@outlook.com" diff --git a/xhydro/frequency_analysis/__init__.py b/xhydro/frequency_analysis/__init__.py new file mode 100644 index 00000000..1e344031 --- /dev/null +++ b/xhydro/frequency_analysis/__init__.py @@ -0,0 +1,3 @@ +"""Frequency analysis module.""" + +from . import local diff --git a/xhydro/frequency_analysis/local.py b/xhydro/frequency_analysis/local.py new file mode 100644 index 00000000..47404db5 --- /dev/null +++ b/xhydro/frequency_analysis/local.py @@ -0,0 +1,268 @@ +"""Local frequency analysis functions and utilities.""" + +import datetime +from typing import Union + +import numpy as np +import scipy.stats +import statsmodels +import xarray as xr +import xclim.indices.stats +from statsmodels.tools import eval_measures + +__all__ = [ + "fit", + "parametric_quantiles", + "criteria", +] + + +def fit( + ds, distributions: list = None, min_years=None, method: str = "ML" +) -> xr.Dataset: + """Fit multiple distributions to data. + + Parameters + ---------- + ds : xr.Dataset + Dataset containing the data to fit. All variables will be fitted. + distributions : list of str + List of distribution names as defined in `scipy.stats`. See https://docs.scipy.org/doc/scipy/reference/stats.html#continuous-distributions. + Defaults to ["expon", "gamma", "genextreme", "genpareto", "gumbel_r", "pearson3", "weibull_min"]. + min_years : int + Minimum number of years required for a distribution to be fitted. + method : str + Fitting method. Defaults to "ML" (maximum likelihood). + + Returns + ------- + xr.Dataset + Dataset containing the parameters of the fitted distributions, with a new dimension `scipy_dist` containing the distribution names. + + Notes + ----- + In order to combine the parameters of multiple distributions, the size of the `dparams` dimension is set to the maximum number of unique parameters between the distributions. 
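+
+    For example, "gamma" has parameters ("a", "loc", "scale") while "gumbel_r" only has ("loc", "scale"):
+    fitting both yields a `dparams` dimension of ("a", "loc", "scale"), with "a" left as NaN for "gumbel_r".
+
+    Examples
+    --------
+    A minimal sketch, assuming `ds` holds annual maxima such as the output of `xhydro.indicators.get_yearly_op`:
+
+    >>> params = fit(ds, distributions=["gamma", "gumbel_r"], min_years=10)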
+ """ + distributions = distributions or [ + "expon", + "gamma", + "genextreme", + "genpareto", + "gumbel_r", + "pearson3", + "weibull_min", + ] + out = [] + for v in ds.data_vars: + p = [] + for d in distributions: + p.append( + xclim.indices.stats.fit( + ds[v].chunk({"time": -1}), dist=d, method=method + ) + .assign_coords(scipy_dist=d) + .expand_dims("scipy_dist") + ) + params = xr.concat(p, dim="scipy_dist") + + # Reorder dparams to match the order of the parameters across all distributions, since subsequent operations rely on this. + p_order = [list(pi.dparams.values) for pi in p] + if any(len(pi) != 2 and len(pi) != 3 for pi in p_order): + raise NotImplementedError( + "Only distributions with 2 or 3 parameters are currently supported." + ) + skew = np.unique([pi[0] for pi in p_order if len(pi) == 3]) + loc = np.unique([pi[0] if len(pi) == 2 else pi[1] for pi in p_order]) + scale = np.unique([pi[1] if len(pi) == 2 else pi[2] for pi in p_order]) + # if any common name between skew, loc and scale, raise error + if ( + any([lo in skew for lo in loc]) + or any([s in skew for s in scale]) + or any([s in loc for s in scale]) + ): + raise ValueError("Guessed skew, loc, and scale parameters are not unique.") + params = params.sel(dparams=np.concatenate([skew, loc, scale])) + + # Check that it worked by making sure that the index is in ascending order + for pi in p: + p_order = [list(params.dparams.values).index(p) for p in pi.dparams.values] + if not np.all(np.diff(p_order) > 0): + raise ValueError( + "Something went wrong when ordering the parameters across distributions." + ) + + if min_years is not None: + params = params.where(ds[v].notnull().sum("time") >= min_years) + params.attrs["scipy_dist"] = distributions + params.attrs["description"] = "Parameters of the distributions" + params.attrs["long_name"] = "Distribution parameters" + params.attrs["min_years"] = min_years + out.append(params) + + out = xr.merge(out) + out.attrs = ds.attrs + + return out + + +def parametric_quantiles(p, t: Union[float, list], mode: str = "max") -> xr.Dataset: + """Compute quantiles from fitted distributions. + + Parameters + ---------- + p : xr.Dataset + Dataset containing the parameters of the fitted distributions. + Must have a dimension `dparams` containing the parameter names and a dimension `scipy_dist` containing the distribution names. + t : float or list of float + Return period(s) in years. + mode : {'max', 'min'} + Whether the return period is the probability of exceedance (max) or non-exceedance (min). + + Returns + ------- + xr.Dataset + Dataset containing the quantiles of the distributions. + """ + distributions = list(p["scipy_dist"].values) + + t = np.atleast_1d(t) + if mode == "max": + t = 1 - 1.0 / t + elif mode == "min": + t = 1.0 / t + else: + raise ValueError(f"'mode' must be 'max' or 'min', got '{mode}'.") + + out = [] + for v in p.data_vars: + quantiles = [] + for d in distributions: + da = ( + p[v] + .sel(scipy_dist=d) + .dropna("dparams", how="all") + .transpose("dparams", ...) 
+ ) + da.attrs["scipy_dist"] = d + q = ( + xclim.indices.stats.parametric_quantile(da, q=t) + .rename({"quantile": "return_period"}) + .assign_coords(scipy_dist=d) + .expand_dims("scipy_dist") + ) + quantiles.append(q) + quantiles = xr.concat(quantiles, dim="scipy_dist") + quantiles.attrs["scipy_dist"] = distributions + quantiles.attrs[ + "description" + ] = "Quantiles estimated by statistic distributions" + quantiles.attrs["long_name"] = "Distribution quantiles" + out.append(quantiles) + + out = xr.merge(out) + out.attrs = p.attrs + + return out + + +def criteria(ds, p) -> xr.Dataset: + """Compute information criteria (AIC, BIC, AICC) from fitted distributions, using the log-likelihood. + + Parameters + ---------- + ds : xr.Dataset + Dataset containing the yearly data that was fitted. + p : xr.Dataset + Dataset containing the parameters of the fitted distributions. + Must have a dimension `dparams` containing the parameter names and a dimension `scipy_dist` containing the distribution names. + + Returns + ------- + xr.Dataset + Dataset containing the information criteria for the distributions. + """ + + def _get_criteria_1d(da, params, dist): + da = np.atleast_2d(da).T + params = np.atleast_2d(params).T + + llf = np.nansum(dist.logpdf(da, *params), axis=0) # log-likelihood + nobs = np.sum(np.isfinite(da), axis=0) + df_modelwc = len(params) # number of parameters of the distribution + dof_eff = nobs - df_modelwc - 1.0 + + aic = eval_measures.aic(llf=llf, nobs=nobs, df_modelwc=df_modelwc) + bic = eval_measures.bic(llf=llf, nobs=nobs, df_modelwc=df_modelwc) + # Custom AICC, because the one in statsmodels does not support multiple dimensions + aicc = np.where( + dof_eff > 0, -2.0 * llf + 2.0 * df_modelwc * nobs / dof_eff, np.nan + ) + + return np.stack([aic, bic, aicc], axis=1) + + out = [] + distributions = list(p["scipy_dist"].values) + + common_vars = list(set(ds.data_vars).intersection(p.data_vars)) + for v in common_vars: + c = [] + for d in distributions: + da = ds[v].transpose("time", ...) + params = ( + p[v] + .sel(scipy_dist=d) + .dropna("dparams", how="all") + .transpose("dparams", ...) + ) + dist_obj = getattr(scipy.stats, d) + + if len(da.dims) == 1: + crit = xr.apply_ufunc( + _get_criteria_1d, + da.expand_dims("tmp"), + params.expand_dims("tmp"), + kwargs={"dist": dist_obj}, + input_core_dims=[["time"], ["dparams"]], + output_core_dims=[["criterion"]], + dask_gufunc_kwargs=dict(output_sizes={"criterion": 3}), + dask="parallelized", + ).squeeze("tmp") + else: + crit = xr.apply_ufunc( + _get_criteria_1d, + da, + params, + kwargs={"dist": dist_obj}, + input_core_dims=[["time"], ["dparams"]], + output_core_dims=[["criterion"]], + dask_gufunc_kwargs=dict(output_sizes={"criterion": 3}), + dask="parallelized", + ) + + # Add attributes + crit = crit.assign_coords(criterion=["aic", "bic", "aicc"]).expand_dims( + "scipy_dist" + ) + crit.attrs = p[v].attrs + crit.attrs["history"] = ( + crit.attrs.get("history", "") + + f", [{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] criteria: computed AIC, BIC and AICC. - statsmodels version: {statsmodels.__version__}" + ) + crit.attrs[ + "description" + ] = "Information criteria for the distribution parameters."
+ crit.attrs["long_name"] = "Information criteria" + + # Remove a few attributes that are not relevant anymore + crit.attrs.pop("method", None) + crit.attrs.pop("estimator", None) + crit.attrs.pop("min_years", None) + c.append(crit) + + c = xr.concat(c, dim="scipy_dist") + out.append(c) + + out = xr.merge(out) + out.attrs = ds.attrs + + return out diff --git a/xhydro/indicators.py b/xhydro/indicators.py index b330aa8a..52f51e67 100644 --- a/xhydro/indicators.py +++ b/xhydro/indicators.py @@ -1,8 +1,241 @@ """Module to compute indicators using xclim's build_indicator_module_from_yaml.""" +import warnings + +import xarray as xr +import xclim as xc +import xscen as xs +from xclim.core.units import rate2amount # Special imports from xscen from xscen import compute_indicators __all__ = [ "compute_indicators", + "compute_volume", + "get_yearly_op", ] + + +def compute_volume( + da: xr.DataArray, *, out_units: str = "m3", attrs: dict = None +) -> xr.DataArray: + """Compute the volume of water from a streamflow variable, keeping the same frequency. + + Parameters + ---------- + da : xr.DataArray + Streamflow variable. + out_units : str + Output units. Defaults to "m3". + attrs : dict + Attributes to add to the output variable. + Default attributes for "long_name", "units", "cell_methods" and "description" will be added if not provided. + + Returns + ------- + xr.DataArray + Volume of water. + """ + default_attrs = { + "long_name": "Volume of water", + "cell_methods": "time: sum", + "description": "Volume of water", + } + attrs = attrs or {} + # Add default attributes + for k, v in default_attrs.items(): + attrs.setdefault(k, v) + + out = rate2amount(da, out_units=out_units) + out.attrs.update(attrs) + + return out + + +def get_yearly_op( + ds, + op, + *, + input_var: str = "streamflow", + window: int = 1, + timeargs: dict = None, + missing: str = "skip", + missing_options: dict = None, + interpolate_na: bool = False, +) -> xr.Dataset: + """ + Compute yearly operations on a variable. + + Parameters + ---------- + ds: xr.Dataset + Dataset containing the variable to compute the operation on. + op: str + Operation to compute. One of ["max", "min", "mean", "sum"]. + input_var: str + Name of the input variable. Defaults to "streamflow". + window: int + Size of the rolling window. A "mean" operation is performed on the rolling window before the call to xclim. + This parameter cannot be used with the "sum" operation. + timeargs: dict + Dictionary of time arguments for the operation. Keys are the name of the period that will be added to the results (e.g. "winter", "summer", "annual"). + Values are up to two dictionaries, with both being optional. + The first is {'freq': str}, where str is a frequency supported by xarray (e.g. "YS", "AS-JAN", "AS-DEC"). It needs to be a yearly frequency. Defaults to "AS-JAN". + The second is an indexer as supported by :py:func:`xclim.core.calendar.select_time`. Defaults to {}, which means the whole year. + See :py:func:`xclim.core.calendar.select_time` for more information. + Examples: {"winter": {"freq": "AS-DEC", "date_bounds": ['12-01', '02-28']}}, {"jan": {"freq": "YS", "month": 1}}, {"annual": {}}. + missing: str + How to handle missing values. One of "skip", "any", "at_least_n", "pct", "wmo". + See :py:func:`xclim.core.missing` for more information. + missing_options: dict + Dictionary of options for the missing values' method. See :py:func:`xclim.core.missing` for more information. 
+ interpolate_na: bool + Whether to interpolate missing values before computing the operation. Only used with the "sum" operation. Defaults to False. + + Returns + ------- + xr.Dataset + Dataset containing the computed operations, with one variable per indexer. The name of the variable follows the pattern `{input_var}{window}_{op}_{indexer}`. + + Notes + ----- + If you want to perform a frequency analysis on a frequency that is finer than annual, simply use multiple timeargs (e.g. one per month) to create multiple distinct variables. + + """ + missing_options = missing_options or {} + timeargs = timeargs or {"annual": {}} + + if op not in ["max", "min", "mean", "sum"]: + raise ValueError( + f"Operation {op} is not supported. Please use one of ['max', 'min', 'mean', 'sum']." + ) + if op == "sum": + if window > 1: + raise ValueError("Cannot use a rolling window with a sum operation.") + if interpolate_na: + ds[input_var] = ds[input_var].interpolate_na(dim="time", method="linear") + + # Add the variable to xclim to avoid raising an error + if input_var not in xc.core.utils.VARIABLES: + attrs = { + "long_name": None, + "units": None, + "cell_methods": None, + "description": None, + } + attrs.update(ds[input_var].attrs) + attrs["canonical_units"] = attrs["units"] + attrs.pop("units") + xc.core.utils.VARIABLES[input_var] = attrs + + # FIXME: This should be handled by xclim once it supports rolling stats (Issue #1480) + # rolling window + if window > 1: + ds[input_var] = ( + ds[input_var] + .rolling(dim={"time": window}, min_periods=window, center=False) + .mean() + ) + + indicators = [] + month_labels = [ + "JAN", + "FEB", + "MAR", + "APR", + "MAY", + "JUN", + "JUL", + "AUG", + "SEP", + "OCT", + "NOV", + "DEC", + ] + for i in timeargs: + freq = timeargs[i].get("freq", "AS-JAN") + if not xc.core.calendar.compare_offsets(freq, "==", "YS"): + raise ValueError( + f"Frequency {freq} is not supported. Please use a yearly frequency." + ) + indexer = {k: v for k, v in timeargs[i].items() if k != "freq"} + if len(indexer) > 1: + raise ValueError("Only one indexer is supported per operation.") + + # Manage the frequency + if ( + "season" in indexer.keys() + and "DJF" in indexer["season"] + and freq != "AS-DEC" + ): + warnings.warn( + "The frequency is not AS-DEC, but the season indexer includes DJF. This will lead to misleading results." + ) + elif ( + "doy_bounds" in indexer.keys() + and indexer["doy_bounds"][0] >= indexer["doy_bounds"][1] + ) or ( + "date_bounds" in indexer.keys() + and int(indexer["date_bounds"][0].split("-")[0]) + >= int(indexer["date_bounds"][1].split("-")[0]) + ): + if "doy_bounds" in indexer.keys(): + # transform doy to a date to find the month + ts = xr.cftime_range( + start="2000-01-01", + periods=366, + freq="D", + calendar=ds.time.dt.calendar, + ) + month_start = ts[indexer["doy_bounds"][0] - 1].month + month_end = ts[indexer["doy_bounds"][1] - 1].month + else: + month_start = int(indexer["date_bounds"][0].split("-")[0]) + month_end = int(indexer["date_bounds"][1].split("-")[0]) + if month_end == month_start: + warnings.warn( + "The bounds wrap around the year, but start and end in the same month. This is not supported and will lead to wrong results." + ) + if freq == "YS" or (month_start != month_labels.index(freq.split("-")[1]) + 1): + warnings.warn( + f"The frequency is {freq}, but the bounds are between months {month_start} and {month_end}. You should use 'AS-{month_labels[month_start - 1]}' as the frequency."
+ ) + + identifier = f"{input_var}{window if window > 1 else ''}_{op}_{i.lower()}" + ind = xc.core.indicator.Indicator.from_dict( + data={ + "base": "stats", + "input": {"da": input_var}, + "parameters": { + "op": op if op != "sum" else "integral", + "indexer": indexer, + "freq": freq, + }, + "missing": missing, + "missing_options": missing_options, + }, + identifier=identifier, + module="fa", + ) + indicators.append((identifier, ind)) + + # Compute the indicators + ind_dict = compute_indicators(ds, indicators=indicators) + + # Combine all the indicators into one dataset + out = xr.merge( + [ + da.assign_coords( + time=xr.date_range( + da.time[0].dt.strftime("%Y-01-01").item(), + periods=da.time.size, + calendar=da.time.dt.calendar, + freq="YS", + ) + ) + for da in ind_dict.values() + ] + ) + out = xs.clean_up(out, common_attrs_only=ind_dict) + + return out diff --git a/xhydro/utils.py b/xhydro/utils.py new file mode 100644 index 00000000..b1392915 --- /dev/null +++ b/xhydro/utils.py @@ -0,0 +1,5 @@ +"""Utility functions for xhydro.""" + +from xscen.diagnostics import health_checks + +__all__ = ["health_checks"]
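Taken together, the additions in this changeset allow a compact end-to-end workflow. The following sketch (synthetic data and made-up values; not part of the diff itself) chains the new functions in the same order as the notebook:

import numpy as np
from xclim.testing.helpers import test_timeseries as timeseries

import xhydro as xh
import xhydro.frequency_analysis as xhfa

# Synthetic daily streamflow (15 years of fake data), as a Dataset
ds = timeseries(
    np.arange(1, 365 * 15 + 1).astype(float),
    variable="streamflow",
    start="2001-01-01",
    freq="D",
    as_dataset=True,
)

# Convert the streamflow to daily volumes
ds["volume"] = xh.indicators.compute_volume(ds["streamflow"], out_units="hm3")

# Annual and summer block maxima, one output variable per indexer
timeargs = {"annual": {}, "summer": {"doy_bounds": [152, 243]}}
ds_4fa = xh.indicators.get_yearly_op(ds, op="max", timeargs=timeargs)

# Fit two distributions, then compute 20- and 100-year return levels
# and the information criteria used to compare the fits
params = xhfa.local.fit(ds_4fa, distributions=["gamma", "gumbel_r"])
rp = xhfa.local.parametric_quantiles(params, t=[20, 100])
crit = xhfa.local.criteria(ds_4fa, params)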