Better degree_days_exceedance_date and ensemble stats #1647

Merged · 9 commits · Feb 16, 2024
2 changes: 2 additions & 0 deletions CHANGES.rst
@@ -22,6 +22,8 @@ New features and enhancements
* New ``xclim.core.calendar.stack_periods`` and ``unstack_periods`` for performing ``rolling(time=...).construct(..., stride=...)`` but with non-uniform temporal periods like years or months. They replace ``xclim.sdba.processing.construct_moving_yearly_window`` and ``unpack_moving_yearly_window`` which are deprecated and will be removed in a future release.
* New ``as_dataset`` options for ``xclim.set_options``. When True, indicators will output Datasets instead of DataArrays. (:issue:`1257`, :pull:`1625`).
* Added new option for UTCI calculation to cap low wind velocities to a minimum of 0.5 m/s following Bröde (2012) guidelines. (:issue:`1634`, :pull:`1635`).
* Added option ``never_reached`` to ``degree_days_exceedance_date`` to assign a custom value when the sum threshold is never reached. (:issue:`1459`, :pull:`1647`).
* Added option ``min_members`` to ensemble statistics to mask elements when the number of valid members is under a threshold. (:issue:`1459`, :pull:`1647`).

Breaking changes
^^^^^^^^^^^^^^^^
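
A minimal sketch of how the two new options might be used, assuming `tas` is a daily mean temperature DataArray and `ens` an ensemble Dataset built with `xclim.ensembles.create_ensemble` (both are placeholders, not defined in this diff):

from xclim import atmos, ensembles

# Years where the 1000 K-days sum is never reached get day-of-year 300 instead of NaN.
dded = atmos.degree_days_exceedance_date(
    tas=tas, thresh="4 degC", op=">", sum_thresh="1000 K days", never_reached=300
)

# Ensemble statistics are masked wherever fewer than 3 members are valid.
stats = ensembles.ensemble_mean_std_max_min(ens, min_members=3)
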
28 changes: 28 additions & 0 deletions tests/test_ensembles.py
@@ -290,6 +290,34 @@ def test_calc_mean_std_min_max(self, ensemble_dataset_objects, open_dataset):
out1.tg_mean_min[0, 5, 5], out2.tg_mean_min[0, 5, 5]
)

@pytest.mark.parametrize(
"aggfunc", [ensembles.ensemble_percentiles, ensembles.ensemble_mean_std_max_min]
)
def test_stats_min_members(self, ensemble_dataset_objects, open_dataset, aggfunc):
ds_all = [open_dataset(n) for n in ensemble_dataset_objects["nc_files_simple"]]
ens = ensembles.create_ensemble(ds_all).isel(lat=0, lon=0)
ens = ens.where(ens.realization > 0)
ens = xr.where((ens.realization == 1) & (ens.time.dt.year == 1950), np.NaN, ens)

def first(ds):
return ds[list(ds.data_vars.keys())[0]]

# Default, no masking
out = first(aggfunc(ens))
assert not out.isnull().any()

# A number
out = first(aggfunc(ens, min_members=3))
# Only 1950 is null
np.testing.assert_array_equal(
out.isnull(), [True] + [False] * (ens.time.size - 1)
)

# Special value
out = first(aggfunc(ens, min_members=None))
# All null
assert out.isnull().all()


@pytest.mark.slow
class TestEnsembleReduction:
21 changes: 21 additions & 0 deletions tests/test_temperature.py
@@ -1289,6 +1289,27 @@ def test_degree_days_exceedance_date(open_dataset):
np.testing.assert_array_equal(out, np.array([[np.nan, 280, 241, 244]]).T)


@pytest.mark.parametrize(
"never_reached,exp", [(None, np.NaN), (300, 300), ("12-01", 335)]
)
def test_degree_days_exceedance_date_never_reached(open_dataset, never_reached, exp):
tas = open_dataset("FWI/GFWED_sample_2017.nc").tas
tas.attrs.update(
cell_methods="time: mean within days", standard_name="air_temperature"
)
# never_reached=None (the default) yields NaN; other values map to a day-of-year
out = atmos.degree_days_exceedance_date(
tas=tas,
thresh="4 degC",
op=">",
sum_thresh="1000 K days",
after_date="07-01",
never_reached=never_reached,
freq="YS",
).squeeze("time")
np.testing.assert_array_equal(out, np.array([exp, 242, 222, 223]))


class TestWarmSpellDurationIndex:
def test_warm_spell_duration_index(self, open_dataset):
tasmax = open_dataset("ERA5/daily_surface_cancities_1990-1993.nc").tasmax
7 changes: 7 additions & 0 deletions xclim/core/calendar.py
@@ -39,6 +39,7 @@
"datetime_to_decimal_year",
"days_in_year",
"days_since_to_doy",
"doy_from_string",
"doy_to_days_since",
"ensure_cftime_array",
"get_calendar",
@@ -88,6 +89,12 @@ def days_in_year(year: int, calendar: str = "default") -> int:
)


def doy_from_string(doy: DayOfYearStr, year: int, calendar: str) -> int:
"""Return the day-of-year corresponding to a "MM-DD" string for a given year and calendar."""
MM, DD = doy.split("-")
return datetime_classes[calendar](year, int(MM), int(DD)).timetuple().tm_yday


def date_range(
*args, calendar: str = "default", **kwargs
) -> pd.DatetimeIndex | CFTimeIndex:
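
A quick sketch of the new helper in isolation, assuming calendar names accepted by `xclim.core.calendar.datetime_classes` (e.g. "standard", "360_day"); the expected values follow from month-length arithmetic, and 335 matches the "12-01" case in the new temperature test:

from xclim.core.calendar import doy_from_string

doy_from_string("12-01", 2017, "standard")  # 335 -- 2017 is not a leap year
doy_from_string("12-01", 2017, "360_day")   # 331 -- eleven 30-day months + 1
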
10 changes: 5 additions & 5 deletions xclim/core/utils.py
@@ -139,9 +139,9 @@ def decorator(func):
def wrapper(*args, **kwargs):
msg = (
f"`{func.__name__}` is deprecated"
f"{' from version {}'.format(from_version) if from_version else ''} "
f"{f' from version {from_version}' if from_version else ''} "
"and will be removed in a future version of xclim"
f"{'. Use `{}` instead'.format(suggested) if suggested else ''}. "
f"{f'. Use `{suggested}` instead' if suggested else ''}. "
"Please update your scripts accordingly."
)
warnings.warn(
@@ -683,6 +683,9 @@ def infer_kind_from_parameter(param) -> InputKind:
if annot == {"Quantified"}:
return InputKind.QUANTIFIED

if "DayOfYearStr" in annot:
return InputKind.DAY_OF_YEAR

if annot.issubset({"int", "float"}):
return InputKind.NUMBER

@@ -692,9 +695,6 @@ def infer_kind_from_parameter(param) -> InputKind:
if annot == {"str"}:
return InputKind.STRING

if annot == {"DayOfYearStr"}:
return InputKind.DAY_OF_YEAR

if annot == {"DateStr"}:
return InputKind.DATE

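The reordering above matters for union annotations such as `never_reached: DayOfYearStr | int | None`: the old equality test `annot == {"DayOfYearStr"}` never matches a union, and the numeric subset test would otherwise be consulted first. A standalone sketch of the dispatch order, with an assumed annotation-name set (the real function derives `annot` from the parameter's type annotation):

# Assumed name set for a parameter annotated `DayOfYearStr | int` (None stripped).
annot = {"DayOfYearStr", "int"}

if "DayOfYearStr" in annot:             # membership test, so unions are caught
    kind = "DAY_OF_YEAR"
elif annot.issubset({"int", "float"}):  # plain numeric parameters
    kind = "NUMBER"
else:
    kind = "OTHER"

print(kind)  # DAY_OF_YEAR
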
24 changes: 23 additions & 1 deletion xclim/ensembles/_base.py
@@ -127,7 +127,7 @@ def create_ensemble(


def ensemble_mean_std_max_min(
ens: xr.Dataset, weights: xr.DataArray | None = None
ens: xr.Dataset, min_members: int | None = 1, weights: xr.DataArray | None = None
) -> xr.Dataset:
"""Calculate ensemble statistics between a results from an ensemble of climate simulations.

@@ -138,6 +138,10 @@ def ensemble_mean_std_max_min(
----------
ens : xr.Dataset
Ensemble dataset (see xclim.ensembles.create_ensemble).
min_members : int, optional
The minimum number of valid ensemble members for a statistic to be valid.
Passing None is equivalent to setting min_members to the size of the realization dimension.
The default (1) essentially skips this check.
weights : xr.DataArray, optional
Weights to apply along the 'realization' dimension. This array cannot contain missing values.

@@ -158,6 +162,8 @@ def ensemble_mean_std_max_min(
# Calculate ensemble statistics:
ens_mean_std = ensemble_mean_std_max_min(ens)
"""
if min_members is None:
min_members = ens.realization.size
ds_out = xr.Dataset(attrs=ens.attrs)
for v in ens.data_vars:
if weights is None:
@@ -170,9 +176,14 @@ def ensemble_mean_std_max_min(
ds_out[f"{v}_max"] = ens[v].max(dim="realization")
ds_out[f"{v}_min"] = ens[v].min(dim="realization")

if min_members != 1:
enough = ens[v].notnull().sum("realization") >= min_members

# Re-add attributes
for stat in ["mean", "stdev", "max", "min"]:
vv = f"{v}_{stat}"
if min_members != 1:
ds_out[vv] = ds_out[vv].where(enough)
ds_out[vv].attrs = ens[v].attrs
if "description" in ds_out[vv].attrs.keys():
vv.split()
@@ -182,6 +193,7 @@ def ensemble_mean_std_max_min(
+ vv.split("_")[-1]
+ " of ensemble"
)

ds_out.attrs["history"] = update_history(
f"Computation of statistics on {ens.realization.size} ensemble members.", ds_out
)
@@ -192,6 +204,7 @@ def ensemble_percentiles(
ens: xr.Dataset | xr.DataArray,
values: Sequence[int] | None = None,
keep_chunk_size: bool | None = None,
min_members: int | None = 1,
weights: xr.DataArray | None = None,
split: bool = True,
) -> xr.DataArray | xr.Dataset:
@@ -211,6 +224,10 @@
so that the chunks keep the same size (approximately).
If False, no shrinking is performed, resulting in much larger chunks.
If not defined, the function decides which is best.
min_members : int, optional
The minimum number of valid ensemble members for a statistic to be valid.
Passing None is equivalent to setting min_members to the size of the realization dimension.
The default (1) essentially skips this check.
weights : xr.DataArray, optional
Weights to apply along the 'realization' dimension. This array cannot contain missing values.
When given, the function uses xarray's quantile method which is slower than xclim's NaN-optimized algorithm.
@@ -244,6 +261,8 @@
"""
if values is None:
values = [10, 50, 90]
if min_members is None:
min_members = ens.realization.size

if isinstance(ens, xr.Dataset):
out = xr.merge(
@@ -253,6 +272,7 @@
values,
keep_chunk_size=keep_chunk_size,
split=split,
min_members=min_members,
weights=weights,
)
for da in ens.data_vars.values()
@@ -313,6 +333,8 @@ def ensemble_percentiles(
.rename({"quantile": "percentiles"})
)

if min_members != 1:
out = out.where(ens.notnull().sum("realization") >= min_members)
out = out.assign_coords(
percentiles=xr.DataArray(list(values), dims=("percentiles",))
)
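
A short usage sketch for `min_members` on the two public entry points, assuming `ens` is an ensemble Dataset with a "realization" dimension (e.g. from `create_ensemble`):

from xclim import ensembles

# Percentiles are masked wherever fewer than 3 members are valid at a given point.
pcts = ensembles.ensemble_percentiles(ens, values=[10, 50, 90], min_members=3)

# min_members=None requires every member to be valid (the realization size),
# while the default of 1 keeps the previous behaviour.
stats = ensembles.ensemble_mean_std_max_min(ens, min_members=None)
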
24 changes: 20 additions & 4 deletions xclim/indices/_threshold.py
@@ -6,7 +6,7 @@
import numpy as np
import xarray

from xclim.core.calendar import get_calendar, select_time
from xclim.core.calendar import doy_from_string, get_calendar, select_time
from xclim.core.missing import at_least_n_valid
from xclim.core.units import (
convert_units_to,
@@ -2966,6 +2966,7 @@ def degree_days_exceedance_date(
sum_thresh: Quantified = "25 K days",
op: str = ">",
after_date: DayOfYearStr | None = None,
never_reached: DayOfYearStr | int | None = None,
freq: str = "YS",
) -> xarray.DataArray:
r"""Degree-days exceedance date.
@@ -2986,7 +2987,12 @@
equivalent to '<', they are computed as `thresh - tas`.
after_date: str, optional
Date at which to start the cumulative sum.
In "mm-dd" format, defaults to the start of the sampling period.
In "MM-DD" format, defaults to the start of the sampling period.
never_reached: int, str, optional
What to do when `sum_thresh` is never exceeded.
If an int, the value to assign as a day-of-year.
If a string in "MM-DD" format, the day-of-year of that date is assigned.
The default (None) assigns NaN.
freq : str
Resampling frequency. If `after_date` is given, `freq` should be annual.

@@ -3030,12 +3036,22 @@ def _exceedance_date(grp):
strt_idx.size == 0
): # The date is not within the group. Happens at boundaries.
return xarray.full_like(grp.isel(time=0), np.nan, float).drop_vars("time") # type: ignore
cumsum = grp.where(grp.time >= grp.time[strt_idx][0]).cumsum("time")

return rl.first_run_after_date(
grp.where(grp.time >= grp.time[strt_idx][0]).cumsum("time") > sum_thresh,
out = rl.first_run_after_date(
cumsum > sum_thresh,
window=1,
date=None,
)
if never_reached is None:
# This is slightly faster in numpy and generates fewer tasks in dask
return out
never_reached_val = (
doy_from_string(never_reached, grp.time.dt.year[0], grp.time.dt.calendar)
if isinstance(never_reached, str)
else never_reached
)
return xarray.where((cumsum <= sum_thresh).all("time"), never_reached_val, out)

out = c.clip(0).resample(time=freq).map(_exceedance_date)
out.attrs.update(units="", is_dayofyear=np.int32(1), calendar=get_calendar(tas))
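
A usage sketch mirroring the new test above; `tas` is assumed to be a daily mean air temperature DataArray with CF attributes:

from xclim import atmos

out = atmos.degree_days_exceedance_date(
    tas=tas,
    thresh="4 degC",
    op=">",
    sum_thresh="1000 K days",
    after_date="07-01",
    never_reached="12-01",  # years never reaching the sum get day-of-year 335
    freq="YS",
)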