Skip to content

Commit

Permalink
Support additional dtypes in resample (pydata#9413)
Browse files Browse the repository at this point in the history
* Support additional dtypes to resample

pandas.BaseOffset, pandas.Timedelta, datetime.timedelta, and BaseCFTimeOffset are now all supported datatypes for resampling.

* Update whats-new

* Fix types

* Add unit test

* Fix test

* Support more dtypes for CFTimeIndex resampling

* Tidy resample type hints

* Fix some mypy bugs

* Fixes

* Fix tests

* WIP

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Update doc/whats-new.rst

* Apply suggestions from code review

Co-authored-by: Spencer Clark <spencerkclark@gmail.com>

* Fix mypy error

* Fix bad edit

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Deepak Cherian <dcherian@users.noreply.github.com>
Co-authored-by: Spencer Clark <spencerkclark@gmail.com>
  • Loading branch information
4 people authored and hollymandel committed Sep 23, 2024
1 parent 4981dc4 commit eda216d
Show file tree
Hide file tree
Showing 9 changed files with 168 additions and 33 deletions.
4 changes: 4 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,10 @@ Bug fixes
- Fix deprecation warning that was raised when calling ``np.array`` on an ``xr.DataArray``
in NumPy 2.0 (:issue:`9312`, :pull:`9393`)
By `Andrew Scherer <https://github.com/andrew-s28>`_.
- Fix support for using ``pandas.DateOffset``, ``pandas.Timedelta``, and
``datetime.timedelta`` objects as ``resample`` frequencies
(:issue:`9408`, :pull:`9413`).
By `Oliver Higgs <https://github.com/oliverhiggs>`_.

Performance
~~~~~~~~~~~
Expand Down
42 changes: 39 additions & 3 deletions xarray/coding/cftime_offsets.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@
from collections.abc import Mapping
from datetime import datetime, timedelta
from functools import partial
from typing import TYPE_CHECKING, ClassVar, Literal
from typing import TYPE_CHECKING, ClassVar, Literal, TypeVar

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -80,6 +80,7 @@


DayOption: TypeAlias = Literal["start", "end"]
T_FreqStr = TypeVar("T_FreqStr", str, None)


def _nanosecond_precision_timestamp(*args, **kwargs):
Expand Down Expand Up @@ -772,11 +773,18 @@ def _emit_freq_deprecation_warning(deprecated_freq):
emit_user_level_warning(message, FutureWarning)


def to_offset(freq: BaseCFTimeOffset | str, warn: bool = True) -> BaseCFTimeOffset:
def to_offset(
freq: BaseCFTimeOffset | str | timedelta | pd.Timedelta | pd.DateOffset,
warn: bool = True,
) -> BaseCFTimeOffset:
"""Convert a frequency string to the appropriate subclass of
BaseCFTimeOffset."""
if isinstance(freq, BaseCFTimeOffset):
return freq
if isinstance(freq, timedelta | pd.Timedelta):
return delta_to_tick(freq)
if isinstance(freq, pd.DateOffset):
freq = _legacy_to_new_freq(freq.freqstr)

match = re.match(_PATTERN, freq)
if match is None:
Expand All @@ -791,6 +799,34 @@ def to_offset(freq: BaseCFTimeOffset | str, warn: bool = True) -> BaseCFTimeOffs
return _FREQUENCIES[freq](n=multiples)


def delta_to_tick(delta: timedelta | pd.Timedelta) -> Tick:
    """Convert a fixed-length time delta into an equivalent ``Tick`` offset.

    Adapted from the ``delta_to_tick`` helper in pandas.

    Parameters
    ----------
    delta : datetime.timedelta or pandas.Timedelta
        Duration to convert.

    Returns
    -------
    Tick
        A ``Day``, ``Hour``, ``Minute``, ``Second``, ``Millisecond`` or
        ``Microsecond`` offset whose multiple ``n`` covers the whole of
        ``delta`` (days, seconds and microseconds combined).

    Raises
    ------
    ValueError
        If ``delta`` is a ``pandas.Timedelta`` with a non-zero nanoseconds
        component.
    """
    if isinstance(delta, pd.Timedelta) and delta.nanoseconds != 0:
        # pandas.Timedelta has nanoseconds, but these are not supported
        raise ValueError(
            "Unable to convert 'pandas.Timedelta' object with non-zero "
            "nanoseconds to 'CFTimeOffset' object"
        )

    # ``days`` may be negative while ``seconds`` and ``microseconds`` are
    # normalized to be non-negative, so this sum is the exact signed duration.
    total_seconds = delta.days * 86400 + delta.seconds

    if delta.microseconds != 0:
        # Sub-second precision: the result is always a Millisecond or a
        # Microsecond offset, but its multiple must include the days and
        # seconds as well, otherwise e.g. 1.5 s would collapse to 500 ms.
        total_microseconds = total_seconds * 1_000_000 + delta.microseconds
        if delta.microseconds % 1_000 == 0:
            return Millisecond(n=total_microseconds // 1_000)
        return Microsecond(n=total_microseconds)

    if delta.seconds == 0:
        return Day(n=delta.days)
    if total_seconds % 3600 == 0:
        return Hour(n=total_seconds // 3600)
    if total_seconds % 60 == 0:
        return Minute(n=total_seconds // 60)
    return Second(n=total_seconds)


def to_cftime_datetime(date_str_or_date, calendar=None):
if cftime is None:
raise ModuleNotFoundError("No module named 'cftime'")
Expand Down Expand Up @@ -1332,7 +1368,7 @@ def _new_to_legacy_freq(freq):
return freq


def _legacy_to_new_freq(freq):
def _legacy_to_new_freq(freq: T_FreqStr) -> T_FreqStr:
# to avoid internal deprecation warnings when freq is determined using pandas < 2.2

# TODO: remove once requiring pandas >= 2.2
Expand Down
16 changes: 10 additions & 6 deletions xarray/core/common.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import annotations

import datetime
import warnings
from collections.abc import Callable, Hashable, Iterable, Iterator, Mapping
from contextlib import suppress
Expand All @@ -13,6 +14,7 @@
from xarray.core import dtypes, duck_array_ops, formatting, formatting_html, ops
from xarray.core.indexing import BasicIndexer, ExplicitlyIndexed
from xarray.core.options import OPTIONS, _get_keep_attrs
from xarray.core.types import ResampleCompatible
from xarray.core.utils import (
Frozen,
either_dict_or_kwargs,
Expand All @@ -32,8 +34,6 @@


if TYPE_CHECKING:
import datetime

from numpy.typing import DTypeLike

from xarray.core.dataarray import DataArray
Expand Down Expand Up @@ -891,14 +891,14 @@ def rolling_exp(
def _resample(
self,
resample_cls: type[T_Resample],
indexer: Mapping[Hashable, str | Resampler] | None,
indexer: Mapping[Hashable, ResampleCompatible | Resampler] | None,
skipna: bool | None,
closed: SideOptions | None,
label: SideOptions | None,
offset: pd.Timedelta | datetime.timedelta | str | None,
origin: str | DatetimeLike,
restore_coord_dims: bool | None,
**indexer_kwargs: str | Resampler,
**indexer_kwargs: ResampleCompatible | Resampler,
) -> T_Resample:
"""Returns a Resample object for performing resampling operations.
Expand Down Expand Up @@ -1078,14 +1078,18 @@ def _resample(
)

grouper: Resampler
if isinstance(freq, str):
if isinstance(freq, ResampleCompatible):
grouper = TimeResampler(
freq=freq, closed=closed, label=label, origin=origin, offset=offset
)
elif isinstance(freq, Resampler):
grouper = freq
else:
raise ValueError("freq must be a str or a Resampler object")
raise ValueError(
"freq must be an object of type 'str', 'datetime.timedelta', "
"'pandas.Timedelta', 'pandas.DateOffset', or 'TimeResampler'. "
f"Received {type(freq)} instead."
)

rgrouper = ResolvedGrouper(grouper, group, self)

Expand Down
9 changes: 5 additions & 4 deletions xarray/core/dataarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@
QueryEngineOptions,
QueryParserOptions,
ReindexMethodOptions,
ResampleCompatible,
Self,
SideOptions,
T_ChunkDimFreq,
Expand Down Expand Up @@ -7269,15 +7270,15 @@ def coarsen(
@_deprecate_positional_args("v2024.07.0")
def resample(
self,
indexer: Mapping[Hashable, str | Resampler] | None = None,
indexer: Mapping[Hashable, ResampleCompatible | Resampler] | None = None,
*,
skipna: bool | None = None,
closed: SideOptions | None = None,
label: SideOptions | None = None,
offset: pd.Timedelta | datetime.timedelta | str | None = None,
origin: str | DatetimeLike = "start_day",
restore_coord_dims: bool | None = None,
**indexer_kwargs: str | Resampler,
**indexer_kwargs: ResampleCompatible | Resampler,
) -> DataArrayResample:
"""Returns a Resample object for performing resampling operations.
Expand All @@ -7288,7 +7289,7 @@ def resample(
Parameters
----------
indexer : Mapping of Hashable to str, optional
indexer : Mapping of Hashable to str, datetime.timedelta, pd.Timedelta, pd.DateOffset, or Resampler, optional
Mapping from the dimension name to resample frequency [1]_. The
dimension must be datetime-like.
skipna : bool, optional
Expand All @@ -7312,7 +7313,7 @@ def resample(
restore_coord_dims : bool, optional
If True, also restore the dimension order of multi-dimensional
coordinates.
**indexer_kwargs : str
**indexer_kwargs : str, datetime.timedelta, pd.Timedelta, pd.DateOffset, or Resampler
The keyword arguments form of ``indexer``.
One of indexer or indexer_kwargs must be provided.
Expand Down
9 changes: 5 additions & 4 deletions xarray/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,7 @@
QueryEngineOptions,
QueryParserOptions,
ReindexMethodOptions,
ResampleCompatible,
SideOptions,
T_ChunkDimFreq,
T_DatasetPadConstantValues,
Expand Down Expand Up @@ -10710,15 +10711,15 @@ def coarsen(
@_deprecate_positional_args("v2024.07.0")
def resample(
self,
indexer: Mapping[Any, str | Resampler] | None = None,
indexer: Mapping[Any, ResampleCompatible | Resampler] | None = None,
*,
skipna: bool | None = None,
closed: SideOptions | None = None,
label: SideOptions | None = None,
offset: pd.Timedelta | datetime.timedelta | str | None = None,
origin: str | DatetimeLike = "start_day",
restore_coord_dims: bool | None = None,
**indexer_kwargs: str | Resampler,
**indexer_kwargs: ResampleCompatible | Resampler,
) -> DatasetResample:
"""Returns a Resample object for performing resampling operations.
Expand All @@ -10729,7 +10730,7 @@ def resample(
Parameters
----------
indexer : Mapping of Hashable to str, optional
indexer : Mapping of Hashable to str, datetime.timedelta, pd.Timedelta, pd.DateOffset, or Resampler, optional
Mapping from the dimension name to resample frequency [1]_. The
dimension must be datetime-like.
skipna : bool, optional
Expand All @@ -10753,7 +10754,7 @@ def resample(
restore_coord_dims : bool, optional
If True, also restore the dimension order of multi-dimensional
coordinates.
**indexer_kwargs : str
**indexer_kwargs : str, datetime.timedelta, pd.Timedelta, pd.DateOffset, or Resampler
The keyword arguments form of ``indexer``.
One of indexer or indexer_kwargs must be provided.
Expand Down
4 changes: 2 additions & 2 deletions xarray/core/resample_cftime.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@
from xarray.core.types import SideOptions

if typing.TYPE_CHECKING:
from xarray.core.types import CFTimeDatetime
from xarray.core.types import CFTimeDatetime, ResampleCompatible


class CFTimeGrouper:
Expand All @@ -75,7 +75,7 @@ class CFTimeGrouper:

def __init__(
self,
freq: str | BaseCFTimeOffset,
freq: ResampleCompatible | BaseCFTimeOffset,
closed: SideOptions | None = None,
label: SideOptions | None = None,
origin: str | CFTimeDatetime = "start_day",
Expand Down
2 changes: 2 additions & 0 deletions xarray/core/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -320,3 +320,5 @@ def copy(
Bins = Union[
int, Sequence[int], Sequence[float], Sequence[pd.Timestamp], np.ndarray, pd.Index
]

# Frequency specifications that ``resample`` accepts in place of a
# ``Resampler`` object: a frequency string or a timedelta/offset object.
ResampleCompatible: TypeAlias = str | datetime.timedelta | pd.Timedelta | pd.DateOffset
20 changes: 16 additions & 4 deletions xarray/groupers.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,20 @@
import numpy as np
import pandas as pd

from xarray.coding.cftime_offsets import _new_to_legacy_freq
from xarray.coding.cftime_offsets import BaseCFTimeOffset, _new_to_legacy_freq
from xarray.core import duck_array_ops
from xarray.core.coordinates import Coordinates
from xarray.core.dataarray import DataArray
from xarray.core.groupby import T_Group, _DummyGroup
from xarray.core.indexes import safe_cast_to_index
from xarray.core.resample_cftime import CFTimeGrouper
from xarray.core.types import Bins, DatetimeLike, GroupIndices, SideOptions
from xarray.core.types import (
Bins,
DatetimeLike,
GroupIndices,
ResampleCompatible,
SideOptions,
)
from xarray.core.variable import Variable

__all__ = [
Expand Down Expand Up @@ -336,7 +342,7 @@ class TimeResampler(Resampler):
Attributes
----------
freq : str
freq : str, datetime.timedelta, pandas.Timedelta, or pandas.DateOffset
Frequency to resample to. See `Pandas frequency
aliases <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`_
for a list of possible values.
Expand All @@ -358,7 +364,7 @@ class TimeResampler(Resampler):
An offset timedelta added to the origin.
"""

freq: str
freq: ResampleCompatible
closed: SideOptions | None = field(default=None)
label: SideOptions | None = field(default=None)
origin: str | DatetimeLike = field(default="start_day")
Expand Down Expand Up @@ -388,6 +394,12 @@ def _init_properties(self, group: T_Group) -> None:
offset=offset,
)
else:
if isinstance(self.freq, BaseCFTimeOffset):
raise ValueError(
"'BaseCFTimeOffset' resample frequencies are only supported "
"when resampling a 'CFTimeIndex'"
)

self.index_grouper = pd.Grouper(
# TODO remove once requiring pandas >= 2.2
freq=_new_to_legacy_freq(self.freq),
Expand Down
Loading

0 comments on commit eda216d

Please sign in to comment.