From ba0f8c668f41c1e6f5b17275d588917444a80615 Mon Sep 17 00:00:00 2001 From: richard Date: Mon, 30 May 2022 14:07:48 -0400 Subject: [PATCH 1/2] DEPR: numeric_only default in resampler ops --- doc/source/whatsnew/v1.5.0.rst | 7 +- pandas/compat/numpy/function.py | 18 --- pandas/core/resample.py | 125 ++++++++++++------- pandas/tests/resample/test_datetime_index.py | 15 --- pandas/tests/resample/test_resample_api.py | 44 +++++-- 5 files changed, 122 insertions(+), 87 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index eb08034bb92eb..279a2df8b728d 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -605,7 +605,7 @@ In the case where ``df.columns`` is not unique, use :meth:`DataFrame.isetitem`: ``numeric_only`` default value ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Across the DataFrame and DataFrameGroupBy operations such as +Across the :class:`DataFrame`, :class:`.DataFrameGroupBy`, and :class:`.Resampler` operations such as ``min``, ``sum``, and ``idxmax``, the default value of the ``numeric_only`` argument, if it exists at all, was inconsistent. Furthermore, operations with the default value ``None`` can lead to surprising @@ -644,6 +644,11 @@ gained the ``numeric_only`` argument. - :meth:`.GroupBy.std` - :meth:`.GroupBy.sem` - :meth:`.DataFrameGroupBy.quantile` +- :meth:`.Resampler.mean` +- :meth:`.Resampler.median` +- :meth:`.Resampler.sem` +- :meth:`.Resampler.std` +- :meth:`.Resampler.var` .. 
_whatsnew_150.deprecations.other: diff --git a/pandas/compat/numpy/function.py b/pandas/compat/numpy/function.py index e3aa5bb52f2ba..b9e6a071a48f6 100644 --- a/pandas/compat/numpy/function.py +++ b/pandas/compat/numpy/function.py @@ -371,24 +371,6 @@ def validate_groupby_func(name, args, kwargs, allowed=None) -> None: ) -RESAMPLER_NUMPY_OPS = ("min", "max", "sum", "prod", "mean", "std", "var") - - -def validate_resampler_func(method: str, args, kwargs) -> None: - """ - 'args' and 'kwargs' should be empty because all of their necessary - parameters are explicitly listed in the function signature - """ - if len(args) + len(kwargs) > 0: - if method in RESAMPLER_NUMPY_OPS: - raise UnsupportedFunctionCall( - "numpy operations are not valid with resample. " - f"Use .resample(...).{method}() instead" - ) - else: - raise TypeError("too many arguments passed in") - - def validate_minmax_axis(axis: int | None, ndim: int = 1) -> None: """ Ensure that the axis argument passed to min, max, argmin, or argmax is zero diff --git a/pandas/core/resample.py b/pandas/core/resample.py index dcd9aceaf8474..89327a575411b 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -33,7 +33,6 @@ TimestampConvertibleTypes, npt, ) -from pandas.compat.numpy import function as nv from pandas.errors import ( AbstractMethodError, DataError, @@ -393,7 +392,7 @@ def transform(self, arg, *args, **kwargs): """ return self._selected_obj.groupby(self.groupby).transform(arg, *args, **kwargs) - def _downsample(self, f): + def _downsample(self, f, **kwargs): raise AbstractMethodError(self) def _upsample(self, f, limit=None, fill_value=None): @@ -937,7 +936,7 @@ def asfreq(self, fill_value=None): """ return self._upsample("asfreq", fill_value=fill_value) - def std(self, ddof=1, *args, **kwargs): + def std(self, ddof=1, numeric_only: bool = False): """ Compute standard deviation of groups, excluding missing values. 
@@ -945,17 +944,19 @@ def std(self, ddof=1, *args, **kwargs): ---------- ddof : int, default 1 Degrees of freedom. + numeric_only : bool, default False + Include only `float`, `int` or `boolean` data. + + .. versionadded:: 1.5.0 Returns ------- DataFrame or Series Standard deviation of values within each group. """ - nv.validate_resampler_func("std", args, kwargs) - # error: Unexpected keyword argument "ddof" for "_downsample" - return self._downsample("std", ddof=ddof) # type: ignore[call-arg] + return self._downsample("std", ddof=ddof, numeric_only=numeric_only) - def var(self, ddof=1, *args, **kwargs): + def var(self, ddof=1, numeric_only: bool = False): """ Compute variance of groups, excluding missing values. @@ -964,14 +965,17 @@ def var(self, ddof=1, *args, **kwargs): ddof : int, default 1 Degrees of freedom. + numeric_only : bool, default False + Include only `float`, `int` or `boolean` data. + + .. versionadded:: 1.5.0 + Returns ------- DataFrame or Series Variance of values within each group. """ - nv.validate_resampler_func("var", args, kwargs) - # error: Unexpected keyword argument "ddof" for "_downsample" - return self._downsample("var", ddof=ddof) # type: ignore[call-arg] + return self._downsample("var", ddof=ddof, numeric_only=numeric_only) @doc(GroupBy.size) def size(self): @@ -1027,53 +1031,84 @@ def quantile(self, q=0.5, **kwargs): Return a DataFrame, where the coulmns are groupby columns, and the values are its quantiles. """ - # error: Unexpected keyword argument "q" for "_downsample" - # error: Too many arguments for "_downsample" - return self._downsample("quantile", q=q, **kwargs) # type: ignore[call-arg] + return self._downsample("quantile", q=q, **kwargs) -# downsample methods -for method in ["sum", "prod", "min", "max", "first", "last"]: +def _add_downsample_kernel( + name: str, args: tuple[str, ...], docs_class: type = GroupBy +) -> None: + """ + Add a kernel to Resampler. + + Arguments + --------- + name : str + Name of the kernel. 
+ args : tuple + Arguments of the method. + docs_class : type + Class to get kernel docstring from. + """ + assert args in ( + ("numeric_only", "min_count"), + ("numeric_only",), + ("ddof", "numeric_only"), + (), + ) - def f( - self, - _method: str = method, - numeric_only: bool | lib.NoDefault = lib.no_default, - min_count: int = 0, - *args, - **kwargs, - ): - if numeric_only is lib.no_default: - if _method != "sum": + # Explicitly provide args rather than args/kwargs for API docs + if args == ("numeric_only", "min_count"): + + def f( + self, + numeric_only: bool | lib.NoDefault = lib.no_default, + min_count: int = 0, + ): + if numeric_only is lib.no_default and name != "sum": # For DataFrameGroupBy, set it to be False for methods other than `sum`. numeric_only = False - nv.validate_resampler_func(_method, args, kwargs) - return self._downsample(_method, numeric_only=numeric_only, min_count=min_count) - - f.__doc__ = getattr(GroupBy, method).__doc__ - setattr(Resampler, method, f) - + return self._downsample( + name, numeric_only=numeric_only, min_count=min_count + ) -# downsample methods -for method in ["mean", "sem", "median", "ohlc"]: + elif args == ("numeric_only",): + # error: All conditional function variants must have identical signatures + def f( # type: ignore[misc] + self, numeric_only: bool | lib.NoDefault = lib.no_default + ): + return self._downsample(name, numeric_only=numeric_only) + + elif args == ("ddof", "numeric_only"): + # error: All conditional function variants must have identical signatures + def f( # type: ignore[misc] + self, + ddof: int = 1, + numeric_only: bool | lib.NoDefault = lib.no_default, + ): + return self._downsample(name, ddof=ddof, numeric_only=numeric_only) - def g(self, _method=method, *args, **kwargs): - nv.validate_resampler_func(_method, args, kwargs) - return self._downsample(_method) + else: + # error: All conditional function variants must have identical signatures + def f( # type: ignore[misc] + self, + ): + return 
self._downsample(name) - g.__doc__ = getattr(GroupBy, method).__doc__ - setattr(Resampler, method, g) + f.__doc__ = getattr(docs_class, name).__doc__ + setattr(Resampler, name, f) -# series only methods +for method in ["sum", "prod", "min", "max", "first", "last"]: + _add_downsample_kernel(method, ("numeric_only", "min_count")) +for method in ["mean", "median"]: + _add_downsample_kernel(method, ("numeric_only",)) +for method in ["sem"]: + _add_downsample_kernel(method, ("ddof", "numeric_only")) +for method in ["ohlc"]: + _add_downsample_kernel(method, ()) for method in ["nunique"]: - - def h(self, _method=method): - return self._downsample(_method) - - h.__doc__ = getattr(SeriesGroupBy, method).__doc__ - setattr(Resampler, method, h) + _add_downsample_kernel(method, (), SeriesGroupBy) class _GroupByMixin(PandasObject): diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index fbc3b385e5098..190e92e59d0e4 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -9,7 +9,6 @@ from pandas._libs import lib from pandas._typing import DatetimeNaTType -from pandas.errors import UnsupportedFunctionCall import pandas as pd from pandas import ( @@ -226,20 +225,6 @@ def _ohlc(group): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("func", ["min", "max", "sum", "prod", "mean", "var", "std"]) -def test_numpy_compat(func): - # see gh-12811 - s = Series([1, 2, 3, 4, 5], index=date_range("20130101", periods=5, freq="s")) - r = s.resample("2s") - - msg = "numpy operations are not valid with resample" - - with pytest.raises(UnsupportedFunctionCall, match=msg): - getattr(r, func)(func, 1, 2, 3) - with pytest.raises(UnsupportedFunctionCall, match=msg): - getattr(r, func)(axis=1) - - def test_resample_how_callables(): # GH#7929 data = np.arange(5, dtype=np.int64) diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py 
index 43050c0338671..5e10b9ee5277c 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -814,6 +814,7 @@ def test_end_and_end_day_origin( @pytest.mark.parametrize( + # expected_data is a string when op raises a ValueError or TypeError "method, numeric_only, expected_data", [ ("sum", True, {"num": [25]}), @@ -834,6 +835,21 @@ def test_end_and_end_day_origin( ("last", True, {"num": [20]}), ("last", False, {"cat": ["cat_2"], "num": [20]}), ("last", lib.no_default, {"cat": ["cat_2"], "num": [20]}), + ("mean", True, {"num": [12.5]}), + ("mean", False, {"num": [12.5]}), + ("mean", lib.no_default, {"num": [12.5]}), + ("median", True, {"num": [12.5]}), + ("median", False, {"num": [12.5]}), + ("median", lib.no_default, {"num": [12.5]}), + ("std", True, {"num": [10.606601717798213]}), + ("std", False, "could not convert string to float"), + ("std", lib.no_default, {"num": [10.606601717798213]}), + ("var", True, {"num": [112.5]}), + ("var", False, "could not convert string to float"), + ("var", lib.no_default, {"num": [112.5]}), + ("sem", True, {"num": [7.5]}), + ("sem", False, "could not convert string to float"), + ("sem", lib.no_default, {"num": [7.5]}), ], ) def test_frame_downsample_method(method, numeric_only, expected_data): @@ -845,20 +861,32 @@ def test_frame_downsample_method(method, numeric_only, expected_data): resampled = df.resample("Y") func = getattr(resampled, method) - if method == "prod" and numeric_only is not True: + if numeric_only is lib.no_default and method not in ( + "min", + "max", + "first", + "last", + "prod", + ): warn = FutureWarning - msg = "Dropping invalid columns in DataFrameGroupBy.prod is deprecated" - elif method == "sum" and numeric_only is lib.no_default: + msg = ( + f"default value of numeric_only in DataFrameGroupBy.{method} is deprecated" + ) + elif method in ("prod", "mean", "median") and numeric_only is not True: warn = FutureWarning - msg = "The default value of numeric_only in 
DataFrameGroupBy.sum is deprecated" + msg = f"Dropping invalid columns in DataFrameGroupBy.{method} is deprecated" else: warn = None msg = "" with tm.assert_produces_warning(warn, match=msg): - result = func(numeric_only=numeric_only) - - expected = DataFrame(expected_data, index=expected_index) - tm.assert_frame_equal(result, expected) + if isinstance(expected_data, str): + klass = TypeError if method == "var" else ValueError + with pytest.raises(klass, match=expected_data): + _ = func(numeric_only=numeric_only) + else: + result = func(numeric_only=numeric_only) + expected = DataFrame(expected_data, index=expected_index) + tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( From 73bac86da7b17c83446abe8021ac22a9ea9b5081 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 4 Jun 2022 11:15:53 -0400 Subject: [PATCH 2/2] Revert removal of args/kwargs --- pandas/compat/numpy/function.py | 18 ++++++++++++++++++ pandas/core/resample.py | 19 ++++++++++++++++--- pandas/tests/resample/test_datetime_index.py | 15 +++++++++++++++ 3 files changed, 49 insertions(+), 3 deletions(-) diff --git a/pandas/compat/numpy/function.py b/pandas/compat/numpy/function.py index b9e6a071a48f6..e3aa5bb52f2ba 100644 --- a/pandas/compat/numpy/function.py +++ b/pandas/compat/numpy/function.py @@ -371,6 +371,24 @@ def validate_groupby_func(name, args, kwargs, allowed=None) -> None: ) +RESAMPLER_NUMPY_OPS = ("min", "max", "sum", "prod", "mean", "std", "var") + + +def validate_resampler_func(method: str, args, kwargs) -> None: + """ + 'args' and 'kwargs' should be empty because all of their necessary + parameters are explicitly listed in the function signature + """ + if len(args) + len(kwargs) > 0: + if method in RESAMPLER_NUMPY_OPS: + raise UnsupportedFunctionCall( + "numpy operations are not valid with resample. 
" + f"Use .resample(...).{method}() instead" + ) + else: + raise TypeError("too many arguments passed in") + + def validate_minmax_axis(axis: int | None, ndim: int = 1) -> None: """ Ensure that the axis argument passed to min, max, argmin, or argmax is zero diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 89327a575411b..0a62861cdaba7 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -33,6 +33,7 @@ TimestampConvertibleTypes, npt, ) +from pandas.compat.numpy import function as nv from pandas.errors import ( AbstractMethodError, DataError, @@ -936,7 +937,7 @@ def asfreq(self, fill_value=None): """ return self._upsample("asfreq", fill_value=fill_value) - def std(self, ddof=1, numeric_only: bool = False): + def std(self, ddof=1, numeric_only: bool = False, *args, **kwargs): """ Compute standard deviation of groups, excluding missing values. @@ -954,9 +955,10 @@ def std(self, ddof=1, numeric_only: bool = False): DataFrame or Series Standard deviation of values within each group. """ + nv.validate_resampler_func("std", args, kwargs) return self._downsample("std", ddof=ddof, numeric_only=numeric_only) - def var(self, ddof=1, numeric_only: bool = False): + def var(self, ddof=1, numeric_only: bool = False, *args, **kwargs): """ Compute variance of groups, excluding missing values. @@ -975,6 +977,7 @@ def var(self, ddof=1, numeric_only: bool = False): DataFrame or Series Variance of values within each group. """ + nv.validate_resampler_func("var", args, kwargs) return self._downsample("var", ddof=ddof, numeric_only=numeric_only) @doc(GroupBy.size) @@ -1063,7 +1066,10 @@ def f( self, numeric_only: bool | lib.NoDefault = lib.no_default, min_count: int = 0, + *args, + **kwargs, ): + nv.validate_resampler_func(name, args, kwargs) if numeric_only is lib.no_default and name != "sum": # For DataFrameGroupBy, set it to be False for methods other than `sum`. 
numeric_only = False @@ -1075,8 +1081,9 @@ def f( elif args == ("numeric_only",): # error: All conditional function variants must have identical signatures def f( # type: ignore[misc] - self, numeric_only: bool | lib.NoDefault = lib.no_default + self, numeric_only: bool | lib.NoDefault = lib.no_default, *args, **kwargs ): + nv.validate_resampler_func(name, args, kwargs) return self._downsample(name, numeric_only=numeric_only) elif args == ("ddof", "numeric_only"): @@ -1085,14 +1092,20 @@ def f( # type: ignore[misc] self, ddof: int = 1, numeric_only: bool | lib.NoDefault = lib.no_default, + *args, + **kwargs, ): + nv.validate_resampler_func(name, args, kwargs) return self._downsample(name, ddof=ddof, numeric_only=numeric_only) else: # error: All conditional function variants must have identical signatures def f( # type: ignore[misc] self, + *args, + **kwargs, ): + nv.validate_resampler_func(name, args, kwargs) return self._downsample(name) f.__doc__ = getattr(docs_class, name).__doc__ diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index 190e92e59d0e4..fbc3b385e5098 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -9,6 +9,7 @@ from pandas._libs import lib from pandas._typing import DatetimeNaTType +from pandas.errors import UnsupportedFunctionCall import pandas as pd from pandas import ( @@ -225,6 +226,20 @@ def _ohlc(group): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize("func", ["min", "max", "sum", "prod", "mean", "var", "std"]) +def test_numpy_compat(func): + # see gh-12811 + s = Series([1, 2, 3, 4, 5], index=date_range("20130101", periods=5, freq="s")) + r = s.resample("2s") + + msg = "numpy operations are not valid with resample" + + with pytest.raises(UnsupportedFunctionCall, match=msg): + getattr(r, func)(func, 1, 2, 3) + with pytest.raises(UnsupportedFunctionCall, match=msg): + getattr(r, func)(axis=1) + + def 
test_resample_how_callables(): # GH#7929 data = np.arange(5, dtype=np.int64)