From 121e2dfb9cd667157ae15654265a9a8976f34a31 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 31 Jul 2021 12:09:59 -0700 Subject: [PATCH 1/3] DEPR: dropping nuisance columns in rolling methods --- pandas/core/window/rolling.py | 11 +++++++++ pandas/tests/window/test_api.py | 3 ++- pandas/tests/window/test_ewm.py | 5 +++-- pandas/tests/window/test_groupby.py | 34 +++++++++++++++------------- pandas/tests/window/test_numba.py | 35 ++++++++++++++++++++--------- 5 files changed, 60 insertions(+), 28 deletions(-) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 8a253726ab0b6..fc03604603c36 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -32,6 +32,7 @@ from pandas.compat._optional import import_optional_dependency from pandas.compat.numpy import function as nv from pandas.util._decorators import doc +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import ( ensure_float64, @@ -436,6 +437,16 @@ def hfunc2d(values: ArrayLike) -> ArrayLike: new_mgr = mgr.apply_2d(hfunc2d, ignore_failures=True) else: new_mgr = mgr.apply(hfunc, ignore_failures=True) + + if 0 != len(new_mgr.items) != len(mgr.items): + # ignore_failures dropped nuisance columns + warnings.warn( + "Dropping of nuisance columns in rolling operations " + "is deprecated; in a future version this will raise TypeError. 
" + "Select only valid columns before calling the operation.", + FutureWarning, + stacklevel=find_stack_level(), + ) out = obj._constructor(new_mgr) return self._resolve_output(out, obj) diff --git a/pandas/tests/window/test_api.py b/pandas/tests/window/test_api.py index e70d079739003..f39f5792decc3 100644 --- a/pandas/tests/window/test_api.py +++ b/pandas/tests/window/test_api.py @@ -68,7 +68,8 @@ def tests_skip_nuisance(): def test_skip_sum_object_raises(): df = DataFrame({"A": range(5), "B": range(5, 10), "C": "foo"}) r = df.rolling(window=3) - result = r.sum() + with tm.assert_produces_warning(FutureWarning, match="nuisance columns"): + result = r.sum() expected = DataFrame( {"A": [np.nan, np.nan, 3, 6, 9], "B": [np.nan, np.nan, 18, 21, 24]}, columns=list("AB"), diff --git a/pandas/tests/window/test_ewm.py b/pandas/tests/window/test_ewm.py index 8da902ea830d1..e36d2f24a2f15 100644 --- a/pandas/tests/window/test_ewm.py +++ b/pandas/tests/window/test_ewm.py @@ -116,8 +116,9 @@ def test_ewma_with_times_equal_spacing(halflife_with_times, times, min_periods): data = np.arange(10.0) data[::2] = np.nan df = DataFrame({"A": data, "time_col": date_range("2000", freq="D", periods=10)}) - result = df.ewm(halflife=halflife, min_periods=min_periods, times=times).mean() - expected = df.ewm(halflife=1.0, min_periods=min_periods).mean() + with tm.assert_produces_warning(FutureWarning, match="nuisance columns"): + result = df.ewm(halflife=halflife, min_periods=min_periods, times=times).mean() + expected = df.ewm(halflife=1.0, min_periods=min_periods).mean() tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/window/test_groupby.py b/pandas/tests/window/test_groupby.py index 03b43026c9a6c..61031b8d0b931 100644 --- a/pandas/tests/window/test_groupby.py +++ b/pandas/tests/window/test_groupby.py @@ -923,7 +923,11 @@ def test_methods(self, method, expected_data): ) tm.assert_frame_equal(result, expected) - expected = df.groupby("A").apply(lambda x: 
getattr(x.ewm(com=1.0), method)()) + with tm.assert_produces_warning(FutureWarning, match="nuisance"): + expected = df.groupby("A").apply( + lambda x: getattr(x.ewm(com=1.0), method)() + ) + # There may be a bug in the above statement; not returning the correct index tm.assert_frame_equal(result.reset_index(drop=True), expected) @@ -955,7 +959,8 @@ def test_pairwise_methods(self, method, expected_data): def test_times(self, times_frame): # GH 40951 halflife = "23 days" - result = times_frame.groupby("A").ewm(halflife=halflife, times="C").mean() + with tm.assert_produces_warning(FutureWarning, match="nuisance"): + result = times_frame.groupby("A").ewm(halflife=halflife, times="C").mean() expected = DataFrame( { "B": [ @@ -992,22 +997,21 @@ def test_times(self, times_frame): def test_times_vs_apply(self, times_frame): # GH 40951 halflife = "23 days" - result = times_frame.groupby("A").ewm(halflife=halflife, times="C").mean() - expected = ( - times_frame.groupby("A") - .apply(lambda x: x.ewm(halflife=halflife, times="C").mean()) - .iloc[[0, 3, 6, 9, 1, 4, 7, 2, 5, 8]] - .reset_index(drop=True) - ) + with tm.assert_produces_warning(FutureWarning, match="nuisance"): + result = times_frame.groupby("A").ewm(halflife=halflife, times="C").mean() + expected = ( + times_frame.groupby("A") + .apply(lambda x: x.ewm(halflife=halflife, times="C").mean()) + .iloc[[0, 3, 6, 9, 1, 4, 7, 2, 5, 8]] + .reset_index(drop=True) + ) tm.assert_frame_equal(result.reset_index(drop=True), expected) def test_times_array(self, times_frame): # GH 40951 halflife = "23 days" - result = times_frame.groupby("A").ewm(halflife=halflife, times="C").mean() - expected = ( - times_frame.groupby("A") - .ewm(halflife=halflife, times=times_frame["C"].values) - .mean() - ) + gb = times_frame.groupby("A") + with tm.assert_produces_warning(FutureWarning, match="nuisance"): + result = gb.ewm(halflife=halflife, times="C").mean() + expected = gb.ewm(halflife=halflife, times=times_frame["C"].values).mean() 
tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/window/test_numba.py b/pandas/tests/window/test_numba.py index a8ec9086e6b02..258312235e2c4 100644 --- a/pandas/tests/window/test_numba.py +++ b/pandas/tests/window/test_numba.py @@ -170,26 +170,38 @@ def test_invalid_engine_kwargs(self, grouper): engine="cython", engine_kwargs={"nopython": True} ) - @pytest.mark.parametrize( - "grouper", [lambda x: x, lambda x: x.groupby("A")], ids=["None", "groupby"] - ) + @pytest.mark.parametrize("grouper", ["None", "groupby"]) def test_cython_vs_numba( self, grouper, nogil, parallel, nopython, ignore_na, adjust ): + if grouper == "None": + grouper = lambda x: x + warn = FutureWarning + else: + grouper = lambda x: x.groupby("A") + warn = None + df = DataFrame({"A": ["a", "b", "a", "b"], "B": range(4)}) ewm = grouper(df).ewm(com=1.0, adjust=adjust, ignore_na=ignore_na) engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython} - result = ewm.mean(engine="numba", engine_kwargs=engine_kwargs) - expected = ewm.mean(engine="cython") + with tm.assert_produces_warning(warn, match="nuisance"): + result = ewm.mean(engine="numba", engine_kwargs=engine_kwargs) + expected = ewm.mean(engine="cython") tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize( - "grouper", [lambda x: x, lambda x: x.groupby("A")], ids=["None", "groupby"] - ) + @pytest.mark.parametrize("grouper", ["None", "groupby"]) def test_cython_vs_numba_times(self, grouper, nogil, parallel, nopython, ignore_na): # GH 40951 + + if grouper == "None": + grouper = lambda x: x + warn = FutureWarning + else: + grouper = lambda x: x.groupby("A") + warn = None + halflife = "23 days" times = to_datetime( [ @@ -207,8 +219,11 @@ def test_cython_vs_numba_times(self, grouper, nogil, parallel, nopython, ignore_ ) engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython} - result = ewm.mean(engine="numba", engine_kwargs=engine_kwargs) - expected = ewm.mean(engine="cython") + + # 
TODO: why only in these cases? + with tm.assert_produces_warning(warn, match="nuisance"): + result = ewm.mean(engine="numba", engine_kwargs=engine_kwargs) + expected = ewm.mean(engine="cython") tm.assert_frame_equal(result, expected) From 0c8f3329ff5a932d9927d5656abd60143f99918e Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 31 Jul 2021 12:12:22 -0700 Subject: [PATCH 2/3] whatsnew --- doc/source/whatsnew/v1.4.0.rst | 1 + pandas/core/window/rolling.py | 2 +- pandas/tests/window/test_api.py | 1 + pandas/tests/window/test_ewm.py | 1 + pandas/tests/window/test_groupby.py | 4 ++++ pandas/tests/window/test_numba.py | 2 ++ 6 files changed, 10 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index ce0158b05c2ab..52ee4d65045e2 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -159,6 +159,7 @@ Deprecations - Deprecated treating ``numpy.datetime64`` objects as UTC times when passed to the :class:`Timestamp` constructor along with a timezone. In a future version, these will be treated as wall-times. To retain the old behavior, use ``Timestamp(dt64).tz_localize("UTC").tz_convert(tz)`` (:issue:`24559`) - Deprecated ignoring missing labels when indexing with a sequence of labels on a level of a MultiIndex (:issue:`42351`) - Creating an empty Series without a dtype will now raise a more visible ``FutureWarning`` instead of a ``DeprecationWarning`` (:issue:`30017`) +- Deprecated dropping of nuisance columns in :class:`Rolling`, :class:`Expanding`, and :class:`ExponentialMovingWindow` aggregations (:issue:`42738`) .. 
--------------------------------------------------------------------------- diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index fc03604603c36..104e09b2a178b 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -439,7 +439,7 @@ def hfunc2d(values: ArrayLike) -> ArrayLike: new_mgr = mgr.apply(hfunc, ignore_failures=True) if 0 != len(new_mgr.items) != len(mgr.items): - # ignore_failures dropped nuisance columns + # GH#42738 ignore_failures dropped nuisance columns warnings.warn( "Dropping of nuisance columns in rolling operations " "is deprecated; in a future version this will raise TypeError. " diff --git a/pandas/tests/window/test_api.py b/pandas/tests/window/test_api.py index f39f5792decc3..eadd72d936678 100644 --- a/pandas/tests/window/test_api.py +++ b/pandas/tests/window/test_api.py @@ -69,6 +69,7 @@ def test_skip_sum_object_raises(): df = DataFrame({"A": range(5), "B": range(5, 10), "C": "foo"}) r = df.rolling(window=3) with tm.assert_produces_warning(FutureWarning, match="nuisance columns"): + # GH#42738 result = r.sum() expected = DataFrame( {"A": [np.nan, np.nan, 3, 6, 9], "B": [np.nan, np.nan, 18, 21, 24]}, diff --git a/pandas/tests/window/test_ewm.py b/pandas/tests/window/test_ewm.py index e36d2f24a2f15..011f44a674014 100644 --- a/pandas/tests/window/test_ewm.py +++ b/pandas/tests/window/test_ewm.py @@ -117,6 +117,7 @@ def test_ewma_with_times_equal_spacing(halflife_with_times, times, min_periods): data[::2] = np.nan df = DataFrame({"A": data, "time_col": date_range("2000", freq="D", periods=10)}) with tm.assert_produces_warning(FutureWarning, match="nuisance columns"): + # GH#42738 result = df.ewm(halflife=halflife, min_periods=min_periods, times=times).mean() expected = df.ewm(halflife=1.0, min_periods=min_periods).mean() tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/window/test_groupby.py b/pandas/tests/window/test_groupby.py index 61031b8d0b931..2523ec585a491 100644 --- 
a/pandas/tests/window/test_groupby.py +++ b/pandas/tests/window/test_groupby.py @@ -924,6 +924,7 @@ def test_methods(self, method, expected_data): tm.assert_frame_equal(result, expected) with tm.assert_produces_warning(FutureWarning, match="nuisance"): + # GH#42738 expected = df.groupby("A").apply( lambda x: getattr(x.ewm(com=1.0), method)() ) @@ -960,6 +961,7 @@ def test_times(self, times_frame): # GH 40951 halflife = "23 days" with tm.assert_produces_warning(FutureWarning, match="nuisance"): + # GH#42738 result = times_frame.groupby("A").ewm(halflife=halflife, times="C").mean() expected = DataFrame( { @@ -998,6 +1000,7 @@ def test_times_vs_apply(self, times_frame): # GH 40951 halflife = "23 days" with tm.assert_produces_warning(FutureWarning, match="nuisance"): + # GH#42738 result = times_frame.groupby("A").ewm(halflife=halflife, times="C").mean() expected = ( times_frame.groupby("A") @@ -1012,6 +1015,7 @@ def test_times_array(self, times_frame): halflife = "23 days" gb = times_frame.groupby("A") with tm.assert_produces_warning(FutureWarning, match="nuisance"): + # GH#42738 result = gb.ewm(halflife=halflife, times="C").mean() expected = gb.ewm(halflife=halflife, times=times_frame["C"].values).mean() tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/window/test_numba.py b/pandas/tests/window/test_numba.py index 258312235e2c4..586ca7ee259f8 100644 --- a/pandas/tests/window/test_numba.py +++ b/pandas/tests/window/test_numba.py @@ -186,6 +186,7 @@ def test_cython_vs_numba( engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython} with tm.assert_produces_warning(warn, match="nuisance"): + # GH#42738 result = ewm.mean(engine="numba", engine_kwargs=engine_kwargs) expected = ewm.mean(engine="cython") @@ -222,6 +223,7 @@ def test_cython_vs_numba_times(self, grouper, nogil, parallel, nopython, ignore_ # TODO: why only in these cases? 
with tm.assert_produces_warning(warn, match="nuisance"): + # GH#42738 result = ewm.mean(engine="numba", engine_kwargs=engine_kwargs) expected = ewm.mean(engine="cython") From 083d21bb77bb2ff497bb5dbc9a743d722d9dba55 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 8 Aug 2021 21:03:15 -0700 Subject: [PATCH 3/3] add dropped columns to warning message --- pandas/core/window/rolling.py | 4 +++- pandas/tests/window/test_api.py | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 47367ba11f104..d4c0eb946505d 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -440,10 +440,12 @@ def hfunc2d(values: ArrayLike) -> ArrayLike: if 0 != len(new_mgr.items) != len(mgr.items): # GH#42738 ignore_failures dropped nuisance columns + dropped = mgr.items.difference(new_mgr.items) warnings.warn( "Dropping of nuisance columns in rolling operations " "is deprecated; in a future version this will raise TypeError. " - "Select only valid columns before calling the operation.", + "Select only valid columns before calling the operation. " + f"Dropped columns were {dropped}", FutureWarning, stacklevel=find_stack_level(), ) diff --git a/pandas/tests/window/test_api.py b/pandas/tests/window/test_api.py index eadd72d936678..7a5fcebfd23d7 100644 --- a/pandas/tests/window/test_api.py +++ b/pandas/tests/window/test_api.py @@ -68,7 +68,8 @@ def tests_skip_nuisance(): def test_skip_sum_object_raises(): df = DataFrame({"A": range(5), "B": range(5, 10), "C": "foo"}) r = df.rolling(window=3) - with tm.assert_produces_warning(FutureWarning, match="nuisance columns"): + msg = r"nuisance columns.*Dropped columns were Index\(\['C'\], dtype='object'\)" + with tm.assert_produces_warning(FutureWarning, match=msg): # GH#42738 result = r.sum() expected = DataFrame(