diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py index 79a33c437ea5c..5a36cff7908f0 100644 --- a/asv_bench/benchmarks/rolling.py +++ b/asv_bench/benchmarks/rolling.py @@ -225,6 +225,20 @@ def time_rolling_offset(self, method): getattr(self.groupby_roll_offset, method)() +class GroupbyLargeGroups: + # https://github.com/pandas-dev/pandas/issues/38038 + # specific example where the rolling operation on a larger dataframe + # is relatively cheap (few but large groups), but creation of + # MultiIndex of result can be expensive + + def setup(self): + N = 100000 + self.df = pd.DataFrame({"A": [1, 2] * int(N / 2), "B": np.random.randn(N)}) + + def time_rolling_multiindex_creation(self): + self.df.groupby("A").rolling(3).mean() + + class GroupbyEWM: params = ["cython", "numba"] diff --git a/doc/source/whatsnew/v1.1.5.rst b/doc/source/whatsnew/v1.1.5.rst index d0a935bfb4e32..53e7db492d8bb 100644 --- a/doc/source/whatsnew/v1.1.5.rst +++ b/doc/source/whatsnew/v1.1.5.rst @@ -24,6 +24,7 @@ Fixed regressions - Fixed regression in ``df.groupby(..).rolling(..)`` with the resulting :class:`MultiIndex` when grouping by a label that is in the index (:issue:`37641`) - Fixed regression in :meth:`DataFrame.fillna` not filling ``NaN`` after other operations such as :meth:`DataFrame.pivot` (:issue:`36495`). - Fixed performance regression for :meth:`DataFrame.__setitem__` with list-like indexers (:issue:`37954`) +- Fixed performance regression in ``df.groupby(..).rolling(..)`` (:issue:`38038`) - Fixed regression in :meth:`MultiIndex.intersection` returning duplicates when at least one of the indexes had duplicates (:issue:`36915`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 51a1e2102c273..e6185f8ae0679 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -50,7 +50,6 @@ from pandas.core.aggregation import aggregate from pandas.core.base import DataError, SelectionMixin -import pandas.core.common as com from pandas.core.construction import extract_array from pandas.core.groupby.base import GotItemMixin, ShallowMixin from pandas.core.indexes.api import Index, MultiIndex @@ -791,22 +790,29 @@ def _apply( # Our result will have still kept the column in the result result = result.drop(columns=column_keys, errors="ignore") - result_index_data = [] - for key, values in self._groupby.grouper.indices.items(): - for value in values: - data = [ - *com.maybe_make_list(key), - *com.maybe_make_list( - grouped_object_index[value] - if grouped_object_index is not None - else [] - ), - ] - result_index_data.append(tuple(data)) - - result_index = MultiIndex.from_tuples( - result_index_data, names=result_index_names + codes = self._groupby.grouper.codes + levels = self._groupby.grouper.levels + + group_indices = self._groupby.grouper.indices.values() + if group_indices: + indexer = np.concatenate(list(group_indices)) + else: + indexer = np.array([], dtype=np.intp) + codes = [c.take(indexer) for c in codes] + + # if the index of the original dataframe needs to be preserved, append + # this index (but reordered) to the codes/levels from the groupby + if grouped_object_index is not None: + idx = grouped_object_index.take(indexer) + if not isinstance(idx, MultiIndex): + idx = MultiIndex.from_arrays([idx]) + codes.extend(list(idx.codes)) + levels.extend(list(idx.levels)) + + result_index = MultiIndex( + levels, codes, names=result_index_names, verify_integrity=False ) + result.index = result_index return result diff --git a/pandas/tests/window/test_groupby.py b/pandas/tests/window/test_groupby.py index f9b5a5fe9a3c1..b89fb35ac3a70 100644 --- a/pandas/tests/window/test_groupby.py +++ b/pandas/tests/window/test_groupby.py @@ -1,7 +1,15 @@ import numpy as np import pytest -from pandas import DataFrame, MultiIndex, Series, Timestamp, date_range, to_datetime +from pandas import ( + DataFrame, + Index, + MultiIndex, + Series, + Timestamp, + date_range, + to_datetime, +) import pandas._testing as tm from pandas.api.indexers import BaseIndexer from pandas.core.groupby.groupby import get_groupby @@ -418,12 +426,23 @@ def test_groupby_rolling_empty_frame(self): # GH 36197 expected = DataFrame({"s1": []}) result = expected.groupby("s1").rolling(window=1).sum() - expected.index = MultiIndex.from_tuples([], names=["s1", None]) + # GH-38057 from_tuples gives empty object dtype, we now get float/int levels + # expected.index = MultiIndex.from_tuples([], names=["s1", None]) + expected.index = MultiIndex.from_product( + [Index([], dtype="float64"), Index([], dtype="int64")], names=["s1", None] + ) tm.assert_frame_equal(result, expected) expected = DataFrame({"s1": [], "s2": []}) result = expected.groupby(["s1", "s2"]).rolling(window=1).sum() - expected.index = MultiIndex.from_tuples([], names=["s1", "s2", None]) + expected.index = MultiIndex.from_product( + [ + Index([], dtype="float64"), + Index([], dtype="float64"), + Index([], dtype="int64"), + ], + names=["s1", "s2", None], + ) tm.assert_frame_equal(result, expected) def test_groupby_rolling_string_index(self): @@ -567,6 +586,60 @@ def test_groupby_rolling_index_level_and_column_label(self): ) tm.assert_frame_equal(result, expected) + def test_groupby_rolling_resulting_multiindex(self): + # a few different cases checking the created MultiIndex of the result + # https://github.com/pandas-dev/pandas/pull/38057 + + # grouping by 1 columns -> 2-level MI as result + df = DataFrame({"a": np.arange(8.0), "b": [1, 2] * 4}) + result = df.groupby("b").rolling(3).mean() + expected_index = MultiIndex.from_tuples( + [(1, 0), (1, 2), (1, 4), (1, 6), (2, 1), (2, 3), (2, 5), (2, 7)], + names=["b", None], + ) + tm.assert_index_equal(result.index, expected_index) + + # grouping by 2 columns -> 3-level MI as result + df = DataFrame({"a": np.arange(12.0), "b": [1, 2] * 6, "c": [1, 2, 3, 4] * 3}) + result = df.groupby(["b", "c"]).rolling(2).sum() + expected_index = MultiIndex.from_tuples( + [ + (1, 1, 0), + (1, 1, 4), + (1, 1, 8), + (1, 3, 2), + (1, 3, 6), + (1, 3, 10), + (2, 2, 1), + (2, 2, 5), + (2, 2, 9), + (2, 4, 3), + (2, 4, 7), + (2, 4, 11), + ], + names=["b", "c", None], + ) + tm.assert_index_equal(result.index, expected_index) + + # grouping with 1 level on dataframe with 2-level MI -> 3-level MI as result + df = DataFrame({"a": np.arange(8.0), "b": [1, 2] * 4, "c": [1, 2, 3, 4] * 2}) + df = df.set_index("c", append=True) + result = df.groupby("b").rolling(3).mean() + expected_index = MultiIndex.from_tuples( + [ + (1, 0, 1), + (1, 2, 3), + (1, 4, 1), + (1, 6, 3), + (2, 1, 2), + (2, 3, 4), + (2, 5, 2), + (2, 7, 4), + ], + names=["b", None, "c"], + ) + tm.assert_index_equal(result.index, expected_index) + class TestExpanding: def setup_method(self): diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 1658cca347786..10b23cadfe279 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -1085,8 +1085,15 @@ def test_groupby_rolling_nan_included(): result = df.groupby("group", dropna=False).rolling(1, min_periods=1).mean() expected = DataFrame( {"B": [0.0, 2.0, 3.0, 1.0, 4.0]}, - index=MultiIndex.from_tuples( - [("g1", 0), ("g1", 2), ("g2", 3), (np.nan, 1), (np.nan, 4)], + # GH-38057 from_tuples puts the NaNs in the codes, result expects them + # to be in the levels, at the moment + # index=MultiIndex.from_tuples( + # [("g1", 0), ("g1", 2), ("g2", 3), (np.nan, 1), (np.nan, 4)], + # names=["group", None], + # ), + index=MultiIndex( + [["g1", "g2", np.nan], [0, 1, 2, 3, 4]], + [[0, 0, 1, 2, 2], [0, 2, 3, 1, 4]], names=["group", None], ), )