diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 00553edca3259..17a830788be3f 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -902,6 +902,10 @@ Groupby/resample/rolling - Bug in :meth:`Series.groupby` would raise ``ValueError`` when grouping by :class:`PeriodIndex` level (:issue:`34010`) - Bug in :meth:`GroupBy.agg`, :meth:`GroupBy.transform`, and :meth:`GroupBy.resample` where subclasses are not preserved (:issue:`28330`) - Bug in :meth:`GroupBy.rolling.apply` ignores args and kwargs parameters (:issue:`33433`) +- Bug in :meth:`core.groupby.DataFrameGroupBy.apply` where the output index shape for functions returning a DataFrame which is equally indexed + to the input DataFrame is inconsistent. An internal heuristic to detect index mutation would behave differently for equal but not identical + indices. In particular, the result index shape might change if a copy of the input would be returned. + The behaviour now is consistent, independent of internal heuristics. (:issue:`31612`, :issue:`14927`, :issue:`13056`) Reshaping ^^^^^^^^^ diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index 0988cd7ff0dde..18422c2f86129 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -502,7 +502,7 @@ def apply_frame_axis0(object frame, object f, object names, # Need to infer if low level index slider will cause segfaults require_slow_apply = i == 0 and piece is chunk try: - if piece.index is not chunk.index: + if not piece.index.equals(chunk.index): mutated = True except AttributeError: # `piece` might not have an index, could be e.g. an int diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index e2b5118922a5a..bc8067212d60e 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -190,6 +190,46 @@ def f_constant_df(group): assert names == group_names +def test_apply_fast_slow_identical(): + # GH 31613 + + df = DataFrame({"A": [0, 0, 1], "b": range(3)}) + + # For simple index structures we check for fast/slow apply using + # an identity check on in/output + def slow(group): + return group + + def fast(group): + return group.copy() + + fast_df = df.groupby("A").apply(fast) + slow_df = df.groupby("A").apply(slow) + + tm.assert_frame_equal(fast_df, slow_df) + + +@pytest.mark.parametrize( + "func", + [ + lambda x: x, + lambda x: x[:], + lambda x: x.copy(deep=False), + lambda x: x.copy(deep=True), + ], +) +def test_groupby_apply_identity_maybecopy_index_identical(func): + # GH 14927 + # Whether the function returns a copy of the input data or not should not + # have an impact on the index structure of the result since this is not + # transparent to the user + + df = pd.DataFrame({"g": [1, 2, 2, 2], "a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) + + result = df.groupby("g").apply(func) + tm.assert_frame_equal(result, df) + + def test_apply_with_mixed_dtype(): # GH3480, apply with mixed dtype on axis=1 breaks in 0.11 df = DataFrame(