diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index efb4a572486e3..28c4b89b925dd 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -716,7 +716,9 @@ Groupby/resample/rolling - Bug in :class:`.DataFrameGroupBy` and :class:`.SeriesGroupBy` would not include unobserved categories in result when grouping by categorical indexes (:issue:`49354`) - Bug in :class:`.DataFrameGroupBy` and :class:`.SeriesGroupBy` would change result order depending on the input index when grouping by categoricals (:issue:`49223`) - Bug in :class:`.DataFrameGroupBy` and :class:`.SeriesGroupBy` when grouping on categorical data would sort result values even when used with ``sort=False`` (:issue:`42482`) -- +- Bug in :meth:`.DataFrameGroupBy.apply` and :meth:`.SeriesGroupBy.apply` with ``as_index=False`` would not attempt the computation without using the grouping keys when using them failed with a ``TypeError`` (:issue:`49256`) +- Bug in :meth:`.DataFrameGroupBy.describe` would describe the group keys (:issue:`49256`) +- Bug in :meth:`.SeriesGroupBy.describe` with ``as_index=False`` would have the incorrect shape (:issue:`49256`) Reshaping ^^^^^^^^^ diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 7a225712b63a7..1d8271a845f9a 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1061,8 +1061,7 @@ def _set_group_selection(self) -> None: # This is a no-op for SeriesGroupBy grp = self.grouper if not ( - self.as_index - and grp.groupings is not None + grp.groupings is not None and self.obj.ndim > 1 and self._group_selection is None ): @@ -2640,7 +2639,14 @@ def describe(self, **kwargs): ) if self.axis == 1: return result.T - return result.unstack() + + # GH#49256 - properly handle the grouping column(s) + if self._selected_obj.ndim != 1 or self.as_index: + result = result.unstack() + if not self.as_index: + self._insert_inaxis_grouper_inplace(result) + + return result 
@final def resample(self, rule, *args, **kwargs): diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index eb61f8defeaf8..af523928e0bc8 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -974,15 +974,21 @@ def test_apply_function_index_return(function): def test_apply_function_with_indexing_return_column(): - # GH#7002, GH#41480 + # GH#7002, GH#41480, GH#49256 df = DataFrame( { "foo1": ["one", "two", "two", "three", "one", "two"], "foo2": [1, 2, 4, 4, 5, 6], } ) - with pytest.raises(TypeError, match="Could not convert"): - df.groupby("foo1", as_index=False).apply(lambda x: x.mean()) + result = df.groupby("foo1", as_index=False).apply(lambda x: x.mean()) + expected = DataFrame( + { + "foo1": ["one", "three", "two"], + "foo2": [3.0, 4.0, 4.0], + } + ) + tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index b848ff81f35ee..ba3db6037245c 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -336,13 +336,7 @@ def test_describe(self, df, gb, gni): result = gb.describe() tm.assert_frame_equal(result, expected) - expected = pd.concat( - [ - df[df.A == 1].describe().unstack().to_frame().T, - df[df.A == 3].describe().unstack().to_frame().T, - ] - ) - expected.index = Index([0, 1]) + expected = expected.reset_index() result = gni.describe() tm.assert_frame_equal(result, expected) @@ -1093,6 +1087,38 @@ def test_series_describe_single(): tm.assert_series_equal(result, expected) +@pytest.mark.parametrize("keys", ["key1", ["key1", "key2"]]) +def test_series_describe_as_index(as_index, keys): + # GH#49256 + df = DataFrame( + { + "key1": ["one", "two", "two", "three", "two"], + "key2": ["one", "two", "two", "three", "two"], + "foo2": [1, 2, 4, 4, 6], + } + ) + gb = df.groupby(keys, as_index=as_index)["foo2"] + result = gb.describe() + expected = DataFrame( + 
{ + "key1": ["one", "three", "two"], + "count": [1.0, 1.0, 3.0], + "mean": [1.0, 4.0, 4.0], + "std": [np.nan, np.nan, 2.0], + "min": [1.0, 4.0, 2.0], + "25%": [1.0, 4.0, 3.0], + "50%": [1.0, 4.0, 4.0], + "75%": [1.0, 4.0, 5.0], + "max": [1.0, 4.0, 6.0], + } + ) + if len(keys) == 2: + expected.insert(1, "key2", expected["key1"]) + if as_index: + expected = expected.set_index(keys) + tm.assert_frame_equal(result, expected) + + def test_series_index_name(df): grouped = df.loc[:, ["C"]].groupby(df["A"]) result = grouped.agg(lambda x: x.mean()) @@ -1177,29 +1203,25 @@ def test_frame_describe_unstacked_format(): "pandas.errors.PerformanceWarning" ) @pytest.mark.parametrize("as_index", [True, False]) -def test_describe_with_duplicate_output_column_names(as_index): +@pytest.mark.parametrize("keys", [["a1"], ["a1", "a2"]]) +def test_describe_with_duplicate_output_column_names(as_index, keys): # GH 35314 df = DataFrame( { - "a": [99, 99, 99, 88, 88, 88], + "a1": [99, 99, 99, 88, 88, 88], + "a2": [99, 99, 99, 88, 88, 88], "b": [1, 2, 3, 4, 5, 6], "c": [10, 20, 30, 40, 50, 60], }, - columns=["a", "b", "b"], + columns=["a1", "a2", "b", "b"], copy=False, ) + if keys == ["a1"]: + df = df.drop(columns="a2") expected = ( DataFrame.from_records( [ - ("a", "count", 3.0, 3.0), - ("a", "mean", 88.0, 99.0), - ("a", "std", 0.0, 0.0), - ("a", "min", 88.0, 99.0), - ("a", "25%", 88.0, 99.0), - ("a", "50%", 88.0, 99.0), - ("a", "75%", 88.0, 99.0), - ("a", "max", 88.0, 99.0), ("b", "count", 3.0, 3.0), ("b", "mean", 5.0, 2.0), ("b", "std", 1.0, 1.0), @@ -1222,14 +1244,17 @@ def test_describe_with_duplicate_output_column_names(as_index): .T ) expected.columns.names = [None, None] - expected.index = Index([88, 99], name="a") - - if as_index: - expected = expected.drop(columns=["a"], level=0) + if len(keys) == 2: + expected.index = MultiIndex( + levels=[[88, 99], [88, 99]], codes=[[0, 1], [0, 1]], names=["a1", "a2"] + ) else: - expected = expected.reset_index(drop=True) + expected.index = 
Index([88, 99], name="a1") + + if not as_index: + expected = expected.reset_index() - result = df.groupby("a", as_index=as_index).describe() + result = df.groupby(keys, as_index=as_index).describe() tm.assert_frame_equal(result, expected)