diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 5adc8540e6864..14f33cc2c8535 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -895,6 +895,8 @@ Groupby/resample/rolling - Bug in :meth:`SeriesGroupBy.agg` failing to retain ordered :class:`CategoricalDtype` on order-preserving aggregations (:issue:`41147`) - Bug in :meth:`DataFrameGroupBy.min` and :meth:`DataFrameGroupBy.max` with multiple object-dtype columns and ``numeric_only=False`` incorrectly raising ``ValueError`` (:issue:`41111`) - Bug in :meth:`DataFrameGroupBy.rank` with the GroupBy object's ``axis=0`` and the ``rank`` method's keyword ``axis=1`` (:issue:`41320`) +- Bug in :meth:`DataFrameGroupBy.__getitem__` with non-unique columns incorrectly returning a malformed :class:`SeriesGroupBy` instead of :class:`DataFrameGroupBy` (:issue:`41427`) +- Bug in :meth:`DataFrameGroupBy.transform` with non-unique columns incorrectly raising ``AttributeError`` (:issue:`41427`) Reshaping ^^^^^^^^^ diff --git a/pandas/core/base.py b/pandas/core/base.py index e45c4bf514973..28f19a6beeec0 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -214,7 +214,7 @@ def ndim(self) -> int: @cache_readonly def _obj_with_exclusions(self): if self._selection is not None and isinstance(self.obj, ABCDataFrame): - return self.obj.reindex(columns=self._selection_list) + return self.obj[self._selection_list] if len(self.exclusions) > 0: return self.obj.drop(self.exclusions, axis=1) @@ -239,7 +239,9 @@ def __getitem__(self, key): else: if key not in self.obj: raise KeyError(f"Column not found: {key}") - return self._gotitem(key, ndim=1) + subset = self.obj[key] + ndim = subset.ndim + return self._gotitem(key, ndim=ndim, subset=subset) def _gotitem(self, key, ndim: int, subset=None): """ diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index c5d9144893f48..2997deb41c78b 100644 --- a/pandas/core/groupby/generic.py +++ 
b/pandas/core/groupby/generic.py @@ -1417,12 +1417,19 @@ def _choose_path(self, fast_path: Callable, slow_path: Callable, group: DataFram return path, res def _transform_item_by_item(self, obj: DataFrame, wrapper) -> DataFrame: - # iterate through columns + # iterate through columns, see test_transform_exclude_nuisance output = {} inds = [] for i, col in enumerate(obj): + subset = obj.iloc[:, i] + sgb = SeriesGroupBy( + subset, + selection=col, + grouper=self.grouper, + exclusions=self.exclusions, + ) try: - output[col] = self[col].transform(wrapper) + output[i] = sgb.transform(wrapper) except TypeError: # e.g. trying to call nanmean with string values pass @@ -1434,7 +1441,9 @@ def _transform_item_by_item(self, obj: DataFrame, wrapper) -> DataFrame: columns = obj.columns.take(inds) - return self.obj._constructor(output, index=obj.index, columns=columns) + result = self.obj._constructor(output, index=obj.index) + result.columns = columns + return result def filter(self, func, dropna=True, *args, **kwargs): """ @@ -1504,7 +1513,7 @@ def filter(self, func, dropna=True, *args, **kwargs): return self._apply_filter(indices, dropna) - def __getitem__(self, key): + def __getitem__(self, key) -> DataFrameGroupBy | SeriesGroupBy: if self.axis == 1: # GH 37725 raise ValueError("Cannot subset columns when using axis=1") diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index b22e4749bfdfc..09317cbeec658 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -20,6 +20,10 @@ date_range, ) import pandas._testing as tm +from pandas.core.groupby.generic import ( + DataFrameGroupBy, + SeriesGroupBy, +) from pandas.core.groupby.groupby import DataError @@ -391,13 +395,31 @@ def test_transform_select_columns(df): tm.assert_frame_equal(result, expected) -def test_transform_exclude_nuisance(df): +@pytest.mark.parametrize("duplicates", [True, False]) +def 
test_transform_exclude_nuisance(df, duplicates): + # case that goes through _transform_item_by_item + + if duplicates: + # make sure we work with duplicate columns GH#41427 + df.columns = ["A", "C", "C", "D"] # this also tests orderings in transform between # series/frame to make sure it's consistent expected = {} grouped = df.groupby("A") - expected["C"] = grouped["C"].transform(np.mean) + + gbc = grouped["C"] + expected["C"] = gbc.transform(np.mean) + if duplicates: + # squeeze 1-column DataFrame down to Series + expected["C"] = expected["C"]["C"] + + assert isinstance(gbc.obj, DataFrame) + assert isinstance(gbc, DataFrameGroupBy) + else: + assert isinstance(gbc, SeriesGroupBy) + assert isinstance(gbc.obj, Series) + expected["D"] = grouped["D"].transform(np.mean) expected = DataFrame(expected) result = df.groupby("A").transform(np.mean)