diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index ea7e0f88ff81e..e379f3126fa77 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -789,6 +789,7 @@ Groupby/resample/rolling - :class:`core.window.ewm.ExponentialMovingWindow` now raises a ``NotImplementedError`` when specifying ``times`` with ``adjust=False`` due to an incorrect calculation (:issue:`40098`) - Bug in :meth:`Series.asfreq` and :meth:`DataFrame.asfreq` dropping rows when the index is not sorted (:issue:`39805`) - Bug in aggregation functions for :class:`DataFrame` not respecting ``numeric_only`` argument when ``level`` keyword was given (:issue:`40660`) +- Bug in :meth:`SeriesGroupBy.aggregate` where using a user-defined function to aggregate a ``Series`` with an object-typed :class:`Index` causes an incorrect :class:`Index` shape (:issue:`40014`) - Bug in :class:`core.window.RollingGroupby` where ``as_index=False`` argument in ``groupby`` was ignored (:issue:`39433`) - Bug in :meth:`.GroupBy.any` and :meth:`.GroupBy.all` raising ``ValueError`` when using with nullable type columns holding ``NA`` even with ``skipna=True`` (:issue:`40585`) diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index 9acff1cac305c..ec0678cd87f7e 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -56,7 +56,7 @@ cdef class _BaseGrouper: cdef inline _update_cached_objs(self, object cached_typ, object cached_ityp, Slider islider, Slider vslider): if cached_typ is None: - cached_ityp = self.ityp(islider.buf) + cached_ityp = self.ityp(islider.buf, dtype=self.idtype) cached_typ = self.typ( vslider.buf, dtype=vslider.buf.dtype, index=cached_ityp, name=self.name ) @@ -70,7 +70,6 @@ cdef class _BaseGrouper: cached_typ._mgr.set_values(vslider.buf) object.__setattr__(cached_typ, '_index', cached_ityp) object.__setattr__(cached_typ, 'name', self.name) - return cached_typ, cached_ityp cdef inline object _apply_to_group(self, @@ -106,7 
+105,7 @@ cdef class SeriesBinGrouper(_BaseGrouper): cdef public: ndarray arr, index, dummy_arr, dummy_index - object values, f, bins, typ, ityp, name + object values, f, bins, typ, ityp, name, idtype def __init__(self, object series, object f, object bins): @@ -122,6 +121,7 @@ cdef class SeriesBinGrouper(_BaseGrouper): self.arr = values self.typ = series._constructor self.ityp = series.index._constructor + self.idtype = series.index.dtype self.index = series.index.values self.name = series.name @@ -199,7 +199,7 @@ cdef class SeriesGrouper(_BaseGrouper): cdef public: ndarray arr, index, dummy_arr, dummy_index - object f, labels, values, typ, ityp, name + object f, labels, values, typ, ityp, name, idtype def __init__(self, object series, object f, ndarray[intp_t] labels, Py_ssize_t ngroups): @@ -218,6 +218,7 @@ cdef class SeriesGrouper(_BaseGrouper): self.arr = values self.typ = series._constructor self.ityp = series.index._constructor + self.idtype = series.index.dtype self.index = series.index.values self.name = series.name diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index fc0b4d86e81bf..28344897a686f 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -1221,3 +1221,18 @@ def test_aggregate_numeric_object_dtype(): {"key": ["A", "B"], "col1": ["a", "c"], "col2": [0, 2]} ).set_index("key") tm.assert_frame_equal(result, expected) + + +def test_groupby_index_object_dtype(): + # GH 40014 + df = DataFrame({"c0": ["x", "x", "x"], "c1": ["x", "x", "y"], "p": [0, 1, 2]}) + df.index = df.index.astype("O") + grouped = df.groupby(["c0", "c1"]) + res = grouped.p.agg(lambda x: all(x > 0)) + # Check that providing a user-defined function in agg() + # produces the correct index shape when using an object-typed index. 
+ expected_index = MultiIndex.from_tuples( + [("x", "x"), ("x", "y")], names=("c0", "c1") + ) + expected = Series([False, True], index=expected_index, name="p") + tm.assert_series_equal(res, expected) diff --git a/pandas/tests/groupby/test_bin_groupby.py b/pandas/tests/groupby/test_bin_groupby.py index 5fcf4a73479a5..13fddad30eeba 100644 --- a/pandas/tests/groupby/test_bin_groupby.py +++ b/pandas/tests/groupby/test_bin_groupby.py @@ -27,6 +27,22 @@ def test_series_grouper(): tm.assert_almost_equal(counts, exp_counts) +def test_series_grouper_result_length_difference(): + # GH 40014 + obj = Series(np.random.randn(10), dtype="float64") + obj.index = obj.index.astype("O") + labels = np.array([-1, -1, -1, 0, 0, 0, 1, 1, 1, 1], dtype=np.intp) + + grouper = libreduction.SeriesGrouper(obj, lambda x: all(x > 0), labels, 2) + result, counts = grouper.get_result() + + expected = np.array([all(obj[3:6] > 0), all(obj[6:] > 0)]) + tm.assert_equal(result, expected) + + exp_counts = np.array([3, 4], dtype=np.int64) + tm.assert_equal(counts, exp_counts) + + def test_series_grouper_requires_nonempty_raises(): # GH#29500 obj = Series(np.random.randn(10))