From b5f01b7e817f565164c1118e71c0afe8ba4c8e32 Mon Sep 17 00:00:00 2001 From: Chris Zimmerman Date: Thu, 19 Sep 2019 14:52:27 -0500 Subject: [PATCH 1/7] Added failing test for groupby mutation --- pandas/tests/groupby/test_groupby.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index bec5cbc5fecb8..8b3fad8ba911d 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -433,6 +433,23 @@ def test_frame_groupby_columns(tsframe): assert len(v.columns) == 2 +def test_frame_groupby_avoids_mutate(): + # GH28523 + df = pd.DataFrame({"A": ["foo", "bar", "foo", "bar"], "B": [1, 2, 3, 4]}) + grouped = df.groupby("A") + + expected = grouped.apply(lambda x: x) + + funcs = ["sum", "prod", "min", "max", "last", "first"] + for fn in funcs: + func = getattr(grouped, fn) + func() + + result = grouped.apply(lambda x: x) + + assert tm.assert_frame_equal(expected, result) + + def test_frame_set_name_single(df): grouped = df.groupby("A") From 1dbb12dcc83429477897d219bbdd0816d3752cfe Mon Sep 17 00:00:00 2001 From: Chris Zimmerman Date: Thu, 19 Sep 2019 15:18:26 -0500 Subject: [PATCH 2/7] Switched implementation to use _group_selection_context --- pandas/core/groupby/groupby.py | 66 ++++++++++++++-------------- pandas/tests/groupby/test_groupby.py | 22 +++++++--- 2 files changed, 50 insertions(+), 38 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index e010e615e176e..68ef0cf57faed 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1364,27 +1364,27 @@ def f(self, **kwargs): if "min_count" not in kwargs: kwargs["min_count"] = min_count - self._set_group_selection() + with _group_selection_context(self): - # try a cython aggregation if we can - try: - return self._cython_agg_general(alias, alt=npfunc, **kwargs) - except AssertionError as e: - raise SpecificationError(str(e)) - except Exception: - pass + # try a cython aggregation if we can + try: + return self._cython_agg_general(alias, alt=npfunc, **kwargs) + except AssertionError as e: + raise SpecificationError(str(e)) + except Exception: + pass - # apply a non-cython aggregation - result = self.aggregate(lambda x: npfunc(x, axis=self.axis)) + # apply a non-cython aggregation + result = self.aggregate(lambda x: npfunc(x, axis=self.axis)) - # coerce the resulting columns if we can - if isinstance(result, DataFrame): - for col in result.columns: - result[col] = self._try_cast(result[col], self.obj[col]) - else: - result = self._try_cast(result, self.obj) + # coerce the resulting columns if we can + if isinstance(result, DataFrame): + for col in result.columns: + result[col] = self._try_cast(result[col], self.obj[col]) + else: + result = self._try_cast(result, self.obj) - return result + return result set_function_name(f, name, cls) @@ -1757,28 +1757,30 @@ def nth(self, n: Union[int, List[int]], dropna: Optional[str] = None) -> DataFra nth_values = list(set(n)) nth_array = np.array(nth_values, dtype=np.intp) - self._set_group_selection() - mask_left = np.in1d(self._cumcount_array(), nth_array) - mask_right = np.in1d(self._cumcount_array(ascending=False) + 1, -nth_array) - mask = mask_left | mask_right + with _group_selection_context(self): + mask_left = np.in1d(self._cumcount_array(), nth_array) + mask_right = np.in1d( + self._cumcount_array(ascending=False) + 1, -nth_array + ) + mask = mask_left | mask_right - ids, _, _ = self.grouper.group_info + ids, _, _ = self.grouper.group_info - # Drop NA values in grouping - mask = mask & (ids != -1) + # Drop NA values in grouping + mask = mask & (ids != -1) - out = self._selected_obj[mask] - if not self.as_index: - return out + out = self._selected_obj[mask] + if not self.as_index: + return out - result_index = self.grouper.result_index - out.index = result_index[ids[mask]] + result_index = self.grouper.result_index + out.index = result_index[ids[mask]] - if not self.observed and isinstance(result_index, CategoricalIndex): - out = out.reindex(result_index) + if not self.observed and isinstance(result_index, CategoricalIndex): + out = out.reindex(result_index) - return out.sort_index() if self.sort else out + return out.sort_index() if self.sort else out # dropna is truthy if isinstance(n, valid_containers): diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 8b3fad8ba911d..c9e5b591c7e79 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -433,21 +433,31 @@ def test_frame_groupby_columns(tsframe): assert len(v.columns) == 2 -def test_frame_groupby_avoids_mutate(): +@pytest.mark.parametrize( + "func, args", + [ + ("sum", []), + ("prod", []), + ("min", []), + ("max", []), + ("nth", [0]), + ("last", []), + ("first", []), + ], +) +def test_frame_groupby_avoids_mutate(func, args): # GH28523 df = pd.DataFrame({"A": ["foo", "bar", "foo", "bar"], "B": [1, 2, 3, 4]}) grouped = df.groupby("A") expected = grouped.apply(lambda x: x) - funcs = ["sum", "prod", "min", "max", "last", "first"] - for fn in funcs: - func = getattr(grouped, fn) - func() + fn = getattr(grouped, func) + fn(*args) result = grouped.apply(lambda x: x) - assert tm.assert_frame_equal(expected, result) + tm.assert_frame_equal(expected, result) def test_frame_set_name_single(df): From bcab4a0726ec528df5f38b998b6dc8a5c29a0d3d Mon Sep 17 00:00:00 2001 From: Chris Zimmerman Date: Thu, 19 Sep 2019 15:38:04 -0500 Subject: [PATCH 3/7] added whatsnew entry --- doc/source/whatsnew/v1.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 5e6a9093554eb..8572403e5214e 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -240,6 +240,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrame.rolling` not allowing for rolling over datetimes when ``axis=1`` (:issue: `28192`) - Bug in :meth:`DataFrame.groupby` not offering selection by column name when ``axis=1`` (:issue:`27614`) - Bug in :meth:`DataFrameGroupby.agg` not able to use lambda function with named aggregation (:issue:`27519`) +- Bug in :meth:`DataFrameGroupby` causing unexpected mutations of the groupby object (:issue:`28523`) Reshaping ^^^^^^^^^ From ca26b64bc5ee3b0e9dce9e7e820c3d1d48ebbba6 Mon Sep 17 00:00:00 2001 From: Chris Zimmerman Date: Thu, 19 Sep 2019 19:05:45 -0500 Subject: [PATCH 4/7] updated test, removed last instance of not using context manager to fix funcs --- pandas/core/groupby/groupby.py | 91 ++++++++++++++-------------- pandas/tests/groupby/test_groupby.py | 16 +---- 2 files changed, 48 insertions(+), 59 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 68ef0cf57faed..c53f3b58ac4cb 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -606,56 +606,55 @@ def _make_wrapper(self, name): ) raise AttributeError(msg) - self._set_group_selection() - - # need to setup the selection - # as are not passed directly but in the grouper - f = getattr(self._selected_obj, name) - if not isinstance(f, types.MethodType): - return self.apply(lambda self: getattr(self, name)) - - f = getattr(type(self._selected_obj), name) - - def wrapper(*args, **kwargs): - # a little trickery for aggregation functions that need an axis - # argument - kwargs_with_axis = kwargs.copy() - if "axis" not in kwargs_with_axis or kwargs_with_axis["axis"] is None: - kwargs_with_axis["axis"] = self.axis - - def curried_with_axis(x): - return f(x, *args, **kwargs_with_axis) - - def curried(x): - return f(x, *args, **kwargs) - - # preserve the name so we can detect it when calling plot methods, - # to avoid duplicates - curried.__name__ = curried_with_axis.__name__ = name - - # special case otherwise extra plots are created when catching the - # exception below - if name in base.plotting_methods: - return self.apply(curried) + with _group_selection_context(self): + # need to setup the selection + # as are not passed directly but in the grouper + f = getattr(self._selected_obj, name) + if not isinstance(f, types.MethodType): + return self.apply(lambda self: getattr(self, name)) + + f = getattr(type(self._selected_obj), name) + + def wrapper(*args, **kwargs): + # a little trickery for aggregation functions that need an axis + # argument + kwargs_with_axis = kwargs.copy() + if "axis" not in kwargs_with_axis or kwargs_with_axis["axis"] is None: + kwargs_with_axis["axis"] = self.axis + + def curried_with_axis(x): + return f(x, *args, **kwargs_with_axis) + + def curried(x): + return f(x, *args, **kwargs) + + # preserve the name so we can detect it when calling plot methods, + # to avoid duplicates + curried.__name__ = curried_with_axis.__name__ = name + + # special case otherwise extra plots are created when catching the + # exception below + if name in base.plotting_methods: + return self.apply(curried) - try: - return self.apply(curried_with_axis) - except Exception: try: - return self.apply(curried) + return self.apply(curried_with_axis) except Exception: - - # related to : GH3688 - # try item-by-item - # this can be called recursively, so need to raise - # ValueError - # if we don't have this method to indicated to aggregate to - # mark this column as an error try: - return self._aggregate_item_by_item(name, *args, **kwargs) - except AttributeError: - # e.g. SparseArray has no flags attr - raise ValueError + return self.apply(curried) + except Exception: + + # related to : GH3688 + # try item-by-item + # this can be called recursively, so need to raise + # ValueError + # if we don't have this method to indicated to aggregate to + # mark this column as an error + try: + return self._aggregate_item_by_item(name, *args, **kwargs) + except AttributeError: + # e.g. SparseArray has no flags attr + raise ValueError return wrapper diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index c9e5b591c7e79..ccacf4ae7ce30 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -433,25 +433,15 @@ def test_frame_groupby_columns(tsframe): assert len(v.columns) == 2 -@pytest.mark.parametrize( - "func, args", - [ - ("sum", []), - ("prod", []), - ("min", []), - ("max", []), - ("nth", [0]), - ("last", []), - ("first", []), - ], -) -def test_frame_groupby_avoids_mutate(func, args): +def test_frame_groupby_avoids_mutate(reduction_func): # GH28523 + func = reduction_func df = pd.DataFrame({"A": ["foo", "bar", "foo", "bar"], "B": [1, 2, 3, 4]}) grouped = df.groupby("A") expected = grouped.apply(lambda x: x) + args = {"nth": [0], "quantile": [0.5]}.get(func, []) fn = getattr(grouped, func) fn(*args) From f24c05ca22f7f36f296e94ebf1d1d3648f7587e5 Mon Sep 17 00:00:00 2001 From: Chris Zimmerman Date: Sun, 20 Oct 2019 21:31:15 -0500 Subject: [PATCH 5/7] Missed a merge --- pandas/core/groupby/groupby.py | 33 +++++++++++---------------------- 1 file changed, 11 insertions(+), 22 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index db5d0d1bf0d56..414b001a702b2 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1335,32 +1335,21 @@ def f(self, **kwargs): kwargs["min_count"] = min_count with _group_selection_context(self): - -<<<<<<< HEAD # try a cython aggregation if we can try: return self._cython_agg_general(alias, alt=npfunc, **kwargs) - except AssertionError as e: - raise SpecificationError(str(e)) - except Exception: - pass -======= - # try a cython aggregation if we can - try: - return self._cython_agg_general(alias, alt=npfunc, **kwargs) - except DataError: - pass - except NotImplementedError as err: - if "function is not implemented for this dtype" in str(err): - # raised in _get_cython_function, in some cases can - # be trimmed by implementing cython funcs for more dtypes + except DataError: pass - elif "decimal does not support skipna=True" in str(err): - # FIXME: kludge for test_decimal:test_in_numeric_groupby - pass - else: - raise ->>>>>>> upstream/master + except NotImplementedError as err: + if "function is not implemented for this dtype" in str(err): + # raised in _get_cython_function, in some cases can + # be trimmed by implementing cython funcs for more dtypes + pass + elif "decimal does not support skipna=True" in str(err): + # FIXME: kludge for test_decimal:test_in_numeric_groupby + pass + else: + raise # apply a non-cython aggregation result = self.aggregate(lambda x: npfunc(x, axis=self.axis)) From 5cc3cb17bc88664d09b098d22e83532896d7519b Mon Sep 17 00:00:00 2001 From: Chris Zimmerman Date: Sun, 20 Oct 2019 21:44:46 -0500 Subject: [PATCH 6/7] Fixed two tests --- pandas/tests/groupby/aggregate/test_other.py | 4 ++-- pandas/tests/groupby/test_grouping.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index 5dad868c8c3aa..e89d5cfeb10cd 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -482,13 +482,13 @@ def test_agg_timezone_round_trip(): assert ts == grouped.first()["B"].iloc[0] # GH#27110 applying iloc should return a DataFrame - assert ts == grouped.apply(lambda x: x.iloc[0]).iloc[0, 0] + assert ts == grouped.apply(lambda x: x.iloc[0]).drop("A", 1).iloc[0, 0] ts = df["B"].iloc[2] assert ts == grouped.last()["B"].iloc[0] # GH#27110 applying iloc should return a DataFrame - assert ts == grouped.apply(lambda x: x.iloc[-1]).iloc[0, 0] + assert ts == grouped.apply(lambda x: x.iloc[-1]).drop("A", 1).iloc[0, 0] def test_sum_uint64_overflow(): diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 403f5f11ee768..0adc1cda6c499 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -172,7 +172,7 @@ def test_grouper_creation_bug(self): result = g.sum() assert_frame_equal(result, expected) - result = g.apply(lambda x: x.sum()) + result = g.apply(lambda x: x.sum()).drop("A", 1) assert_frame_equal(result, expected) g = df.groupby(pd.Grouper(key="A", axis=0)) From 30509636e34bfaa79eddd9cfad785d716fc9a8bb Mon Sep 17 00:00:00 2001 From: Chris Zimmerman Date: Mon, 21 Oct 2019 08:38:27 -0500 Subject: [PATCH 7/7] Another context needed to be set --- pandas/core/groupby/groupby.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 414b001a702b2..34e78ff82acee 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -648,8 +648,9 @@ def curried(x): # in tests.groupby.test_function.test_non_cython_api raise ValueError - wrapper.__name__ = name - return wrapper + wrapper.__name__ = name + + return wrapper def get_group(self, name, obj=None): """ @@ -720,7 +721,9 @@ def f(g): f = func # ignore SettingWithCopy here in case the user mutates - with option_context("mode.chained_assignment", None): + with option_context( + "mode.chained_assignment", None + ) as _, _group_selection_context(self) as _: try: result = self._python_apply_general(f) except TypeError: @@ -731,9 +734,7 @@ def f(g): # except if the udf is trying an operation that # fails on *some* columns, e.g. a numeric operation # on a string grouper column - - with _group_selection_context(self): - return self._python_apply_general(f) + return self._python_apply_general(f) return result