From 843dbcabe89addaf4e23722c1c03698f36ead458 Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Fri, 23 Sep 2022 17:51:32 -0400 Subject: [PATCH] Backport PR #48702: REGR: dropna affects observed in groupby --- doc/source/whatsnew/v1.5.1.rst | 55 +++++++++++++++++++++ pandas/core/groupby/grouper.py | 2 +- pandas/tests/groupby/conftest.py | 5 ++ pandas/tests/groupby/test_categorical.py | 17 +++++++ pandas/tests/groupby/test_groupby_dropna.py | 8 ++- 5 files changed, 85 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.5.1.rst b/doc/source/whatsnew/v1.5.1.rst index 209548a4adaf9..087cfa642fb64 100644 --- a/doc/source/whatsnew/v1.5.1.rst +++ b/doc/source/whatsnew/v1.5.1.rst @@ -10,6 +10,61 @@ including other versions of pandas. .. --------------------------------------------------------------------------- +.. _whatsnew_151.groupby_categorical_regr: + +Behavior of ``groupby`` with categorical groupers (:issue:`48645`) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In versions of pandas prior to 1.5, ``groupby`` with ``dropna=False`` would still drop +NA values when the grouper was a categorical dtype. A fix for this was attempted in +1.5; however, it introduced a regression where passing ``observed=False`` and +``dropna=False`` to ``groupby`` would result in only observed categories. It was found +that the patch fixing the ``dropna=False`` bug is incompatible with ``observed=False``, +and it was decided that the best resolution is to restore the correct ``observed=False`` +behavior at the cost of reintroducing the ``dropna=False`` bug. + +.. ipython:: python + + df = pd.DataFrame( + { + "x": pd.Categorical([1, None], categories=[1, 2, 3]), + "y": [3, 4], + } + ) + df + +*1.5.0 behavior*: + +.. 
code-block:: ipython + + In [3]: # Correct behavior, NA values are not dropped + df.groupby("x", observed=True, dropna=False).sum() + Out[3]: + y + x + 1 3 + NaN 4 + + + In [4]: # Incorrect behavior, only observed categories present + df.groupby("x", observed=False, dropna=False).sum() + Out[4]: + y + x + 1 3 + NaN 4 + + +*1.5.1 behavior*: + +.. ipython:: python + + # Incorrect behavior, NA values are dropped + df.groupby("x", observed=True, dropna=False).sum() + + # Correct behavior, unobserved categories present (NA values still dropped) + df.groupby("x", observed=False, dropna=False).sum() + .. _whatsnew_151.regressions: Fixed regressions diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 72f54abdced27..e1df2d5d2f91b 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -658,7 +658,7 @@ def group_index(self) -> Index: @cache_readonly def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]: - if self._dropna and self._passed_categorical: + if self._passed_categorical: # we make a CategoricalIndex out of the cat grouper # preserving the categories / ordered attributes; # doesn't (yet - GH#46909) handle dropna=False diff --git a/pandas/tests/groupby/conftest.py b/pandas/tests/groupby/conftest.py index 58d9e500554dd..7e7b97d9273dc 100644 --- a/pandas/tests/groupby/conftest.py +++ b/pandas/tests/groupby/conftest.py @@ -24,6 +24,11 @@ def dropna(request): return request.param +@pytest.fixture(params=[True, False]) +def observed(request): + return request.param + + @pytest.fixture def mframe(multiindex_dataframe_random_data): return multiindex_dataframe_random_data diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 6d22c676a3c16..e99d1325a7e4f 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -1828,3 +1828,20 @@ def test_groupby_categorical_aggregate_functions(): ) 
tm.assert_series_equal(result, expected) + + +def test_groupby_categorical_dropna(observed, dropna): + # GH#48645 - dropna should have no impact on the result when there are no NA values + cat = Categorical([1, 2], categories=[1, 2, 3]) + df = DataFrame({"x": Categorical([1, 2], categories=[1, 2, 3]), "y": [3, 4]}) + gb = df.groupby("x", observed=observed, dropna=dropna) + result = gb.sum() + + if observed: + expected = DataFrame({"y": [3, 4]}, index=cat) + else: + index = CategoricalIndex([1, 2, 3], [1, 2, 3]) + expected = DataFrame({"y": [3, 4, 0]}, index=index) + expected.index.name = "x" + + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index b2426ffa9dad3..360e3096ceb63 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -408,7 +408,13 @@ def test_groupby_drop_nan_with_multi_index(): ([2, np.nan, 1, 2], "Float32"), ([2, np.nan, 1, 2], "Int64"), ([2, np.nan, 1, 2], "Float64"), - (["y", None, "x", "y"], "category"), + pytest.param( + ["y", None, "x", "y"], + "category", + marks=pytest.mark.xfail( + reason="dropna=False not correct for categorical, GH#48645" + ), + ), (["y", pd.NA, "x", "y"], "string"), pytest.param( ["y", pd.NA, "x", "y"],