From fe85112fa4e715a2e11fab241444189ee2f76de5 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Fri, 28 Oct 2022 13:27:06 -0400 Subject: [PATCH 1/2] BUG: groupby with sort=False still sorts an ordered categorical --- doc/source/whatsnew/v2.0.0.rst | 2 + pandas/core/groupby/categorical.py | 4 +- pandas/core/groupby/groupby.py | 4 +- pandas/core/groupby/grouper.py | 2 +- pandas/tests/groupby/test_categorical.py | 155 ++++++----------------- 5 files changed, 49 insertions(+), 118 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 73a75667b46da..4e9d4be2d91e3 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -649,6 +649,8 @@ Groupby/resample/rolling - Bug in :meth:`DataFrameGroupBy.resample` produces inconsistent results when passing empty DataFrame (:issue:`47705`) - Bug in :class:`.DataFrameGroupBy` and :class:`.SeriesGroupBy` would not include unobserved categories in result when grouping by categorical indexes (:issue:`49354`) - Bug in :class:`.DataFrameGroupBy` and :class:`.SeriesGroupBy` would change result order depending on the input index when grouping by categoricals (:issue:`49223`) +- Bug in :class:`.DataFrameGroupBy` and :class:`.SeriesGroupBy` when grouping on categorical data would sort result values even when used with ``sort=False`` (:issue:`42482`) +- Reshaping ^^^^^^^^^ diff --git a/pandas/core/groupby/categorical.py b/pandas/core/groupby/categorical.py index 0a8e12caead1c..20248cd69bfb9 100644 --- a/pandas/core/groupby/categorical.py +++ b/pandas/core/groupby/categorical.py @@ -53,7 +53,7 @@ def recode_for_groupby( unique_codes = unique1d(c.codes) take_codes = unique_codes[unique_codes != -1] - if c.ordered or sort: + if sort: take_codes = np.sort(take_codes) # we recode according to the uniques @@ -75,7 +75,7 @@ def recode_for_groupby( all_codes = np.arange(c.categories.nunique()) # GH 38140: exclude nan from indexer for categories unique_notnan_codes = 
unique1d(c.codes[c.codes != -1]) - if c.ordered: + if sort: unique_notnan_codes = np.sort(unique_notnan_codes) if len(all_codes) > len(unique_notnan_codes): # GH 13179: All categories need to be present, even if missing from the data diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index e0baaaeb3c8f9..f1c18b7762f66 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -4116,7 +4116,9 @@ def _reindex_output( # "ndarray[Any, dtype[floating[_64Bit]]]"; expected "Index" levels_list.append(qs) # type: ignore[arg-type] names = names + [None] - index, _ = MultiIndex.from_product(levels_list, names=names).sortlevel() + index = MultiIndex.from_product(levels_list, names=names) + if self.sort: + index = index.sortlevel()[0] if self.as_index: # Always holds for SeriesGroupBy unless GH#36507 is implemented diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 7da7ea119cea3..688dcb44c31f3 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -655,7 +655,7 @@ def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]: if self._observed: ucodes = algorithms.unique1d(cat.codes) ucodes = ucodes[ucodes != -1] - if self._sort or cat.ordered: + if self._sort: ucodes = np.sort(ucodes) else: ucodes = np.arange(len(categories)) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 8fe1dc010211a..1e2bcb58110dd 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -818,12 +818,14 @@ def test_preserve_categories(): # ordered=True df = DataFrame({"A": Categorical(list("ba"), categories=categories, ordered=True)}) - index = CategoricalIndex(categories, categories, ordered=True, name="A") + sort_index = CategoricalIndex(categories, categories, ordered=True, name="A") + nosort_index = CategoricalIndex(list("bac"), categories, ordered=True, name="A") 
tm.assert_index_equal( - df.groupby("A", sort=True, observed=False).first().index, index + df.groupby("A", sort=True, observed=False).first().index, sort_index ) + # GH#42482 - don't sort result when sort=False, even when ordered=True tm.assert_index_equal( - df.groupby("A", sort=False, observed=False).first().index, index + df.groupby("A", sort=False, observed=False).first().index, nosort_index ) # ordered=False @@ -972,8 +974,11 @@ def test_sort(): tm.assert_series_equal(res, exp) -def test_sort2(): +@pytest.mark.parametrize("ordered", [True, False]) +def test_sort2(sort, ordered): # dataframe groupby sort was being ignored # GH 8868 + # GH#48749 - don't change order of categories + # GH#42482 - don't sort result when sort=False, even when ordered=True df = DataFrame( [ ["(7.5, 10]", 10, 10], @@ -986,53 +991,28 @@ def test_sort2(): ], columns=["range", "foo", "bar"], ) - df["range"] = Categorical(df["range"], ordered=True) - index = CategoricalIndex( - ["(0, 2.5]", "(2.5, 5]", "(5, 7.5]", "(7.5, 10]"], name="range", ordered=True - ) - expected_sort = DataFrame( - [[1, 60], [5, 30], [6, 40], [10, 10]], columns=["foo", "bar"], index=index - ) - - col = "range" - result_sort = df.groupby(col, sort=True, observed=False).first() - tm.assert_frame_equal(result_sort, expected_sort) - - # when categories is ordered, group is ordered by category's order - expected_sort = result_sort - result_sort = df.groupby(col, sort=False, observed=False).first() - tm.assert_frame_equal(result_sort, expected_sort) + df["range"] = Categorical(df["range"], ordered=ordered) + result = df.groupby("range", sort=sort, observed=False).first() - df["range"] = Categorical(df["range"], ordered=False) - index = CategoricalIndex( - ["(0, 2.5]", "(2.5, 5]", "(5, 7.5]", "(7.5, 10]"], name="range" - ) - expected_sort = DataFrame( - [[1, 60], [5, 30], [6, 40], [10, 10]], columns=["foo", "bar"], index=index - ) - - index = CategoricalIndex( - ["(7.5, 10]", "(2.5, 5]", "(5, 7.5]", "(0, 2.5]"], - # 
GH#48749 - don't change order of categories - categories=["(0, 2.5]", "(2.5, 5]", "(5, 7.5]", "(7.5, 10]"], - name="range", - ) - expected_nosort = DataFrame( - [[10, 10], [5, 30], [6, 40], [1, 60]], index=index, columns=["foo", "bar"] + if sort: + data_values = [[1, 60], [5, 30], [6, 40], [10, 10]] + index_values = ["(0, 2.5]", "(2.5, 5]", "(5, 7.5]", "(7.5, 10]"] + else: + data_values = [[10, 10], [5, 30], [6, 40], [1, 60]] + index_values = ["(7.5, 10]", "(2.5, 5]", "(5, 7.5]", "(0, 2.5]"] + expected = DataFrame( + data_values, + columns=["foo", "bar"], + index=CategoricalIndex(index_values, name="range", ordered=ordered), ) - col = "range" - - # this is an unordered categorical, but we allow this #### - result_sort = df.groupby(col, sort=True, observed=False).first() - tm.assert_frame_equal(result_sort, expected_sort) - - result_nosort = df.groupby(col, sort=False, observed=False).first() - tm.assert_frame_equal(result_nosort, expected_nosort) + tm.assert_frame_equal(result, expected) -def test_sort_datetimelike(): +@pytest.mark.parametrize("ordered", [True, False]) +def test_sort_datetimelike(sort, ordered): # GH10505 + # GH#42482 - don't sort result when sort=False, even when ordered=True # use same data as test_groupby_sort_categorical, which category is # corresponding to datetime.month @@ -1054,80 +1034,30 @@ def test_sort_datetimelike(): ) # ordered=True - df["dt"] = Categorical(df["dt"], ordered=True) - index = [ - datetime(2011, 1, 1), - datetime(2011, 2, 1), - datetime(2011, 5, 1), - datetime(2011, 7, 1), - ] - result_sort = DataFrame( - [[1, 60], [5, 30], [6, 40], [10, 10]], columns=["foo", "bar"] - ) - result_sort.index = CategoricalIndex(index, name="dt", ordered=True) - - index = [ - datetime(2011, 7, 1), - datetime(2011, 2, 1), - datetime(2011, 5, 1), - datetime(2011, 1, 1), - ] - result_nosort = DataFrame( - [[10, 10], [5, 30], [6, 40], [1, 60]], columns=["foo", "bar"] - ) - result_nosort.index = CategoricalIndex( - index, categories=index, 
name="dt", ordered=True - ) - - col = "dt" - tm.assert_frame_equal( - result_sort, df.groupby(col, sort=True, observed=False).first() - ) - - # when categories is ordered, group is ordered by category's order - tm.assert_frame_equal( - result_sort, df.groupby(col, sort=False, observed=False).first() - ) - - # ordered = False - df["dt"] = Categorical(df["dt"], ordered=False) - sort_index = CategoricalIndex( - [ + df["dt"] = Categorical(df["dt"], ordered=ordered) + if sort: + data_values = [[1, 60], [5, 30], [6, 40], [10, 10]] + index_values = [ datetime(2011, 1, 1), datetime(2011, 2, 1), datetime(2011, 5, 1), datetime(2011, 7, 1), - ], - name="dt", - ) - result_sort = DataFrame( - [[1, 60], [5, 30], [6, 40], [10, 10]], columns=["foo", "bar"], index=sort_index - ) - - nosort_index = CategoricalIndex( - [ + ] + else: + data_values = [[10, 10], [5, 30], [6, 40], [1, 60]] + index_values = [ datetime(2011, 7, 1), datetime(2011, 2, 1), datetime(2011, 5, 1), datetime(2011, 1, 1), - ], - # GH#48749 - don't change order of categories - categories=sort_index.categories, - name="dt", - ) - result_nosort = DataFrame( - [[10, 10], [5, 30], [6, 40], [1, 60]], + ] + expected = DataFrame( + data_values, columns=["foo", "bar"], - index=nosort_index, - ) - - col = "dt" - tm.assert_frame_equal( - result_sort, df.groupby(col, sort=True, observed=False).first() - ) - tm.assert_frame_equal( - result_nosort, df.groupby(col, sort=False, observed=False).first() + index=CategoricalIndex(index_values, name="dt", ordered=ordered), ) + result = df.groupby("dt", sort=sort, observed=False).first() + tm.assert_frame_equal(result, expected) def test_empty_sum(): @@ -2055,13 +1985,10 @@ def test_category_order_apply(as_index, sort, observed, method, index_kind, orde @pytest.mark.parametrize("index_kind", ["range", "single", "multi"]) -def test_many_categories(request, as_index, sort, index_kind, ordered): +def test_many_categories(as_index, sort, index_kind, ordered): # GH#48749 - Test when the 
grouper has many categories if index_kind != "range" and not as_index: pytest.skip(reason="Result doesn't have categories, nothing to test") - if index_kind == "multi" and as_index and not sort and ordered: - msg = "GH#48749 - values are unsorted even though the Categorical is ordered" - request.node.add_marker(pytest.mark.xfail(reason=msg)) categories = np.arange(9999, -1, -1) grouper = Categorical([2, 1, 2, 3], categories=categories, ordered=ordered) df = DataFrame({"a": grouper, "b": range(4)}) @@ -2078,7 +2005,7 @@ def test_many_categories(request, as_index, sort, index_kind, ordered): result = gb.sum() # Test is setup so that data and index are the same values - data = [3, 2, 1] if sort or ordered else [2, 1, 3] + data = [3, 2, 1] if sort else [2, 1, 3] index = CategoricalIndex( data, categories=grouper.categories, ordered=ordered, name="a" From bddb7e3fe8fcf809feff0f1ae4173a5b8bbe1ef1 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Thu, 10 Nov 2022 12:46:42 -0500 Subject: [PATCH 2/2] Add versionchanged --- pandas/core/shared_docs.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py index cfabe05ec9e3b..07dc203e556e8 100644 --- a/pandas/core/shared_docs.py +++ b/pandas/core/shared_docs.py @@ -119,6 +119,12 @@ Sort group keys. Get better performance by turning this off. Note this does not influence the order of observations within each group. Groupby preserves the order of rows within each group. + + .. versionchanged:: 2.0.0 + + Specifying ``sort=False`` with an ordered categorical grouper will no + longer sort the values. + group_keys : bool, optional When calling apply and the ``by`` argument produces a like-indexed (i.e. :ref:`a transform <groupby.transform>`) result, add group keys to