Skip to content

BUG: groupby with sort=False still sorts an ordered categorical #49613

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Nov 10, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -649,6 +649,8 @@ Groupby/resample/rolling
- Bug in :meth:`DataFrameGroupBy.resample` produces inconsistent results when passing empty DataFrame (:issue:`47705`)
- Bug in :class:`.DataFrameGroupBy` and :class:`.SeriesGroupBy` would not include unobserved categories in result when grouping by categorical indexes (:issue:`49354`)
- Bug in :class:`.DataFrameGroupBy` and :class:`.SeriesGroupBy` would change result order depending on the input index when grouping by categoricals (:issue:`49223`)
- Bug in :class:`.DataFrameGroupBy` and :class:`.SeriesGroupBy` when grouping on categorical data would sort result values even when used with ``sort=False`` (:issue:`42482`)
-

Reshaping
^^^^^^^^^
Expand Down
4 changes: 2 additions & 2 deletions pandas/core/groupby/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ def recode_for_groupby(
unique_codes = unique1d(c.codes)

take_codes = unique_codes[unique_codes != -1]
if c.ordered or sort:
if sort:
take_codes = np.sort(take_codes)

# we recode according to the uniques
Expand All @@ -75,7 +75,7 @@ def recode_for_groupby(
all_codes = np.arange(c.categories.nunique())
# GH 38140: exclude nan from indexer for categories
unique_notnan_codes = unique1d(c.codes[c.codes != -1])
if c.ordered:
if sort:
unique_notnan_codes = np.sort(unique_notnan_codes)
if len(all_codes) > len(unique_notnan_codes):
# GH 13179: All categories need to be present, even if missing from the data
Expand Down
4 changes: 3 additions & 1 deletion pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -4116,7 +4116,9 @@ def _reindex_output(
# "ndarray[Any, dtype[floating[_64Bit]]]"; expected "Index"
levels_list.append(qs) # type: ignore[arg-type]
names = names + [None]
index, _ = MultiIndex.from_product(levels_list, names=names).sortlevel()
index = MultiIndex.from_product(levels_list, names=names)
if self.sort:
index = index.sortlevel()[0]

if self.as_index:
# Always holds for SeriesGroupBy unless GH#36507 is implemented
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/groupby/grouper.py
Original file line number Diff line number Diff line change
Expand Up @@ -655,7 +655,7 @@ def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
if self._observed:
ucodes = algorithms.unique1d(cat.codes)
ucodes = ucodes[ucodes != -1]
if self._sort or cat.ordered:
if self._sort:
ucodes = np.sort(ucodes)
else:
ucodes = np.arange(len(categories))
Expand Down
6 changes: 6 additions & 0 deletions pandas/core/shared_docs.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,12 @@
Sort group keys. Get better performance by turning this off.
Note this does not influence the order of observations within each
group. Groupby preserves the order of rows within each group.

.. versionchanged:: 2.0.0

Specifying ``sort=False`` with an ordered categorical grouper will no
longer sort the values.

group_keys : bool, optional
When calling apply and the ``by`` argument produces a like-indexed
(i.e. :ref:`a transform <groupby.transform>`) result, add group keys to
Expand Down
155 changes: 41 additions & 114 deletions pandas/tests/groupby/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -818,12 +818,14 @@ def test_preserve_categories():

# ordered=True
df = DataFrame({"A": Categorical(list("ba"), categories=categories, ordered=True)})
index = CategoricalIndex(categories, categories, ordered=True, name="A")
sort_index = CategoricalIndex(categories, categories, ordered=True, name="A")
nosort_index = CategoricalIndex(list("bac"), categories, ordered=True, name="A")
tm.assert_index_equal(
df.groupby("A", sort=True, observed=False).first().index, index
df.groupby("A", sort=True, observed=False).first().index, sort_index
)
# GH#42482 - don't sort result when sort=False, even when ordered=True
tm.assert_index_equal(
df.groupby("A", sort=False, observed=False).first().index, index
df.groupby("A", sort=False, observed=False).first().index, nosort_index
)

# ordered=False
Expand Down Expand Up @@ -972,8 +974,11 @@ def test_sort():
tm.assert_series_equal(res, exp)


def test_sort2():
@pytest.mark.parametrize("ordered", [True, False])
def test_sort2(sort, ordered):
# dataframe groupby sort was being ignored # GH 8868
# GH#48749 - don't change order of categories
# GH#42482 - don't sort result when sort=False, even when ordered=True
df = DataFrame(
[
["(7.5, 10]", 10, 10],
Expand All @@ -986,53 +991,28 @@ def test_sort2():
],
columns=["range", "foo", "bar"],
)
df["range"] = Categorical(df["range"], ordered=True)
index = CategoricalIndex(
["(0, 2.5]", "(2.5, 5]", "(5, 7.5]", "(7.5, 10]"], name="range", ordered=True
)
expected_sort = DataFrame(
[[1, 60], [5, 30], [6, 40], [10, 10]], columns=["foo", "bar"], index=index
)

col = "range"
result_sort = df.groupby(col, sort=True, observed=False).first()
tm.assert_frame_equal(result_sort, expected_sort)

# when categories is ordered, group is ordered by category's order
expected_sort = result_sort
result_sort = df.groupby(col, sort=False, observed=False).first()
tm.assert_frame_equal(result_sort, expected_sort)
df["range"] = Categorical(df["range"], ordered=ordered)
result = df.groupby("range", sort=sort, observed=False).first()

df["range"] = Categorical(df["range"], ordered=False)
index = CategoricalIndex(
["(0, 2.5]", "(2.5, 5]", "(5, 7.5]", "(7.5, 10]"], name="range"
)
expected_sort = DataFrame(
[[1, 60], [5, 30], [6, 40], [10, 10]], columns=["foo", "bar"], index=index
)

index = CategoricalIndex(
["(7.5, 10]", "(2.5, 5]", "(5, 7.5]", "(0, 2.5]"],
# GH#48749 - don't change order of categories
categories=["(0, 2.5]", "(2.5, 5]", "(5, 7.5]", "(7.5, 10]"],
name="range",
)
expected_nosort = DataFrame(
[[10, 10], [5, 30], [6, 40], [1, 60]], index=index, columns=["foo", "bar"]
if sort:
data_values = [[1, 60], [5, 30], [6, 40], [10, 10]]
index_values = ["(0, 2.5]", "(2.5, 5]", "(5, 7.5]", "(7.5, 10]"]
else:
data_values = [[10, 10], [5, 30], [6, 40], [1, 60]]
index_values = ["(7.5, 10]", "(2.5, 5]", "(5, 7.5]", "(0, 2.5]"]
expected = DataFrame(
data_values,
columns=["foo", "bar"],
index=CategoricalIndex(index_values, name="range", ordered=ordered),
)

col = "range"

# this is an unordered categorical, but we allow this ####
result_sort = df.groupby(col, sort=True, observed=False).first()
tm.assert_frame_equal(result_sort, expected_sort)

result_nosort = df.groupby(col, sort=False, observed=False).first()
tm.assert_frame_equal(result_nosort, expected_nosort)
tm.assert_frame_equal(result, expected)


def test_sort_datetimelike():
@pytest.mark.parametrize("ordered", [True, False])
def test_sort_datetimelike(sort, ordered):
# GH10505
# GH#42482 - don't sort result when sort=False, even when ordered=True

# use same data as test_groupby_sort_categorical, with categories
# corresponding to datetime.month
Expand All @@ -1054,80 +1034,30 @@ def test_sort_datetimelike():
)

# ordered=True
df["dt"] = Categorical(df["dt"], ordered=True)
index = [
datetime(2011, 1, 1),
datetime(2011, 2, 1),
datetime(2011, 5, 1),
datetime(2011, 7, 1),
]
result_sort = DataFrame(
[[1, 60], [5, 30], [6, 40], [10, 10]], columns=["foo", "bar"]
)
result_sort.index = CategoricalIndex(index, name="dt", ordered=True)

index = [
datetime(2011, 7, 1),
datetime(2011, 2, 1),
datetime(2011, 5, 1),
datetime(2011, 1, 1),
]
result_nosort = DataFrame(
[[10, 10], [5, 30], [6, 40], [1, 60]], columns=["foo", "bar"]
)
result_nosort.index = CategoricalIndex(
index, categories=index, name="dt", ordered=True
)

col = "dt"
tm.assert_frame_equal(
result_sort, df.groupby(col, sort=True, observed=False).first()
)

# when categories is ordered, group is ordered by category's order
tm.assert_frame_equal(
result_sort, df.groupby(col, sort=False, observed=False).first()
)

# ordered = False
df["dt"] = Categorical(df["dt"], ordered=False)
sort_index = CategoricalIndex(
[
df["dt"] = Categorical(df["dt"], ordered=ordered)
if sort:
data_values = [[1, 60], [5, 30], [6, 40], [10, 10]]
index_values = [
datetime(2011, 1, 1),
datetime(2011, 2, 1),
datetime(2011, 5, 1),
datetime(2011, 7, 1),
],
name="dt",
)
result_sort = DataFrame(
[[1, 60], [5, 30], [6, 40], [10, 10]], columns=["foo", "bar"], index=sort_index
)

nosort_index = CategoricalIndex(
[
]
else:
data_values = [[10, 10], [5, 30], [6, 40], [1, 60]]
index_values = [
datetime(2011, 7, 1),
datetime(2011, 2, 1),
datetime(2011, 5, 1),
datetime(2011, 1, 1),
],
# GH#48749 - don't change order of categories
categories=sort_index.categories,
name="dt",
)
result_nosort = DataFrame(
[[10, 10], [5, 30], [6, 40], [1, 60]],
]
expected = DataFrame(
data_values,
columns=["foo", "bar"],
index=nosort_index,
)

col = "dt"
tm.assert_frame_equal(
result_sort, df.groupby(col, sort=True, observed=False).first()
)
tm.assert_frame_equal(
result_nosort, df.groupby(col, sort=False, observed=False).first()
index=CategoricalIndex(index_values, name="dt", ordered=ordered),
)
result = df.groupby("dt", sort=sort, observed=False).first()
tm.assert_frame_equal(result, expected)


def test_empty_sum():
Expand Down Expand Up @@ -2055,13 +1985,10 @@ def test_category_order_apply(as_index, sort, observed, method, index_kind, orde


@pytest.mark.parametrize("index_kind", ["range", "single", "multi"])
def test_many_categories(request, as_index, sort, index_kind, ordered):
def test_many_categories(as_index, sort, index_kind, ordered):
# GH#48749 - Test when the grouper has many categories
if index_kind != "range" and not as_index:
pytest.skip(reason="Result doesn't have categories, nothing to test")
if index_kind == "multi" and as_index and not sort and ordered:
msg = "GH#48749 - values are unsorted even though the Categorical is ordered"
request.node.add_marker(pytest.mark.xfail(reason=msg))
categories = np.arange(9999, -1, -1)
grouper = Categorical([2, 1, 2, 3], categories=categories, ordered=ordered)
df = DataFrame({"a": grouper, "b": range(4)})
Expand All @@ -2078,7 +2005,7 @@ def test_many_categories(request, as_index, sort, index_kind, ordered):
result = gb.sum()

# Test is set up so that data and index are the same values
data = [3, 2, 1] if sort or ordered else [2, 1, 3]
data = [3, 2, 1] if sort else [2, 1, 3]

index = CategoricalIndex(
data, categories=grouper.categories, ordered=ordered, name="a"
Expand Down