Skip to content

Commit c8813ae

Browse files
authored
API: value_counts to consistently maintain order of input (#59745)
* API: value_counts to consistently maintain order of input * Docs * Cleanup * Test & docs fixups * Refine whatsnew * Refine whatsnew
1 parent 05fa958 commit c8813ae

File tree

7 files changed

+157
-53
lines changed

7 files changed

+157
-53
lines changed

doc/source/whatsnew/v3.0.0.rst

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -203,6 +203,67 @@ In cases with mixed-resolution inputs, the highest resolution is used:
203203
In [2]: pd.to_datetime([pd.Timestamp("2024-03-22 11:43:01"), "2024-03-22 11:43:01.002"]).dtype
204204
Out[2]: dtype('<M8[ns]')
205205
206+
.. _whatsnew_300.api_breaking.value_counts_sorting:
207+
208+
Changed behavior in :meth:`DataFrame.value_counts` and :meth:`DataFrameGroupBy.value_counts` when ``sort=False``
209+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
210+
211+
In previous versions of pandas, :meth:`DataFrame.value_counts` with ``sort=False`` would sort the result by row labels (as was documented). This was nonintuitive and inconsistent with :meth:`Series.value_counts` which would maintain the order of the input. Now :meth:`DataFrame.value_counts` will maintain the order of the input.
212+
213+
.. ipython:: python
214+
215+
df = pd.DataFrame(
216+
{
217+
"a": [2, 2, 2, 2, 1, 1, 1, 1],
218+
"b": [2, 1, 3, 1, 2, 3, 1, 1],
219+
}
220+
)
221+
df
222+
223+
*Old behavior*
224+
225+
.. code-block:: ipython
226+
227+
In [3]: df.value_counts(sort=False)
228+
Out[3]:
229+
a b
230+
1 1 2
231+
2 1
232+
3 1
233+
2 1 2
234+
2 1
235+
3 1
236+
Name: count, dtype: int64
237+
238+
*New behavior*
239+
240+
.. ipython:: python
241+
242+
df.value_counts(sort=False)
243+
244+
This change also applies to :meth:`.DataFrameGroupBy.value_counts`. Here, there are two options for sorting: one ``sort`` passed to :meth:`DataFrame.groupby` and one passed directly to :meth:`.DataFrameGroupBy.value_counts`. The former will determine whether to sort the groups, the latter whether to sort the counts. All non-grouping columns will maintain the order of the input *within groups*.
245+
246+
*Old behavior*
247+
248+
.. code-block:: ipython
249+
250+
In [5]: df.groupby("a", sort=True).value_counts(sort=False)
251+
Out[5]:
252+
a b
253+
1 1 2
254+
2 1
255+
3 1
256+
2 1 2
257+
2 1
258+
3 1
259+
Name: count, dtype: int64
260+
261+
*New behavior*
262+
263+
.. ipython:: python
264+
265+
df.groupby("a", sort=True).value_counts(sort=False)
266+
206267
.. _whatsnew_300.api_breaking.deps:
207268

208269
Increased minimum version for Python

pandas/core/frame.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7266,7 +7266,11 @@ def value_counts(
72667266
normalize : bool, default False
72677267
Return proportions rather than frequencies.
72687268
sort : bool, default True
7269-
Sort by frequencies when True. Sort by DataFrame column values when False.
7269+
Sort by frequencies when True. Preserve the order of the data when False.
7270+
7271+
.. versionchanged:: 3.0.0
7272+
7273+
Prior to 3.0.0, ``sort=False`` would sort by the column values.
72707274
ascending : bool, default False
72717275
Sort in ascending order.
72727276
dropna : bool, default True
@@ -7372,7 +7376,9 @@ def value_counts(
73727376
subset = self.columns.tolist()
73737377

73747378
name = "proportion" if normalize else "count"
7375-
counts = self.groupby(subset, dropna=dropna, observed=False)._grouper.size()
7379+
counts = self.groupby(
7380+
subset, sort=False, dropna=dropna, observed=False
7381+
)._grouper.size()
73767382
counts.name = name
73777383

73787384
if sort:

pandas/core/groupby/generic.py

Lines changed: 17 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2621,7 +2621,13 @@ def value_counts(
26212621
normalize : bool, default False
26222622
Return proportions rather than frequencies.
26232623
sort : bool, default True
2624-
Sort by frequencies.
2624+
Sort by frequencies when True. When False, non-grouping columns will appear
2625+
in the order they occur within groups.
2626+
2627+
.. versionchanged:: 3.0.0
2628+
2629+
In prior versions, ``sort=False`` would sort the non-grouping columns
2630+
by label.
26252631
ascending : bool, default False
26262632
Sort in ascending order.
26272633
dropna : bool, default True
@@ -2673,43 +2679,43 @@ def value_counts(
26732679
26742680
>>> df.groupby("gender").value_counts()
26752681
gender education country
2676-
female high FR 1
2677-
US 1
2682+
female high US 1
2683+
FR 1
26782684
male low FR 2
26792685
US 1
26802686
medium FR 1
26812687
Name: count, dtype: int64
26822688
26832689
>>> df.groupby("gender").value_counts(ascending=True)
26842690
gender education country
2685-
female high FR 1
2686-
US 1
2691+
female high US 1
2692+
FR 1
26872693
male low US 1
26882694
medium FR 1
26892695
low FR 2
26902696
Name: count, dtype: int64
26912697
26922698
>>> df.groupby("gender").value_counts(normalize=True)
26932699
gender education country
2694-
female high FR 0.50
2695-
US 0.50
2700+
female high US 0.50
2701+
FR 0.50
26962702
male low FR 0.50
26972703
US 0.25
26982704
medium FR 0.25
26992705
Name: proportion, dtype: float64
27002706
27012707
>>> df.groupby("gender", as_index=False).value_counts()
27022708
gender education country count
2703-
0 female high FR 1
2704-
1 female high US 1
2709+
0 female high US 1
2710+
1 female high FR 1
27052711
2 male low FR 2
27062712
3 male low US 1
27072713
4 male medium FR 1
27082714
27092715
>>> df.groupby("gender", as_index=False).value_counts(normalize=True)
27102716
gender education country proportion
2711-
0 female high FR 0.50
2712-
1 female high US 0.50
2717+
0 female high US 0.50
2718+
1 female high FR 0.50
27132719
2 male low FR 0.50
27142720
3 male low US 0.25
27152721
4 male medium FR 0.25

pandas/core/groupby/groupby.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2519,7 +2519,7 @@ def _value_counts(
25192519
grouper, _, _ = get_grouper(
25202520
df,
25212521
key=key,
2522-
sort=self.sort,
2522+
sort=False,
25232523
observed=False,
25242524
dropna=dropna,
25252525
)
@@ -2528,7 +2528,7 @@ def _value_counts(
25282528
# Take the size of the overall columns
25292529
gb = df.groupby(
25302530
groupings,
2531-
sort=self.sort,
2531+
sort=False,
25322532
observed=self.observed,
25332533
dropna=self.dropna,
25342534
)

pandas/core/groupby/ops.py

Lines changed: 35 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -755,6 +755,7 @@ def result_index_and_ids(self) -> tuple[Index, npt.NDArray[np.intp]]:
755755
obs = [
756756
ping._observed or not ping._passed_categorical for ping in self.groupings
757757
]
758+
sorts = [ping._sort for ping in self.groupings]
758759
# When passed a categorical grouping, keep all categories
759760
for k, (ping, level) in enumerate(zip(self.groupings, levels)):
760761
if ping._passed_categorical:
@@ -765,7 +766,9 @@ def result_index_and_ids(self) -> tuple[Index, npt.NDArray[np.intp]]:
765766
result_index.name = self.names[0]
766767
ids = ensure_platform_int(self.codes[0])
767768
elif all(obs):
768-
result_index, ids = self._ob_index_and_ids(levels, self.codes, self.names)
769+
result_index, ids = self._ob_index_and_ids(
770+
levels, self.codes, self.names, sorts
771+
)
769772
elif not any(obs):
770773
result_index, ids = self._unob_index_and_ids(levels, self.codes, self.names)
771774
else:
@@ -778,6 +781,7 @@ def result_index_and_ids(self) -> tuple[Index, npt.NDArray[np.intp]]:
778781
levels=[levels[idx] for idx in ob_indices],
779782
codes=[codes[idx] for idx in ob_indices],
780783
names=[names[idx] for idx in ob_indices],
784+
sorts=[sorts[idx] for idx in ob_indices],
781785
)
782786
unob_index, unob_ids = self._unob_index_and_ids(
783787
levels=[levels[idx] for idx in unob_indices],
@@ -800,9 +804,18 @@ def result_index_and_ids(self) -> tuple[Index, npt.NDArray[np.intp]]:
800804
).reorder_levels(index)
801805
ids = len(unob_index) * ob_ids + unob_ids
802806

803-
if self._sort:
807+
if any(sorts):
804808
# Sort result_index and recode ids using the new order
805-
sorter = result_index.argsort()
809+
n_levels = len(sorts)
810+
drop_levels = [
811+
n_levels - idx
812+
for idx, sort in enumerate(reversed(sorts), 1)
813+
if not sort
814+
]
815+
if len(drop_levels) > 0:
816+
sorter = result_index._drop_level_numbers(drop_levels).argsort()
817+
else:
818+
sorter = result_index.argsort()
806819
result_index = result_index.take(sorter)
807820
_, index = np.unique(sorter, return_index=True)
808821
ids = ensure_platform_int(ids)
@@ -837,10 +850,13 @@ def _ob_index_and_ids(
837850
levels: list[Index],
838851
codes: list[npt.NDArray[np.intp]],
839852
names: list[Hashable],
853+
sorts: list[bool],
840854
) -> tuple[MultiIndex, npt.NDArray[np.intp]]:
855+
consistent_sorting = all(sorts[0] == sort for sort in sorts[1:])
856+
sort_in_compress = sorts[0] if consistent_sorting else False
841857
shape = tuple(len(level) for level in levels)
842858
group_index = get_group_index(codes, shape, sort=True, xnull=True)
843-
ob_ids, obs_group_ids = compress_group_index(group_index, sort=self._sort)
859+
ob_ids, obs_group_ids = compress_group_index(group_index, sort=sort_in_compress)
844860
ob_ids = ensure_platform_int(ob_ids)
845861
ob_index_codes = decons_obs_group_ids(
846862
ob_ids, obs_group_ids, shape, codes, xnull=True
@@ -851,6 +867,21 @@ def _ob_index_and_ids(
851867
names=names,
852868
verify_integrity=False,
853869
)
870+
if not consistent_sorting:
871+
# Sort by the levels where the corresponding sort argument is True
872+
n_levels = len(sorts)
873+
drop_levels = [
874+
n_levels - idx
875+
for idx, sort in enumerate(reversed(sorts), 1)
876+
if not sort
877+
]
878+
if len(drop_levels) > 0:
879+
sorter = ob_index._drop_level_numbers(drop_levels).argsort()
880+
else:
881+
sorter = ob_index.argsort()
882+
ob_index = ob_index.take(sorter)
883+
_, index = np.unique(sorter, return_index=True)
884+
ob_ids = np.where(ob_ids == -1, -1, index.take(ob_ids))
854885
ob_ids = ensure_platform_int(ob_ids)
855886
return ob_index, ob_ids
856887

pandas/tests/frame/methods/test_value_counts.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,7 @@ def test_data_frame_value_counts_dropna_true(nulls_fixture):
128128
expected = pd.Series(
129129
data=[1, 1],
130130
index=pd.MultiIndex.from_arrays(
131-
[("Beth", "John"), ("Louise", "Smith")], names=["first_name", "middle_name"]
131+
[("John", "Beth"), ("Smith", "Louise")], names=["first_name", "middle_name"]
132132
),
133133
name="count",
134134
)
@@ -156,7 +156,7 @@ def test_data_frame_value_counts_dropna_false(nulls_fixture):
156156
pd.Index(["Anne", "Beth", "John"]),
157157
pd.Index(["Louise", "Smith", np.nan]),
158158
],
159-
codes=[[0, 1, 2, 2], [2, 0, 1, 2]],
159+
codes=[[2, 0, 2, 1], [1, 2, 2, 0]],
160160
names=["first_name", "middle_name"],
161161
),
162162
name="count",

0 commit comments

Comments
 (0)