From 27e9dc6c97b6ca772bf3a9502b6c7c9661709db4 Mon Sep 17 00:00:00 2001 From: MarcoGorelli Date: Sun, 8 Mar 2020 10:39:12 +0000 Subject: [PATCH 1/5] sort codes --- pandas/core/groupby/generic.py | 28 ++++------------------------ pandas/core/groupby/grouper.py | 2 +- pandas/core/groupby/ops.py | 17 +++++++++-------- pandas/tests/groupby/test_groupby.py | 24 ++++++++++++++++++++++++ 4 files changed, 38 insertions(+), 33 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index ac522fc7863b2..0cb43e3d5d1e5 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1185,8 +1185,6 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): if len(keys) == 0: return DataFrame(index=keys) - key_names = self.grouper.names - # GH12824. def first_not_none(values): try: @@ -1203,27 +1201,9 @@ def first_not_none(values): elif isinstance(v, DataFrame): return self._concat_objects(keys, values, not_indexed_same=not_indexed_same) elif self.grouper.groupings is not None: - if len(self.grouper.groupings) > 1: - key_index = self.grouper.result_index - - else: - ping = self.grouper.groupings[0] - if len(keys) == ping.ngroups: - key_index = ping.group_index - key_index.name = key_names[0] - - key_lookup = Index(keys) - indexer = key_lookup.get_indexer(key_index) - - # reorder the values - values = [values[i] for i in indexer] - else: - - key_index = Index(keys, name=key_names[0]) - - # don't use the key indexer - if not self.as_index: - key_index = None + key_index = self.grouper.result_index + if not self.as_index: + key_index = None # make Nones an empty object v = first_not_none(values) @@ -1635,7 +1615,7 @@ def _gotitem(self, key, ndim: int, subset=None): raise AssertionError("invalid ndim for _gotitem") def _wrap_frame_output(self, result, obj) -> DataFrame: - result_index = self.grouper.levels[0] + result_index = self.grouper.result_index if self.axis == 0: return DataFrame(result, index=obj.columns, columns=result_index).T diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 21e171f937de8..dc3df010c1eb0 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -412,7 +412,7 @@ def _make_codes(self) -> None: codes = self.grouper.codes_info uniques = self.grouper.result_index else: - codes, uniques = algorithms.factorize(self.grouper, sort=self.sort) + codes, uniques = algorithms.factorize(self.grouper, sort=True) uniques = Index(uniques, name=self.name) self._codes = codes self._group_index = uniques diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 7259268ac3f2b..be7e1fab4b37a 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -39,12 +39,13 @@ from pandas.core.dtypes.missing import _maybe_fill, isna import pandas.core.algorithms as algorithms +from pandas.core.arrays import Categorical from pandas.core.base import SelectionMixin import pandas.core.common as com from pandas.core.frame import DataFrame from pandas.core.generic import NDFrame from pandas.core.groupby import base, grouper -from pandas.core.indexes.api import Index, MultiIndex, ensure_index +from pandas.core.indexes.api import CategoricalIndex, Index, MultiIndex, ensure_index from pandas.core.series import Series from pandas.core.sorting import ( compress_group_index, @@ -141,7 +142,7 @@ def _get_grouper(self): def _get_group_keys(self): if len(self.groupings) == 1: - return self.levels[0] + return self.result_index else: comp_ids, _, ngroups = self.group_info @@ -277,12 +278,13 @@ def codes_info(self) -> np.ndarray: return codes def _get_compressed_codes(self) -> Tuple[np.ndarray, np.ndarray]: + ping = self.groupings[0] all_codes = self.codes - if len(all_codes) > 1: + if len(all_codes) > 1 and not isinstance( + ping.grouper, (Categorical, CategoricalIndex) + ): group_index = get_group_index(all_codes, self.shape, sort=True, xnull=True) return compress_group_index(group_index, sort=self.sort) - - ping = self.groupings[0] return ping.codes, np.arange(len(ping.group_index)) @cache_readonly @@ -297,14 +299,13 @@ def reconstructed_codes(self) -> List[np.ndarray]: @cache_readonly def result_index(self) -> Index: - if not self.compressed and len(self.groupings) == 1: - return self.groupings[0].result_index.rename(self.names[0]) - codes = self.reconstructed_codes levels = [ping.result_index for ping in self.groupings] result = MultiIndex( levels=levels, codes=codes, verify_integrity=False, names=self.names ) + if not self.compressed and len(self.groupings) == 1: + return result.get_level_values(0) return result def get_group_levels(self): diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 5662d41e19885..845af651aa6b1 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2057,3 +2057,27 @@ def test_groups_repr_truncates(max_seq_items, expected): result = df.groupby(np.array(df.a)).groups.__repr__() assert result == expected + + +def test_sort_false_multiindex_lexsorted(): + # GH 32259 + d = pd.to_datetime( + [ + "2020-11-02", + "2019-01-02", + "2020-01-02", + "2020-02-04", + "2020-11-03", + "2019-11-03", + "2019-11-13", + "2019-11-13", + ] + ) + a = np.arange(len(d)) + b = np.random.rand(len(d)) + df = pd.DataFrame({"d": d, "a": a, "b": b}) + t = df.groupby(["d", "a"], sort=False).mean() + assert not t.index.is_lexsorted() + + t = df.groupby(["d", "a"], sort=True).mean() + assert t.index.is_lexsorted() From 7a9604a24a134e11d2a00afeb9620d7b94bed454 Mon Sep 17 00:00:00 2001 From: MarcoGorelli Date: Sun, 8 Mar 2020 11:24:40 +0000 Subject: [PATCH 2/5] fix logical condition --- pandas/core/groupby/ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index be7e1fab4b37a..304ca7667a465 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -280,7 +280,7 @@ def codes_info(self) -> np.ndarray: def _get_compressed_codes(self) -> Tuple[np.ndarray, np.ndarray]: ping = self.groupings[0] all_codes = self.codes - if len(all_codes) > 1 and not isinstance( + if len(all_codes) > 1 or not isinstance( ping.grouper, (Categorical, CategoricalIndex) ): group_index = get_group_index(all_codes, self.shape, sort=True, xnull=True) From a587f63cbbc4b797cb680922d8e0d25dc92a6a2d Mon Sep 17 00:00:00 2001 From: MarcoGorelli Date: Sun, 8 Mar 2020 17:09:45 +0000 Subject: [PATCH 3/5] special-case bingrouper --- pandas/core/groupby/ops.py | 2 +- pandas/core/resample.py | 1 - pandas/tests/groupby/test_grouping.py | 10 +++------- pandas/tests/groupby/test_timegrouper.py | 3 ++- 4 files changed, 6 insertions(+), 10 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 304ca7667a465..c16826b2a9b54 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -281,7 +281,7 @@ def _get_compressed_codes(self) -> Tuple[np.ndarray, np.ndarray]: ping = self.groupings[0] all_codes = self.codes if len(all_codes) > 1 or not isinstance( - ping.grouper, (Categorical, CategoricalIndex) + ping.grouper, (Categorical, CategoricalIndex, BinGrouper) ): group_index = get_group_index(all_codes, self.shape, sort=True, xnull=True) return compress_group_index(group_index, sort=self.sort) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index f19a82ab6f86a..2156c534c4fa2 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -885,7 +885,6 @@ def count(self): result = DataFrame( [], index=result.index, columns=result.columns, dtype="int64" ) - return result def quantile(self, q=0.5, **kwargs): diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index efcd22f9c0c82..f9a6ab13d256d 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -575,16 +575,12 @@ def test_groupby_args(self, mframe): frame.groupby(by=None, level=None) @pytest.mark.parametrize( - "sort,labels", - [ - [True, [2, 2, 2, 0, 0, 1, 1, 3, 3, 3]], - [False, [0, 0, 0, 1, 1, 2, 2, 3, 3, 3]], - ], + "sort", [True, False], ) - def test_level_preserve_order(self, sort, labels, mframe): + def test_level_preserve_order(self, sort, mframe): # GH 17537 grouped = mframe.groupby(level=0, sort=sort) - exp_labels = np.array(labels, np.intp) + exp_labels = np.array([2, 2, 2, 0, 0, 1, 1, 3, 3, 3], np.intp) tm.assert_almost_equal(grouped.grouper.codes[0], exp_labels) def test_grouping_labels(self, mframe): diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index 6b8bd9e805a0c..fc875f73f79d5 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -750,8 +750,9 @@ def test_scalar_call_versus_list_call(self): grouper = pd.Grouper(freq="D") grouped = data_frame.groupby(grouper) + result = grouped.count() + grouped = data_frame.groupby([grouper]) expected = grouped.count() - tm.assert_frame_equal(result, expected) From f94fc8abc81fb483dbeeac42227a32554bc88545 Mon Sep 17 00:00:00 2001 From: MarcoGorelli Date: Sun, 8 Mar 2020 17:10:51 +0000 Subject: [PATCH 4/5] revert file --- pandas/core/resample.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 2156c534c4fa2..f19a82ab6f86a 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -885,6 +885,7 @@ def count(self): result = DataFrame( [], index=result.index, columns=result.columns, dtype="int64" ) + return result def quantile(self, q=0.5, **kwargs): From a76a69927b73d765c5d6768e32a5cde8fe61b397 Mon Sep 17 00:00:00 2001 From: MarcoGorelli Date: Sun, 8 Mar 2020 17:11:34 +0000 Subject: [PATCH 5/5] revert another file --- pandas/tests/groupby/test_timegrouper.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index fc875f73f79d5..6b8bd9e805a0c 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -750,9 +750,8 @@ def test_scalar_call_versus_list_call(self): grouper = pd.Grouper(freq="D") grouped = data_frame.groupby(grouper) - result = grouped.count() - grouped = data_frame.groupby([grouper]) expected = grouped.count() + tm.assert_frame_equal(result, expected)