diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 03ff62568b405..59f55914ea4d3 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -492,6 +492,43 @@ def time_groupby_sum(self): self.df.groupby(['a'])['b'].sum() +class groupby_categorical(object): + goal_time = 0.2 + + def setup(self): + N = 100000 + arr = np.random.random(N) + + self.df = DataFrame(dict( + a=Categorical(np.random.randint(10000, size=N)), + b=arr)) + self.df_ordered = DataFrame(dict( + a=Categorical(np.random.randint(10000, size=N), ordered=True), + b=arr)) + self.df_extra_cat = DataFrame(dict( + a=Categorical(np.random.randint(100, size=N), + categories=np.arange(10000)), + b=arr)) + + def time_groupby_sort(self): + self.df.groupby('a')['b'].count() + + def time_groupby_nosort(self): + self.df.groupby('a', sort=False)['b'].count() + + def time_groupby_ordered_sort(self): + self.df_ordered.groupby('a')['b'].count() + + def time_groupby_ordered_nosort(self): + self.df_ordered.groupby('a', sort=False)['b'].count() + + def time_groupby_extra_cat_sort(self): + self.df_extra_cat.groupby('a')['b'].count() + + def time_groupby_extra_cat_nosort(self): + self.df_extra_cat.groupby('a', sort=False)['b'].count() + + class groupby_period(object): # GH 14338 goal_time = 0.2 diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index ae4a3d3c3d97f..fb8a708849abc 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -120,6 +120,42 @@ Notably, a new numerical index, ``UInt64Index``, has been created (:issue:`14937 - Bug in ``pd.unique()`` in which unsigned 64-bit integers were causing overflow (:issue:`14915`) - Bug in ``pd.value_counts()`` in which unsigned 64-bit integers were being erroneously truncated in the output (:issue:`14934`) +.. 
_whatsnew_0200.enhancements.groupby_categorical: + +GroupBy on Categoricals +^^^^^^^^^^^^^^^^^^^^^^^ + +In previous versions, ``.groupby(..., sort=False)`` would fail with a ``ValueError`` when grouping on a categorical series with some categories not appearing in the data. (:issue:`13179`) + +Now, it works. + +.. ipython:: python + + chromosomes = np.r_[np.arange(1, 23).astype(str), ['X', 'Y']] + df = pd.DataFrame({ + 'A': np.random.randint(100), + 'B': np.random.randint(100), + 'C': np.random.randint(100), + 'chromosomes': pd.Categorical(np.random.choice(chromosomes, 100), + categories=chromosomes, + ordered=True)}) + +Previous Behavior: + +.. code-block:: ipython + + In [3]: df[df.chromosomes != '1'].groupby('chromosomes', sort=False).sum() + --------------------------------------------------------------------------- + ValueError Traceback (most recent call last) + ... + ValueError: items in new_categories are not the same as in old categories + +New Behavior: + +.. ipython:: python + + df[df.chromosomes != '1'].groupby('chromosomes', sort=False).sum() + .. _whatsnew_0200.enhancements.other: Other enhancements @@ -160,7 +196,6 @@ Other enhancements .. _whatsnew_0200.api_breaking: - Backwards incompatible API changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 491db2e080953..c188f04d23873 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -602,6 +602,44 @@ def _get_categories(self): categories = property(fget=_get_categories, fset=_set_categories, doc=_categories_doc) + def _codes_for_groupby(self, sort): + """ + If sort=False, return a copy of self, coded with categories as + returned by .unique(), followed by any categories not appearing in + the data. If sort=True, return self. + + This method is needed solely to ensure the categorical index of the + GroupBy result has categories in the order of appearance in the data + (GH-8868). 
+ + Parameters + ---------- + sort : boolean + The value of the sort parameter groupby was called with. + + Returns + ------- + Categorical + If sort=False, the new categories are set to the order of + appearance in codes (unless ordered=True, in which case the + original order is preserved), followed by any unrepresented + categories in the original order. + """ + if sort: + # Already sorted according to self.categories; all is fine + return self + + # sort=False should order groups in as-encountered order (GH-8868) + cat = self.unique() + # But for groupby to work, all categories should be present, + # including those missing from the data (GH-13179), which .unique() + # above dropped + cat.add_categories( + self.categories[~self.categories.isin(cat.categories)], + inplace=True) + cat = self.reorder_categories(cat.categories) + return cat + _ordered = None def set_ordered(self, value, inplace=False): diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index ba2de295fa0a9..0b3fcba1c1ba5 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -2300,23 +2300,7 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None, # a passed Categorical elif is_categorical_dtype(self.grouper): - # must have an ordered categorical - if self.sort: - if not self.grouper.ordered: - - # technically we cannot group on an unordered - # Categorical - # but this a user convenience to do so; the ordering - # is preserved and if it's a reduction it doesn't make - # any difference - pass - - # fix bug #GH8868 sort=False being ignored in categorical - # groupby - else: - cat = self.grouper.unique() - self.grouper = self.grouper.reorder_categories( - cat.categories) + self.grouper = self.grouper._codes_for_groupby(self.sort) # we make a CategoricalIndex out of the cat grouper # preserving the categories / ordered attributes diff --git a/pandas/indexes/category.py b/pandas/indexes/category.py index acb2758641a62..5299a094156cd 100644 --- 
a/pandas/indexes/category.py +++ b/pandas/indexes/category.py @@ -550,6 +550,10 @@ def _append_same_dtype(self, to_concat, name): result.name = name return result + def _codes_for_groupby(self, sort): + """ Return a Categorical adjusted for groupby """ + return self.values._codes_for_groupby(sort) + @classmethod def _add_comparison_methods(cls): """ add in comparison methods """ diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index eebd0e0f490c1..cfcb531bedab8 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -284,6 +284,30 @@ def test_groupby_multi_categorical_as_index(self): tm.assert_frame_equal(result, expected, check_index_type=True) + def test_groupby_preserve_categories(self): + # GH-13179 + categories = list('abc') + + # ordered=True + df = DataFrame({'A': pd.Categorical(list('ba'), + categories=categories, + ordered=True)}) + index = pd.CategoricalIndex(categories, categories, ordered=True) + tm.assert_index_equal(df.groupby('A', sort=True).first().index, index) + tm.assert_index_equal(df.groupby('A', sort=False).first().index, index) + + # ordered=False + df = DataFrame({'A': pd.Categorical(list('ba'), + categories=categories, + ordered=False)}) + sort_index = pd.CategoricalIndex(categories, categories, ordered=False) + nosort_index = pd.CategoricalIndex(list('bac'), list('bac'), + ordered=False) + tm.assert_index_equal(df.groupby('A', sort=True).first().index, + sort_index) + tm.assert_index_equal(df.groupby('A', sort=False).first().index, + nosort_index) + def test_groupby_preserve_categorical_dtype(self): # GH13743, GH13854 df = DataFrame({'A': [1, 2, 1, 1, 2],