diff --git a/doc/source/whatsnew/v0.16.2.txt b/doc/source/whatsnew/v0.16.2.txt index b571aab0b19a5..1a8fc90b9683f 100644 --- a/doc/source/whatsnew/v0.16.2.txt +++ b/doc/source/whatsnew/v0.16.2.txt @@ -55,7 +55,7 @@ Bug Fixes multi-indexed (:issue:`7212`) - Bug in ``Categorical`` repr with ``display.width`` of ``None`` in Python 3 (:issue:`10087`) - +- Bug in groupby.apply aggregation for Categorical not preserving categories (:issue:`10138`) - Bug in ``mean()`` where integer dtypes can overflow (:issue:`10172`) - Bug where Panel.from_dict does not set dtype when specified (:issue:`10058`) - Bug in ``Index.union`` raises ``AttributeError`` when passing array-likes. (:issue:`10149`) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 51674bad60f5b..4abdd1112c721 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -2944,7 +2944,8 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): cd = 'coerce' else: cd = True - return result.convert_objects(convert_dates=cd) + result = result.convert_objects(convert_dates=cd) + return self._reindex_output(result) else: # only coerce dates if we find at least 1 datetime diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 0789e20df3945..ab78bd63a7c94 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -2595,6 +2595,35 @@ def get_stats(group): result = self.df.groupby(cats).D.apply(get_stats) self.assertEqual(result.index.names[0], 'C') + def test_apply_categorical_data(self): + # GH 10138 + for ordered in [True, False]: + dense = Categorical(list('abc'), ordered=ordered) + # 'b' is a declared category but never occurs in the values + missing = Categorical(list('aaa'), categories=['a', 'b'], ordered=ordered) + values = np.arange(len(dense)) + df = DataFrame({'missing': missing, + 'dense': dense, + 'values': values}) + grouped = df.groupby(['missing', 'dense']) + + # missing category 'b' should still exist in the output index + idx = 
MultiIndex.from_product([['a', 'b'], ['a', 'b', 'c']], + names=['missing', 'dense']) + expected = DataFrame([0, 1, 2, np.nan, np.nan, np.nan], + index=idx, + columns=['values']) + + assert_frame_equal(grouped.apply(lambda x: np.mean(x)), expected) + assert_frame_equal(grouped.mean(), expected) + assert_frame_equal(grouped.agg(np.mean), expected) + + # but apply returning a constant keeps only the observed groups + idx = MultiIndex.from_product([['a'], ['a', 'b', 'c']], + names=['missing', 'dense']) + expected = Series(1, index=idx) + assert_series_equal(grouped.apply(lambda x: 1), expected) + def test_apply_corner_cases(self): # #535, can't use sliding iterator