ENH: groupby.apply for Categorical should preserve categories (closes pandas-dev#10138)

mortada · mortada · commit 659bbec5aa34 · 2015-05-21T10:27:10.000-07:00
diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt
@@ -63,6 +63,7 @@ Bug Fixes
 - Bug in ``Categorical`` repr with ``display.width`` of ``None`` in Python 3 (:issue:`10087`)
 
 
+- Bug in ``groupby.apply`` aggregation for ``Categorical`` not preserving categories (:issue:`10138`)
 - Bug where Panel.from_dict does not set dtype when specified (:issue:`10058`)
 - Bug in ``Timestamp``'s' ``microsecond``, ``quarter``, ``dayofyear``, ``week`` and ``daysinmonth`` properties return ``np.int`` type, not built-in ``int``. (:issue:`10050`)
 - Bug in ``NaT`` raises ``AttributeError`` when accessing to ``daysinmonth``, ``dayofweek`` properties. (:issue:`10096`)
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -2940,7 +2940,8 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
                     cd = 'coerce'
                 else:
                     cd = True
-                return result.convert_objects(convert_dates=cd)
+                result = result.convert_objects(convert_dates=cd)
+                return self._reindex_output(result)
 
             else:
                 # only coerce dates if we find at least 1 datetime
diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
@@ -2596,6 +2596,34 @@ def get_stats(group):
         result = self.df.groupby(cats).D.apply(get_stats)
         self.assertEqual(result.index.names[0], 'C')
 
+    def test_apply_categorical_data(self):
+        # GH 10138
+        dense = Categorical(list('abc'))
+        # 'b' is in the categories but not in the list
+        missing = Categorical(list('aaa'), categories=['a', 'b'])
+        values = np.arange(len(dense))
+        df = DataFrame({'missing': missing,
+                        'dense': dense,
+                        'values': values})
+        grouped = df.groupby(['missing', 'dense'])
+
+        # missing category 'b' should still exist in the output index
+        idx = MultiIndex.from_product([['a', 'b'], ['a', 'b', 'c']],
+                                      names=['missing', 'dense'])
+        expected = DataFrame([0, 1, 2, np.nan, np.nan, np.nan],
+                             index=idx,
+                             columns=['values'])
+
+        assert_frame_equal(grouped.apply(lambda x: np.mean(x)), expected)
+        assert_frame_equal(grouped.mean(), expected)
+        assert_frame_equal(grouped.agg(np.mean), expected)
+
+        # but if apply returns a scalar we should still get the original index
+        idx = MultiIndex.from_product([['a'], ['a', 'b', 'c']],
+                                      names=['missing', 'dense'])
+        expected = Series(1, index=idx)
+        assert_series_equal(grouped.apply(lambda x: 1), expected)
+
     def test_apply_corner_cases(self):
         # #535, can't use sliding iterator