diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt index 322f431a37a79..83e5ec5b1d107 100644 --- a/doc/source/whatsnew/v0.17.0.txt +++ b/doc/source/whatsnew/v0.17.0.txt @@ -303,7 +303,22 @@ Other API Changes - Allow passing `kwargs` to the interpolation methods (:issue:`10378`). - Serialize metadata properties of subclasses of pandas objects (:issue:`10553`). - ``Categorical.name`` was removed to make `Categorical` more ``numpy.ndarray`` like. Use ``Series(cat, name="whatever")`` instead (:issue:`10482`). +- ``Categorical.unique`` now returns a new ``Categorical`` whose ``categories`` and ``codes`` are unique, rather than returning ``np.array`` (:issue:`10508`) + - unordered category: values and categories are sorted by appearance order. + - ordered category: values are sorted by appearance order, categories keep existing order. + +.. ipython :: python + + cat = pd.Categorical(['C', 'A', 'B', 'C'], categories=['A', 'B', 'C'], ordered=True) + cat + cat.unique() + + cat = pd.Categorical(['C', 'A', 'B', 'C'], categories=['A', 'B', 'C']) + cat + cat.unique() + +- ``groupby`` using ``Categorical`` follows the same rule as ``Categorical.unique`` described above (:issue:`10508`) - ``NaT``'s methods now either raise ``ValueError``, or return ``np.nan`` or ``NaT`` (:issue:`9513`) =============================== ============================================================== @@ -365,6 +380,9 @@ Bug Fixes - Bug in ``DataFrame.interpolate`` with ``axis=1`` and ``inplace=True`` (:issue:`10395`) - Bug in ``io.sql.get_schema`` when specifying multiple columns as primary key (:issue:`10385`). 
+ +- Bug in ``groupby(sort=False)`` with datetime-like ``Categorical`` raises ``ValueError`` (:issue:`10505`) + - Bug in ``test_categorical`` on big-endian builds (:issue:`10425`) - Bug in ``Series.shift`` and ``DataFrame.shift`` not supporting categorical data (:issue:`9416`) - Bug in ``Series.map`` using categorical ``Series`` raises ``AttributeError`` (:issue:`10324`) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 1d1f0d7da80e4..1604705ff824a 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -1558,19 +1558,30 @@ def mode(self): def unique(self): """ - Return the unique values. + Return the ``Categorical`` which ``categories`` and ``codes`` are unique. + Unused categories are NOT returned. - Unused categories are NOT returned. Unique values are returned in order - of appearance. + - unordered category: values and categories are sorted by appearance + order. + - ordered category: values are sorted by appearance order, categories + keeps existing order. 
Returns ------- - unique values : array + unique values : ``Categorical`` """ + from pandas.core.nanops import unique1d # unlike np.unique, unique1d does not sort unique_codes = unique1d(self.codes) - return take_1d(self.categories.values, unique_codes) + cat = self.copy() + # keep nan in codes + cat._codes = unique_codes + # exclude nan from indexer for categories + take_codes = unique_codes[unique_codes != -1] + if self.ordered: + take_codes = sorted(take_codes) + return cat.set_categories(cat.categories.take(take_codes)) def equals(self, other): """ diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index c01c6104ab904..2ed5774bdbec6 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1959,7 +1959,8 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None, # fix bug #GH8868 sort=False being ignored in categorical groupby else: - self.grouper = self.grouper.reorder_categories(self.grouper.unique()) + cat = self.grouper.unique() + self.grouper = self.grouper.reorder_categories(cat.categories) # we make a CategoricalIndex out of the cat grouper # preserving the categories / ordered attributes diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index fdd20af6ab6ce..41c487adc0d6e 100755 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -958,20 +958,59 @@ def test_min_max(self): self.assertEqual(_max, 1) def test_unique(self): - cat = Categorical(["a","b"]) - exp = np.asarray(["a","b"]) + # categories are reordered based on value when ordered=False + cat = Categorical(["a", "b"]) + exp = np.asarray(["a", "b"]) res = cat.unique() self.assert_numpy_array_equal(res, exp) - cat = Categorical(["a","b","a","a"], categories=["a","b","c"]) + cat = Categorical(["a", "b", "a", "a"], categories=["a", "b", "c"]) res = cat.unique() self.assert_numpy_array_equal(res, exp) + tm.assert_categorical_equal(res, Categorical(exp)) - # unique should not sort - cat = Categorical(["b", 
"b", np.nan, "a"], categories=["a","b","c"]) + cat = Categorical(["c", "a", "b", "a", "a"], categories=["a", "b", "c"]) + exp = np.asarray(["c", "a", "b"]) + res = cat.unique() + self.assert_numpy_array_equal(res, exp) + tm.assert_categorical_equal(res, Categorical(exp, categories=['c', 'a', 'b'])) + + # nan must be removed + cat = Categorical(["b", np.nan, "b", np.nan, "a"], categories=["a", "b", "c"]) res = cat.unique() exp = np.asarray(["b", np.nan, "a"], dtype=object) self.assert_numpy_array_equal(res, exp) + tm.assert_categorical_equal(res, Categorical(["b", np.nan, "a"], categories=["b", "a"])) + + def test_unique_ordered(self): + # keep categories order when ordered=True + cat = Categorical(['b', 'a', 'b'], categories=['a', 'b'], ordered=True) + res = cat.unique() + exp = np.asarray(['b', 'a']) + exp_cat = Categorical(exp, categories=['a', 'b'], ordered=True) + self.assert_numpy_array_equal(res, exp) + tm.assert_categorical_equal(res, exp_cat) + + cat = Categorical(['c', 'b', 'a', 'a'], categories=['a', 'b', 'c'], ordered=True) + res = cat.unique() + exp = np.asarray(['c', 'b', 'a']) + exp_cat = Categorical(exp, categories=['a', 'b', 'c'], ordered=True) + self.assert_numpy_array_equal(res, exp) + tm.assert_categorical_equal(res, exp_cat) + + cat = Categorical(['b', 'a', 'a'], categories=['a', 'b', 'c'], ordered=True) + res = cat.unique() + exp = np.asarray(['b', 'a']) + exp_cat = Categorical(exp, categories=['a', 'b'], ordered=True) + self.assert_numpy_array_equal(res, exp) + tm.assert_categorical_equal(res, exp_cat) + + cat = Categorical(['b', 'b', np.nan, 'a'], categories=['a', 'b', 'c'], ordered=True) + res = cat.unique() + exp = np.asarray(['b', np.nan, 'a'], dtype=object) + exp_cat = Categorical(exp, categories=['a', 'b'], ordered=True) + self.assert_numpy_array_equal(res, exp) + tm.assert_categorical_equal(res, exp_cat) def test_mode(self): s = Categorical([1,1,2,4,5,5,5], categories=[5,4,3,2,1], ordered=True) diff --git a/pandas/tests/test_groupby.py 
b/pandas/tests/test_groupby.py index a73f4e2939578..f1df0d711b5d0 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -3413,7 +3413,8 @@ def test_groupby_sort_categorical(self): col = 'range' assert_frame_equal(result_sort, df.groupby(col, sort=True).first()) - assert_frame_equal(result_nosort, df.groupby(col, sort=False).first()) + # when categories is ordered, group is ordered by category's order + assert_frame_equal(result_sort, df.groupby(col, sort=False).first()) df['range'] = Categorical(df['range'],ordered=False) index = Index(['(0, 2.5]', '(2.5, 5]', '(5, 7.5]', '(7.5, 10]'], dtype='object') @@ -3431,6 +3432,55 @@ def test_groupby_sort_categorical(self): assert_frame_equal(result_sort, df.groupby(col, sort=True).first()) assert_frame_equal(result_nosort, df.groupby(col, sort=False).first()) + def test_groupby_sort_categorical_datetimelike(self): + # GH10505 + + # use same data as test_groupby_sort_categorical, which category is + # corresponding to datetime.month + df = DataFrame({'dt': [datetime(2011, 7, 1), datetime(2011, 7, 1), + datetime(2011, 2, 1), datetime(2011, 5, 1), + datetime(2011, 2, 1), datetime(2011, 1, 1), + datetime(2011, 5, 1)], + 'foo': [10, 8, 5, 6, 4, 1, 7], + 'bar': [10, 20, 30, 40, 50, 60, 70]}, + columns=['dt', 'foo', 'bar']) + + # ordered=True + df['dt'] = Categorical(df['dt'], ordered=True) + index = [datetime(2011, 1, 1), datetime(2011, 2, 1), + datetime(2011, 5, 1), datetime(2011, 7, 1)] + result_sort = DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]], columns=['foo', 'bar']) + result_sort.index = CategoricalIndex(index, name='dt', ordered=True) + + index = [datetime(2011, 7, 1), datetime(2011, 2, 1), + datetime(2011, 5, 1), datetime(2011, 1, 1)] + result_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]], + columns=['foo', 'bar']) + result_nosort.index = CategoricalIndex(index, categories=index, + name='dt', ordered=True) + + col = 'dt' + assert_frame_equal(result_sort, df.groupby(col, 
sort=True).first()) + # when categories is ordered, group is ordered by category's order + assert_frame_equal(result_sort, df.groupby(col, sort=False).first()) + + # ordered = False + df['dt'] = Categorical(df['dt'], ordered=False) + index = [datetime(2011, 1, 1), datetime(2011, 2, 1), + datetime(2011, 5, 1), datetime(2011, 7, 1)] + result_sort = DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]], columns=['foo', 'bar']) + result_sort.index = CategoricalIndex(index, name='dt') + + index = [datetime(2011, 7, 1), datetime(2011, 2, 1), + datetime(2011, 5, 1), datetime(2011, 1, 1)] + result_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]], + columns=['foo', 'bar']) + result_nosort.index = CategoricalIndex(index, categories=index, name='dt') + + col = 'dt' + assert_frame_equal(result_sort, df.groupby(col, sort=True).first()) + assert_frame_equal(result_nosort, df.groupby(col, sort=False).first()) + def test_groupby_sort_multiindex_series(self): # series multiindex groupby sort argument was not being passed through _compress_group_index