def shift(self, periods):
    """
    Move the values of this Categorical by ``periods`` positions.

    Positions vacated by the move get the code ``-1``; categories and the
    ordered flag are carried over unchanged.

    Parameters
    ----------
    periods : int
        Number of positions to move. Positive values move entries towards
        the end, negative values towards the start.

    Returns
    -------
    shifted : Categorical
        Built with ``Categorical.from_codes`` from the moved codes.

    Raises
    ------
    NotImplementedError
        If the codes have more than one dimension.
    """
    # No axis argument is offered: anything that is not one-dimensional
    # is rejected right below.
    moved = self.codes
    if moved.ndim > 1:
        raise NotImplementedError("Categorical with ndim > 1.")

    if moved.size and periods != 0:
        # np.roll wraps entries around the ends, so the wrapped-in
        # positions are overwritten with -1 afterwards.
        moved = np.roll(moved, com._ensure_platform_int(periods), axis=0)
        if periods > 0:
            vacated = slice(None, periods)
        else:
            vacated = slice(periods, None)
        moved[vacated] = -1

    return Categorical.from_codes(moved,
                                  ordered=self.ordered,
                                  categories=self.categories)
diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 42d7163e7f741..0c18ff641c269 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1709,6 +1709,10 @@ def interpolate(self, method='pad', axis=0, inplace=False, limit=limit), placement=self.mgr_locs) + def shift(self, periods, axis=0): + return self.make_block_same_class(values=self.values.shift(periods), + placement=self.mgr_locs) + def take_nd(self, indexer, axis=0, new_mgr_locs=None, fill_tuple=None): """ Take values according to indexer and return them as a block.bb diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 2c1a4fd43e57f..5f3ff794b4900 100755 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -1080,6 +1080,26 @@ def test_set_item_nan(self): exp = np.array([0,1,3,2]) self.assert_numpy_array_equal(cat.codes, exp) + def test_shift(self): + # GH 9416 + cat = pd.Categorical(['a', 'b', 'c', 'd', 'a']) + + # shift forward + sp1 = cat.shift(1) + xp1 = pd.Categorical([np.nan, 'a', 'b', 'c', 'd']) + self.assert_categorical_equal(sp1, xp1) + self.assert_categorical_equal(cat[:-1], sp1[1:]) + + # shift back + sn2 = cat.shift(-2) + xp2 = pd.Categorical(['c', 'd', 'a', np.nan, np.nan], + categories=['a', 'b', 'c', 'd']) + self.assert_categorical_equal(sn2, xp2) + self.assert_categorical_equal(cat[2:], sn2[:-2]) + + # shift by zero + self.assert_categorical_equal(cat, cat.shift(0)) + def test_nbytes(self): cat = pd.Categorical([1,2,3]) exp = cat._codes.nbytes + cat._categories.values.nbytes diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index ae6102751fb41..8bdd493e3d841 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -10360,6 +10360,15 @@ def test_shift_bool(self): columns=['high', 'low']) assert_frame_equal(rs, xp) + def test_shift_categorical(self): + # GH 9416 + s1 = pd.Series(['a', 'b', 'c'], dtype='category') + s2 = pd.Series(['A', 'B', 'C'], 
def test_shift_categorical(self):
    """Check Series.shift on categorical data (GH 9416)."""
    ser = pd.Series(['a', 'b', 'c', 'd'], dtype='category')

    # forward then back again: only the last value is lost
    round_trip = ser.shift(1).shift(-1).valid()
    assert_series_equal(ser.iloc[:-1], round_trip)

    # one step forward: index unchanged, first code is -1, rest moved up
    forward = ser.shift(1)
    assert_index_equal(ser.index, forward.index)
    self.assertTrue(np.all(forward.values.codes[:1] == -1))
    self.assertTrue(np.all(ser.values.codes[:-1] == forward.values.codes[1:]))

    # two steps back: index unchanged, last two codes are -1
    backward = ser.shift(-2)
    assert_index_equal(ser.index, backward.index)
    self.assertTrue(np.all(backward.values.codes[-2:] == -1))
    self.assertTrue(np.all(ser.values.codes[2:] == backward.values.codes[:-2]))

    # the categories are the same after either shift
    for shifted in (forward, backward):
        assert_index_equal(ser.values.categories, shifted.values.categories)