diff --git a/doc/source/api.rst b/doc/source/api.rst index aa5c58652d550..60e8fc634070e 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -335,6 +335,7 @@ Computations / Descriptive Stats Series.cumsum Series.describe Series.diff + Series.factorize Series.kurt Series.mad Series.max @@ -1040,6 +1041,7 @@ Modifying and Computations Index.diff Index.drop Index.equals + Index.factorize Index.identical Index.insert Index.order diff --git a/doc/source/release.rst b/doc/source/release.rst index 3e6f7bb232156..53abc22cd02f4 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -203,6 +203,7 @@ API Changes ignored (:issue:`6607`) - Produce :class:`~pandas.io.parsers.ParserWarning` on fallback to python parser when no options are ignored (:issue:`6607`) +- Added ``factorize`` functions to ``Index`` and ``Series`` to get indexer and unique values (:issue:`7090`) Deprecations ~~~~~~~~~~~~ @@ -485,6 +486,7 @@ Bug Fixes - Bug in cache coherence with chained indexing and slicing; add ``_is_view`` property to ``NDFrame`` to correctly predict views; mark ``is_copy`` on ``xs` only if its an actual copy (and not a view) (:issue:`7084`) - Bug in DatetimeIndex creation from string ndarray with ``dayfirst=True`` (:issue:`5917`) +- Bug in ``MultiIndex.from_arrays`` where levels created from a ``DatetimeIndex`` did not preserve ``freq`` and ``tz`` (:issue:`7090`) pandas 0.13.1 ------------- diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt index cde6bf3bfd670..7548072f04d1d 100644 --- a/doc/source/v0.14.0.txt +++ b/doc/source/v0.14.0.txt @@ -245,6 +245,7 @@ API changes - add ``inplace`` keyword to ``Series.order/sort`` to make them inverses (:issue:`6859`) - accept ``TextFileReader`` in ``concat``, which was affecting a common user idiom (:issue:`6583`), this was a regression from 0.13.1 +- Added ``factorize`` functions to ``Index`` and ``Series`` to get indexer and unique values (:issue:`7090`) .. 
_whatsnew_0140.sql: diff --git a/pandas/core/base.py b/pandas/core/base.py index 1e9adb60f534e..f614516c87d50 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -319,6 +319,28 @@ def nunique(self): """ return len(self.value_counts()) + def factorize(self, sort=False, na_sentinel=-1): + """ + Encode the object as an enumerated type or categorical variable + + Parameters + ---------- + sort : boolean, default False + Sort by values + na_sentinel : int, default -1 + Value to mark "not found" + + Returns + ------- + labels : integer position of each value in ``uniques`` (``na_sentinel`` where not found) + uniques : the unique Index + """ + from pandas.core.algorithms import factorize + from pandas.core.index import Index + labels, uniques = factorize(self, sort=sort, na_sentinel=na_sentinel) + uniques = Index(uniques) + return labels, uniques + date = _field_accessor('date','Returns numpy array of datetime.date. The date part of the Timestamps') time = _field_accessor('time','Returns numpy array of datetime.time. The time part of the Timestamps') year = _field_accessor('year', "The year of the datetime") diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index b255831e51ae0..ee6f8f1847258 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -80,8 +80,11 @@ def __init__(self, labels, levels=None, name=None): if levels is None: if name is None: name = getattr(labels, 'name', None) - if isinstance(labels, Index) and hasattr(labels, 'factorize'): - labels, levels = labels.factorize() + if hasattr(labels, 'factorize'): + try: + labels, levels = labels.factorize(sort=True) + except TypeError: + labels, levels = labels.factorize(sort=False) else: try: labels, levels = factorize(labels, sort=True) @@ -103,16 +106,7 @@ def from_array(cls, data): Can be an Index or array-like. The levels are assumed to be the unique values of `data`. 
""" - if isinstance(data, Index) and hasattr(data, 'factorize'): - labels, levels = data.factorize() - else: - try: - labels, levels = factorize(data, sort=True) - except TypeError: - labels, levels = factorize(data, sort=False) - - return Categorical(labels, levels, - name=getattr(data, 'name', None)) + return Categorical(data) _levels = None diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index 81b3d4631bfbf..e07b1ff15d26f 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -398,6 +398,48 @@ def test_value_counts_inferred(self): self.assert_numpy_array_equal(td.unique(), expected) self.assertEquals(td.nunique(), 1) + def test_factorize(self): + for o in self.objs: + exp_arr = np.array(range(len(o))) + labels, uniques = o.factorize() + + self.assert_numpy_array_equal(labels, exp_arr) + if isinstance(o, Series): + expected = Index(o.values) + self.assert_numpy_array_equal(uniques, expected) + else: + self.assertTrue(uniques.equals(o)) + + for o in self.objs: + # sort by value, and create duplicates + if isinstance(o, Series): + o.sort() + else: + indexer = o.argsort() + o = o.take(indexer) + n = o[5:].append(o) + + exp_arr = np.array([5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) + labels, uniques = n.factorize(sort=True) + + self.assert_numpy_array_equal(labels, exp_arr) + if isinstance(o, Series): + expected = Index(o.values) + self.assert_numpy_array_equal(uniques, expected) + else: + self.assertTrue(uniques.equals(o)) + + exp_arr = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4]) + labels, uniques = n.factorize(sort=False) + self.assert_numpy_array_equal(labels, exp_arr) + + if isinstance(o, Series): + expected = Index(np.concatenate([o.values[5:10], o.values[:5]])) + self.assert_numpy_array_equal(uniques, expected) + else: + expected = o[5:].append(o[:5]) + self.assertTrue(uniques.equals(expected)) + class TestDatetimeIndexOps(Ops): _allowed = '_allow_datetime_index_ops' diff --git 
a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index e4d7ef2f9a8c6..04e9f238d1dbe 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -10,6 +10,7 @@ from pandas.core.categorical import Categorical from pandas.core.index import Index, Int64Index, MultiIndex from pandas.core.frame import DataFrame +from pandas.tseries.period import PeriodIndex from pandas.util.testing import assert_almost_equal import pandas.core.common as com @@ -180,6 +181,37 @@ def test_empty_print(self): "Index([], dtype=object)") self.assertEqual(repr(factor), expected) + def test_periodindex(self): + idx1 = PeriodIndex(['2014-01', '2014-01', '2014-02', '2014-02', + '2014-03', '2014-03'], freq='M') + cat1 = Categorical.from_array(idx1) + + exp_arr = np.array([0, 0, 1, 1, 2, 2]) + exp_idx = PeriodIndex(['2014-01', '2014-02', '2014-03'], freq='M') + + self.assert_numpy_array_equal(cat1.labels, exp_arr) + self.assert_(cat1.levels.equals(exp_idx)) + + idx2 = PeriodIndex(['2014-03', '2014-03', '2014-02', '2014-01', + '2014-03', '2014-01'], freq='M') + cat2 = Categorical.from_array(idx2) + + exp_arr = np.array([2, 2, 1, 0, 2, 0]) + + self.assert_numpy_array_equal(cat2.labels, exp_arr) + self.assert_(cat2.levels.equals(exp_idx)) + + idx3 = PeriodIndex(['2013-12', '2013-11', '2013-10', '2013-09', + '2013-08', '2013-07', '2013-05'], freq='M') + cat3 = Categorical.from_array(idx3) + + exp_arr = np.array([6, 5, 4, 3, 2, 1, 0]) + exp_idx = PeriodIndex(['2013-05', '2013-07', '2013-08', '2013-09', + '2013-10', '2013-11', '2013-12'], freq='M') + + self.assert_numpy_array_equal(cat3.labels, exp_arr) + self.assert_(cat3.levels.equals(exp_idx)) + if __name__ == '__main__': import nose diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index a6c2bb9f56602..00f7b65f5690e 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -1899,6 +1899,17 @@ def test_multiindex_set_index(self): # it works! 
df.set_index(index) + def test_datetimeindex(self): + idx1 = pd.DatetimeIndex(['2013-04-01 9:00', '2013-04-02 9:00', '2013-04-03 9:00'] * 2, tz='Asia/Tokyo') + idx2 = pd.date_range('2010/01/01', periods=6, freq='M', tz='US/Eastern') + idx = MultiIndex.from_arrays([idx1, idx2]) + + expected1 = pd.DatetimeIndex(['2013-04-01 9:00', '2013-04-02 9:00', '2013-04-03 9:00'], tz='Asia/Tokyo') + + self.assert_(idx.levels[0].equals(expected1)) + self.assert_(idx.levels[1].equals(idx2)) + + if __name__ == '__main__': import nose diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index d9018ad92eb17..b318e18fd6481 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -806,6 +806,19 @@ def to_period(self, freq=None): return PeriodIndex(self.values, freq=freq, tz=self.tz) + def factorize(self, sort=False, na_sentinel=-1): + """ + Index.factorize with handling for DatetimeIndex metadata + + Returns + ------- + labels, uniques : integer labels and the unique values as a DatetimeIndex keeping name, freq and tz + """ + from pandas.core.algorithms import factorize + labels, uniques = factorize(self.asi8, sort=sort, na_sentinel=na_sentinel) + uniques = DatetimeIndex._simple_new(uniques, name=self.name, freq=self.freq, tz=self.tz) + return labels, uniques + def order(self, return_indexer=False, ascending=True): """ Return sorted copy of Index diff --git a/pandas/tseries/period.py b/pandas/tseries/period.py index 6d9e32433cd1e..01a93b712b42c 100644 --- a/pandas/tseries/period.py +++ b/pandas/tseries/period.py @@ -739,15 +739,6 @@ def is_full(self): values = self.values return ((values[1:] - values[:-1]) < 2).all() - def factorize(self): - """ - Specialized factorize that boxes uniques - """ - from pandas.core.algorithms import factorize - labels, uniques = factorize(self.values) - uniques = PeriodIndex(ordinal=uniques, freq=self.freq) - return labels, uniques - @property def freqstr(self): return self.freq diff --git a/pandas/tseries/tests/test_period.py b/pandas/tseries/tests/test_period.py index a6326794c1b12..43a4d4ff1239b 100644 
--- a/pandas/tseries/tests/test_period.py +++ b/pandas/tseries/tests/test_period.py @@ -2175,6 +2175,35 @@ def test_slice_keep_name(self): idx = period_range('20010101', periods=10, freq='D', name='bob') self.assertEqual(idx.name, idx[1:].name) + def test_factorize(self): + idx1 = PeriodIndex(['2014-01', '2014-01', '2014-02', '2014-02', + '2014-03', '2014-03'], freq='M') + + exp_arr = np.array([0, 0, 1, 1, 2, 2]) + exp_idx = PeriodIndex(['2014-01', '2014-02', '2014-03'], freq='M') + + arr, idx = idx1.factorize() + self.assert_numpy_array_equal(arr, exp_arr) + self.assert_(idx.equals(exp_idx)) + + arr, idx = idx1.factorize(sort=True) + self.assert_numpy_array_equal(arr, exp_arr) + self.assert_(idx.equals(exp_idx)) + + idx2 = pd.PeriodIndex(['2014-03', '2014-03', '2014-02', '2014-01', + '2014-03', '2014-01'], freq='M') + + exp_arr = np.array([2, 2, 1, 0, 2, 0]) + arr, idx = idx2.factorize(sort=True) + self.assert_numpy_array_equal(arr, exp_arr) + self.assert_(idx.equals(exp_idx)) + + exp_arr = np.array([0, 0, 1, 2, 0, 2]) + exp_idx = PeriodIndex(['2014-03', '2014-02', '2014-01'], freq='M') + arr, idx = idx2.factorize() + self.assert_numpy_array_equal(arr, exp_arr) + self.assert_(idx.equals(exp_idx)) + def _permute(obj): return obj.take(np.random.permutation(len(obj))) diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index 7690f118af482..0c0e7692b7d4c 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -2189,6 +2189,51 @@ def test_join_with_period_index(self): 'PeriodIndex-ed objects'): df.columns.join(s.index, how=join) + def test_factorize(self): + idx1 = DatetimeIndex(['2014-01', '2014-01', '2014-02', + '2014-02', '2014-03', '2014-03']) + + exp_arr = np.array([0, 0, 1, 1, 2, 2]) + exp_idx = DatetimeIndex(['2014-01', '2014-02', '2014-03']) + + arr, idx = idx1.factorize() + self.assert_numpy_array_equal(arr, exp_arr) + self.assert_(idx.equals(exp_idx)) + + arr, idx = 
idx1.factorize(sort=True) + self.assert_numpy_array_equal(arr, exp_arr) + self.assert_(idx.equals(exp_idx)) + + # tz must be preserved + idx1 = idx1.tz_localize('Asia/Tokyo') + exp_idx = exp_idx.tz_localize('Asia/Tokyo') + + arr, idx = idx1.factorize() + self.assert_numpy_array_equal(arr, exp_arr) + self.assert_(idx.equals(exp_idx)) + + idx2 = pd.DatetimeIndex(['2014-03', '2014-03', '2014-02', '2014-01', + '2014-03', '2014-01']) + + exp_arr = np.array([2, 2, 1, 0, 2, 0]) + exp_idx = DatetimeIndex(['2014-01', '2014-02', '2014-03']) + arr, idx = idx2.factorize(sort=True) + self.assert_numpy_array_equal(arr, exp_arr) + self.assert_(idx.equals(exp_idx)) + + exp_arr = np.array([0, 0, 1, 2, 0, 2]) + exp_idx = DatetimeIndex(['2014-03', '2014-02', '2014-01']) + arr, idx = idx2.factorize() + self.assert_numpy_array_equal(arr, exp_arr) + self.assert_(idx.equals(exp_idx)) + + # freq must be preserved + idx3 = date_range('2000-01', periods=4, freq='M', tz='Asia/Tokyo') + exp_arr = np.array([0, 1, 2, 3]) + arr, idx = idx3.factorize() + self.assert_numpy_array_equal(arr, exp_arr) + self.assert_(idx.equals(idx3)) + class TestDatetime64(tm.TestCase): """