pandas-dev · jreback · May 10, 2014 · May 6, 2014
diff --git a/doc/source/api.rst b/doc/source/api.rst
@@ -335,6 +335,7 @@ Computations / Descriptive Stats
    Series.cumsum
    Series.describe
    Series.diff
+   Series.factorize
    Series.kurt
    Series.mad
    Series.max
@@ -1040,6 +1041,7 @@ Modifying and Computations
    Index.diff
    Index.drop
    Index.equals
+   Index.factorize
    Index.identical
    Index.insert
    Index.order

diff --git a/doc/source/release.rst b/doc/source/release.rst
@@ -203,6 +203,7 @@ API Changes
   ignored (:issue:`6607`)
 - Produce :class:`~pandas.io.parsers.ParserWarning` on fallback to python
   parser when no options are ignored (:issue:`6607`)
+- Added a ``factorize`` method to ``Index`` and ``Series`` that returns the indexer and the unique values (:issue:`7090`)
 
 Deprecations
 ~~~~~~~~~~~~
@@ -485,6 +486,7 @@ Bug Fixes
 - Bug in cache coherence with chained indexing and slicing; add ``_is_view`` property to ``NDFrame`` to correctly predict
   views; mark ``is_copy`` on ``xs` only if its an actual copy (and not a view) (:issue:`7084`)
 - Bug in DatetimeIndex creation from string ndarray with ``dayfirst=True`` (:issue:`5917`)
+- Bug where a ``MultiIndex`` created by ``MultiIndex.from_arrays`` from a ``DatetimeIndex`` did not preserve ``freq`` and ``tz`` (:issue:`7090`)
 
 pandas 0.13.1
 -------------

diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt
@@ -245,6 +245,7 @@ API changes
 - add ``inplace`` keyword to ``Series.order/sort`` to make them inverses (:issue:`6859`)
 - accept ``TextFileReader`` in ``concat``, which was affecting a common user idiom (:issue:`6583`), this was a regression
   from 0.13.1
+- Added a ``factorize`` method to ``Index`` and ``Series`` that returns the indexer and the unique values (:issue:`7090`)
 
 .. _whatsnew_0140.sql:
 

diff --git a/pandas/core/base.py b/pandas/core/base.py
@@ -319,6 +319,28 @@ def nunique(self):
         """
         return len(self.value_counts())
 
+    def factorize(self, sort=False, na_sentinel=-1):
+        """
+        Encode the object as an enumerated type or categorical variable
+
+        Parameters
+        ----------
+        sort : boolean, default False
+            Sort by values
+        na_sentinel : int, default -1
+            Value to mark "not found"
+
+        Returns
+        -------
+        labels : the indexer to the original array
+        uniques : the unique Index
+        """
+        from pandas.core.algorithms import factorize
+        from pandas.core.index import Index
+        labels, uniques = factorize(self, sort=sort, na_sentinel=na_sentinel)
+        uniques = Index(uniques)
+        return labels, uniques
+
     date = _field_accessor('date','Returns numpy array of datetime.date. The date part of the Timestamps')
     time = _field_accessor('time','Returns numpy array of datetime.time. The time part of the Timestamps')
     year = _field_accessor('year', "The year of the datetime")

diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
@@ -80,8 +80,11 @@ def __init__(self, labels, levels=None, name=None):
         if levels is None:
             if name is None:
                 name = getattr(labels, 'name', None)
-            if isinstance(labels, Index) and hasattr(labels, 'factorize'):
-                labels, levels = labels.factorize()
+            if hasattr(labels, 'factorize'):
+                try:
+                    labels, levels = labels.factorize(sort=True)
+                except TypeError:
+                    labels, levels = labels.factorize(sort=False)
             else:
                 try:
                     labels, levels = factorize(labels, sort=True)
@@ -103,16 +106,7 @@ def from_array(cls, data):
             Can be an Index or array-like. The levels are assumed to be
             the unique values of `data`.
         """
-        if isinstance(data, Index) and hasattr(data, 'factorize'):
-            labels, levels = data.factorize()
-        else:
-            try:
-                labels, levels = factorize(data, sort=True)
-            except TypeError:
-                labels, levels = factorize(data, sort=False)
-
-        return Categorical(labels, levels,
-                           name=getattr(data, 'name', None))
+        return Categorical(data)
 
     _levels = None
 

diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py
@@ -398,6 +398,48 @@ def test_value_counts_inferred(self):
             self.assert_numpy_array_equal(td.unique(), expected)
             self.assertEquals(td.nunique(), 1)
 
+    def test_factorize(self):
+        for o in self.objs:
+            exp_arr = np.array(range(len(o)))
+            labels, uniques = o.factorize()
+
+            self.assert_numpy_array_equal(labels, exp_arr)
+            if isinstance(o, Series):
+                expected = Index(o.values)
+                self.assert_numpy_array_equal(uniques, expected)
+            else:
+                self.assertTrue(uniques.equals(o))
+
+        for o in self.objs:
+            # sort by value, and create duplicates
+            if isinstance(o, Series):
+                o.sort()
+            else:
+                indexer = o.argsort()
+                o = o.take(indexer)
+            n = o[5:].append(o)
+
+            exp_arr = np.array([5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
+            labels, uniques = n.factorize(sort=True)
+
+            self.assert_numpy_array_equal(labels, exp_arr)
+            if isinstance(o, Series):
+                expected = Index(o.values)
+                self.assert_numpy_array_equal(uniques, expected)
+            else:
+                self.assertTrue(uniques.equals(o))
+
+            exp_arr = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4])
+            labels, uniques = n.factorize(sort=False)
+            self.assert_numpy_array_equal(labels, exp_arr)
+
+            if isinstance(o, Series):
+                expected = Index(np.concatenate([o.values[5:10], o.values[:5]]))
+                self.assert_numpy_array_equal(uniques, expected)
+            else:
+                expected = o[5:].append(o[:5])
+                self.assertTrue(uniques.equals(expected))
+
 
 class TestDatetimeIndexOps(Ops):
     _allowed = '_allow_datetime_index_ops'

diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py
@@ -10,6 +10,7 @@
 from pandas.core.categorical import Categorical
 from pandas.core.index import Index, Int64Index, MultiIndex
 from pandas.core.frame import DataFrame
+from pandas.tseries.period import PeriodIndex
 from pandas.util.testing import assert_almost_equal
 import pandas.core.common as com
 
@@ -180,6 +181,37 @@ def test_empty_print(self):
                     "Index([], dtype=object)")
         self.assertEqual(repr(factor), expected)
 
+    def test_periodindex(self):
+        idx1 = PeriodIndex(['2014-01', '2014-01', '2014-02', '2014-02',
+                               '2014-03', '2014-03'], freq='M')
+        cat1 = Categorical.from_array(idx1)
+
+        exp_arr = np.array([0, 0, 1, 1, 2, 2])
+        exp_idx = PeriodIndex(['2014-01', '2014-02', '2014-03'], freq='M')
+
+        self.assert_numpy_array_equal(cat1.labels, exp_arr)
+        self.assert_(cat1.levels.equals(exp_idx))
+
+        idx2 = PeriodIndex(['2014-03', '2014-03', '2014-02', '2014-01',
+                               '2014-03', '2014-01'], freq='M')
+        cat2 = Categorical.from_array(idx2)
+
+        exp_arr = np.array([2, 2, 1, 0, 2, 0])
+
+        self.assert_numpy_array_equal(cat2.labels, exp_arr)
+        self.assert_(cat2.levels.equals(exp_idx))
+
+        idx3 = PeriodIndex(['2013-12', '2013-11', '2013-10', '2013-09',
+                            '2013-08', '2013-07', '2013-05'], freq='M')
+        cat3 = Categorical.from_array(idx3)
+
+        exp_arr = np.array([6, 5, 4, 3, 2, 1, 0])
+        exp_idx = PeriodIndex(['2013-05', '2013-07', '2013-08', '2013-09',
+                               '2013-10', '2013-11', '2013-12'], freq='M')
+
+        self.assert_numpy_array_equal(cat3.labels, exp_arr)
+        self.assert_(cat3.levels.equals(exp_idx))
+
 
 if __name__ == '__main__':
     import nose

diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py
@@ -1899,6 +1899,17 @@ def test_multiindex_set_index(self):
         # it works!
         df.set_index(index)
 
+    def test_datetimeindex(self):
+        idx1 = pd.DatetimeIndex(['2013-04-01 9:00', '2013-04-02 9:00', '2013-04-03 9:00'] * 2, tz='Asia/Tokyo')
+        idx2 = pd.date_range('2010/01/01', periods=6, freq='M', tz='US/Eastern')
+        idx = MultiIndex.from_arrays([idx1, idx2])
+
+        expected1 = pd.DatetimeIndex(['2013-04-01 9:00', '2013-04-02 9:00', '2013-04-03 9:00'], tz='Asia/Tokyo')
+
+        self.assert_(idx.levels[0].equals(expected1))
+        self.assert_(idx.levels[1].equals(idx2))
+
+
 if __name__ == '__main__':
 
     import nose

diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py
@@ -806,6 +806,19 @@ def to_period(self, freq=None):
 
         return PeriodIndex(self.values, freq=freq, tz=self.tz)
 
+    def factorize(self, sort=False, na_sentinel=-1):
+        """
+        Index.factorize with handling for DatetimeIndex metadata
+
+        Returns
+        -------
+        (labels, uniques) : the indexer and a DatetimeIndex of the unique values
+        """
+        from pandas.core.algorithms import factorize
+        labels, uniques = factorize(self.asi8, sort=sort, na_sentinel=na_sentinel)
+        uniques = DatetimeIndex._simple_new(uniques, name=self.name, freq=self.freq, tz=self.tz)
+        return labels, uniques
+
     def order(self, return_indexer=False, ascending=True):
         """
         Return sorted copy of Index

diff --git a/pandas/tseries/period.py b/pandas/tseries/period.py
@@ -739,15 +739,6 @@ def is_full(self):
         values = self.values
         return ((values[1:] - values[:-1]) < 2).all()
 
-    def factorize(self):
-        """
-        Specialized factorize that boxes uniques
-        """
-        from pandas.core.algorithms import factorize
-        labels, uniques = factorize(self.values)
-        uniques = PeriodIndex(ordinal=uniques, freq=self.freq)
-        return labels, uniques
-
     @property
     def freqstr(self):
         return self.freq

diff --git a/pandas/tseries/tests/test_period.py b/pandas/tseries/tests/test_period.py
@@ -2175,6 +2175,35 @@ def test_slice_keep_name(self):
         idx = period_range('20010101', periods=10, freq='D', name='bob')
         self.assertEqual(idx.name, idx[1:].name)
 
+    def test_factorize(self):
+        idx1 = PeriodIndex(['2014-01', '2014-01', '2014-02', '2014-02',
+                       '2014-03', '2014-03'], freq='M')
+
+        exp_arr = np.array([0, 0, 1, 1, 2, 2])
+        exp_idx = PeriodIndex(['2014-01', '2014-02', '2014-03'], freq='M')
+
+        arr, idx = idx1.factorize()
+        self.assert_numpy_array_equal(arr, exp_arr)
+        self.assert_(idx.equals(exp_idx))
+
+        arr, idx = idx1.factorize(sort=True)
+        self.assert_numpy_array_equal(arr, exp_arr)
+        self.assert_(idx.equals(exp_idx))
+
+        idx2 = pd.PeriodIndex(['2014-03', '2014-03', '2014-02', '2014-01',
+                               '2014-03', '2014-01'], freq='M')
+
+        exp_arr = np.array([2, 2, 1, 0, 2, 0])        
+        arr, idx = idx2.factorize(sort=True)
+        self.assert_numpy_array_equal(arr, exp_arr)
+        self.assert_(idx.equals(exp_idx))
+
+        exp_arr = np.array([0, 0, 1, 2, 0, 2])
+        exp_idx = PeriodIndex(['2014-03', '2014-02', '2014-01'], freq='M')
+        arr, idx = idx2.factorize()
+        self.assert_numpy_array_equal(arr, exp_arr)
+        self.assert_(idx.equals(exp_idx))
+
 
 def _permute(obj):
     return obj.take(np.random.permutation(len(obj)))

diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py
@@ -2189,6 +2189,51 @@ def test_join_with_period_index(self):
                                        'PeriodIndex-ed objects'):
                 df.columns.join(s.index, how=join)
 
+    def test_factorize(self):
+        idx1 = DatetimeIndex(['2014-01', '2014-01', '2014-02',
+                              '2014-02', '2014-03', '2014-03'])
+
+        exp_arr = np.array([0, 0, 1, 1, 2, 2])
+        exp_idx = DatetimeIndex(['2014-01', '2014-02', '2014-03'])
+
+        arr, idx = idx1.factorize()
+        self.assert_numpy_array_equal(arr, exp_arr)
+        self.assert_(idx.equals(exp_idx))
+
+        arr, idx = idx1.factorize(sort=True)
+        self.assert_numpy_array_equal(arr, exp_arr)
+        self.assert_(idx.equals(exp_idx))
+
+        # tz must be preserved
+        idx1 = idx1.tz_localize('Asia/Tokyo')
+        exp_idx = exp_idx.tz_localize('Asia/Tokyo')
+
+        arr, idx = idx1.factorize()
+        self.assert_numpy_array_equal(arr, exp_arr)
+        self.assert_(idx.equals(exp_idx))
+
+        idx2 = pd.DatetimeIndex(['2014-03', '2014-03', '2014-02', '2014-01',
+                               '2014-03', '2014-01'])
+
+        exp_arr = np.array([2, 2, 1, 0, 2, 0])        
+        exp_idx = DatetimeIndex(['2014-01', '2014-02', '2014-03'])
+        arr, idx = idx2.factorize(sort=True)
+        self.assert_numpy_array_equal(arr, exp_arr)
+        self.assert_(idx.equals(exp_idx))
+
+        exp_arr = np.array([0, 0, 1, 2, 0, 2])
+        exp_idx = DatetimeIndex(['2014-03', '2014-02', '2014-01'])
+        arr, idx = idx2.factorize()
+        self.assert_numpy_array_equal(arr, exp_arr)
+        self.assert_(idx.equals(exp_idx))
+
+        # freq must be preserved
+        idx3 = date_range('2000-01', periods=4, freq='M', tz='Asia/Tokyo')
+        exp_arr = np.array([0, 1, 2, 3])
+        arr, idx = idx3.factorize()
+        self.assert_numpy_array_equal(arr, exp_arr)
+        self.assert_(idx.equals(idx3))
+
 
 class TestDatetime64(tm.TestCase):
     """