Skip to content

ENH/CLN: Add factorize to IndexOpsMixin #7090

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
May 10, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -335,6 +335,7 @@ Computations / Descriptive Stats
Series.cumsum
Series.describe
Series.diff
Series.factorize
Series.kurt
Series.mad
Series.max
Expand Down Expand Up @@ -1040,6 +1041,7 @@ Modifying and Computations
Index.diff
Index.drop
Index.equals
Index.factorize
Index.identical
Index.insert
Index.order
Expand Down
2 changes: 2 additions & 0 deletions doc/source/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,7 @@ API Changes
ignored (:issue:`6607`)
- Produce :class:`~pandas.io.parsers.ParserWarning` on fallback to python
parser when no options are ignored (:issue:`6607`)
- Added ``factorize`` functions to ``Index`` and ``Series`` to get indexer and unique values (:issue:`7090`)

Deprecations
~~~~~~~~~~~~
Expand Down Expand Up @@ -485,6 +486,7 @@ Bug Fixes
- Bug in cache coherence with chained indexing and slicing; add ``_is_view`` property to ``NDFrame`` to correctly predict
views; mark ``is_copy`` on ``xs`` only if it's an actual copy (and not a view) (:issue:`7084`)
- Bug in DatetimeIndex creation from string ndarray with ``dayfirst=True`` (:issue:`5917`)
- Bug in ``MultiIndex.from_arrays`` where levels created from a ``DatetimeIndex`` did not preserve ``freq`` and ``tz`` (:issue:`7090`)

pandas 0.13.1
-------------
Expand Down
1 change: 1 addition & 0 deletions doc/source/v0.14.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,7 @@ API changes
- add ``inplace`` keyword to ``Series.order/sort`` to make them inverses (:issue:`6859`)
- accept ``TextFileReader`` in ``concat``, which was affecting a common user idiom (:issue:`6583`), this was a regression
from 0.13.1
- Added ``factorize`` functions to ``Index`` and ``Series`` to get indexer and unique values (:issue:`7090`)

.. _whatsnew_0140.sql:

Expand Down
22 changes: 22 additions & 0 deletions pandas/core/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -319,6 +319,28 @@ def nunique(self):
"""
return len(self.value_counts())

def factorize(self, sort=False, na_sentinel=-1):
    """
    Encode the object as an enumerated type or categorical variable

    Parameters
    ----------
    sort : boolean, default False
        Sort by values
    na_sentinel : int, default -1
        Value to mark "not found"

    Returns
    -------
    labels : the indexer to the original array
    uniques : the unique values, wrapped in an Index
    """
    # function-scope imports (presumably to avoid a circular import
    # at module load time -- confirm)
    from pandas.core.algorithms import factorize
    from pandas.core.index import Index

    codes, distinct = factorize(self, sort=sort, na_sentinel=na_sentinel)
    # box the raw uniques so callers always receive an Index
    return codes, Index(distinct)

date = _field_accessor('date','Returns numpy array of datetime.date. The date part of the Timestamps')
time = _field_accessor('time','Returns numpy array of datetime.time. The time part of the Timestamps')
year = _field_accessor('year', "The year of the datetime")
Expand Down
18 changes: 6 additions & 12 deletions pandas/core/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,8 +80,11 @@ def __init__(self, labels, levels=None, name=None):
if levels is None:
if name is None:
name = getattr(labels, 'name', None)
if isinstance(labels, Index) and hasattr(labels, 'factorize'):
labels, levels = labels.factorize()
if hasattr(labels, 'factorize'):
try:
labels, levels = labels.factorize(sort=True)
except TypeError:
labels, levels = labels.factorize(sort=False)
else:
try:
labels, levels = factorize(labels, sort=True)
Expand All @@ -103,16 +106,7 @@ def from_array(cls, data):
Can be an Index or array-like. The levels are assumed to be
the unique values of `data`.
"""
if isinstance(data, Index) and hasattr(data, 'factorize'):
labels, levels = data.factorize()
else:
try:
labels, levels = factorize(data, sort=True)
except TypeError:
labels, levels = factorize(data, sort=False)

return Categorical(labels, levels,
name=getattr(data, 'name', None))
return Categorical(data)

_levels = None

Expand Down
42 changes: 42 additions & 0 deletions pandas/tests/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -398,6 +398,48 @@ def test_value_counts_inferred(self):
self.assert_numpy_array_equal(td.unique(), expected)
self.assertEquals(td.nunique(), 1)

def test_factorize(self):
# Exercises factorize() on every fixture in self.objs, first on the
# objects as-is and then on a re-ordered copy containing duplicates.
for o in self.objs:
# labels are expected to be 0..len(o)-1, i.e. one label per position
exp_arr = np.array(range(len(o)))
labels, uniques = o.factorize()

self.assert_numpy_array_equal(labels, exp_arr)
if isinstance(o, Series):
# for a Series the uniques are compared against its values boxed in an Index
expected = Index(o.values)
self.assert_numpy_array_equal(uniques, expected)
else:
self.assertTrue(uniques.equals(o))

for o in self.objs:
# sort by value, and create duplicates
if isinstance(o, Series):
# NOTE: sorts the fixture Series in place
o.sort()
else:
indexer = o.argsort()
o = o.take(indexer)
# n is the tail of o (from position 5) followed by all of o
n = o[5:].append(o)

# the hard-coded 15 labels below assume each fixture has 10 elements
# -- TODO confirm against the fixture setup
exp_arr = np.array([5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
labels, uniques = n.factorize(sort=True)

self.assert_numpy_array_equal(labels, exp_arr)
if isinstance(o, Series):
expected = Index(o.values)
self.assert_numpy_array_equal(uniques, expected)
else:
self.assertTrue(uniques.equals(o))

# without sorting, labels follow order of first appearance in n
exp_arr = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4])
labels, uniques = n.factorize(sort=False)
self.assert_numpy_array_equal(labels, exp_arr)

if isinstance(o, Series):
# expected uniques: o's tail (positions 5..9) first, then its head
expected = Index(np.concatenate([o.values[5:10], o.values[:5]]))
self.assert_numpy_array_equal(uniques, expected)
else:
expected = o[5:].append(o[:5])
self.assertTrue(uniques.equals(expected))


class TestDatetimeIndexOps(Ops):
_allowed = '_allow_datetime_index_ops'
Expand Down
32 changes: 32 additions & 0 deletions pandas/tests/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from pandas.core.categorical import Categorical
from pandas.core.index import Index, Int64Index, MultiIndex
from pandas.core.frame import DataFrame
from pandas.tseries.period import PeriodIndex
from pandas.util.testing import assert_almost_equal
import pandas.core.common as com

Expand Down Expand Up @@ -180,6 +181,37 @@ def test_empty_print(self):
"Index([], dtype=object)")
self.assertEqual(repr(factor), expected)

def test_periodindex(self):
    # Categorical.from_array on a PeriodIndex: labels must index into
    # levels, and levels must be a PeriodIndex in ascending order.

    # input already in ascending order, with duplicates
    idx1 = PeriodIndex(['2014-01', '2014-01', '2014-02', '2014-02',
                        '2014-03', '2014-03'], freq='M')
    cat1 = Categorical.from_array(idx1)

    exp_arr = np.array([0, 0, 1, 1, 2, 2])
    exp_idx = PeriodIndex(['2014-01', '2014-02', '2014-03'], freq='M')

    self.assert_numpy_array_equal(cat1.labels, exp_arr)
    self.assertTrue(cat1.levels.equals(exp_idx))

    # unordered input with duplicates: same levels, labels re-mapped
    idx2 = PeriodIndex(['2014-03', '2014-03', '2014-02', '2014-01',
                        '2014-03', '2014-01'], freq='M')
    cat2 = Categorical.from_array(idx2)

    exp_arr = np.array([2, 2, 1, 0, 2, 0])

    self.assert_numpy_array_equal(cat2.labels, exp_arr)
    self.assertTrue(cat2.levels.equals(exp_idx))

    # descending input without duplicates: levels come back ascending
    idx3 = PeriodIndex(['2013-12', '2013-11', '2013-10', '2013-09',
                        '2013-08', '2013-07', '2013-05'], freq='M')
    cat3 = Categorical.from_array(idx3)

    exp_arr = np.array([6, 5, 4, 3, 2, 1, 0])
    exp_idx = PeriodIndex(['2013-05', '2013-07', '2013-08', '2013-09',
                           '2013-10', '2013-11', '2013-12'], freq='M')

    self.assert_numpy_array_equal(cat3.labels, exp_arr)
    self.assertTrue(cat3.levels.equals(exp_idx))


if __name__ == '__main__':
import nose
Expand Down
11 changes: 11 additions & 0 deletions pandas/tests/test_multilevel.py
Original file line number Diff line number Diff line change
Expand Up @@ -1899,6 +1899,17 @@ def test_multiindex_set_index(self):
# it works!
df.set_index(index)

def test_datetimeindex(self):
    # MultiIndex.from_arrays built from tz-aware DatetimeIndexes: each
    # level must equal the corresponding tz-aware unique values.
    stamps = ['2013-04-01 9:00', '2013-04-02 9:00', '2013-04-03 9:00']
    idx1 = pd.DatetimeIndex(stamps * 2, tz='Asia/Tokyo')
    idx2 = pd.date_range('2010/01/01', periods=6, freq='M',
                         tz='US/Eastern')
    idx = MultiIndex.from_arrays([idx1, idx2])

    # first level: the three distinct timestamps, still in Asia/Tokyo
    expected1 = pd.DatetimeIndex(stamps, tz='Asia/Tokyo')

    self.assertTrue(idx.levels[0].equals(expected1))
    # second level: idx2 has no duplicates, so the level equals idx2
    self.assertTrue(idx.levels[1].equals(idx2))


if __name__ == '__main__':

import nose
Expand Down
13 changes: 13 additions & 0 deletions pandas/tseries/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -806,6 +806,19 @@ def to_period(self, freq=None):

return PeriodIndex(self.values, freq=freq, tz=self.tz)

def factorize(self, sort=False, na_sentinel=-1):
    """
    Index.factorize with handling for DatetimeIndex metadata

    Parameters
    ----------
    sort : boolean, default False
        Passed through to ``pandas.core.algorithms.factorize``
    na_sentinel : int, default -1
        Passed through to ``pandas.core.algorithms.factorize``

    Returns
    -------
    labels : the labels obtained by factorizing ``self.asi8``
    uniques : DatetimeIndex built from the unique values, carrying
        this index's ``name``, ``freq`` and ``tz``
    """
    from pandas.core.algorithms import factorize

    # factorize the asi8 values, then re-box so the metadata can be
    # re-attached to the uniques
    codes, distinct_i8 = factorize(self.asi8, sort=sort,
                                   na_sentinel=na_sentinel)
    # NOTE(review): freq is passed through unchanged even when sort=True;
    # confirm it is still valid for the uniques if sorting reorders them.
    metadata = dict(name=self.name, freq=self.freq, tz=self.tz)
    return codes, DatetimeIndex._simple_new(distinct_i8, **metadata)

def order(self, return_indexer=False, ascending=True):
"""
Return sorted copy of Index
Expand Down
9 changes: 0 additions & 9 deletions pandas/tseries/period.py
Original file line number Diff line number Diff line change
Expand Up @@ -739,15 +739,6 @@ def is_full(self):
values = self.values
return ((values[1:] - values[:-1]) < 2).all()

def factorize(self):
    """
    Specialized factorize that boxes uniques

    Returns
    -------
    labels : the labels obtained by factorizing ``self.values``
    uniques : PeriodIndex built from the unique values (passed as
        ``ordinal``) with this index's ``freq``
    """
    from pandas.core.algorithms import factorize

    codes, ordinals = factorize(self.values)
    # re-box the raw uniques so the frequency is kept
    return codes, PeriodIndex(ordinal=ordinals, freq=self.freq)

@property
def freqstr(self):
return self.freq
Expand Down
29 changes: 29 additions & 0 deletions pandas/tseries/tests/test_period.py
Original file line number Diff line number Diff line change
Expand Up @@ -2175,6 +2175,35 @@ def test_slice_keep_name(self):
idx = period_range('20010101', periods=10, freq='D', name='bob')
self.assertEqual(idx.name, idx[1:].name)

def test_factorize(self):
    # PeriodIndex.factorize must return integer labels plus a PeriodIndex
    # of uniques, honouring the ``sort`` flag.
    idx1 = PeriodIndex(['2014-01', '2014-01', '2014-02', '2014-02',
                        '2014-03', '2014-03'], freq='M')

    exp_arr = np.array([0, 0, 1, 1, 2, 2])
    exp_idx = PeriodIndex(['2014-01', '2014-02', '2014-03'], freq='M')

    # input is already ascending, so sort=False and sort=True agree
    arr, idx = idx1.factorize()
    self.assert_numpy_array_equal(arr, exp_arr)
    self.assertTrue(idx.equals(exp_idx))

    arr, idx = idx1.factorize(sort=True)
    self.assert_numpy_array_equal(arr, exp_arr)
    self.assertTrue(idx.equals(exp_idx))

    idx2 = PeriodIndex(['2014-03', '2014-03', '2014-02', '2014-01',
                        '2014-03', '2014-01'], freq='M')

    # unordered input, sort=True: uniques ascending, labels re-mapped
    exp_arr = np.array([2, 2, 1, 0, 2, 0])
    arr, idx = idx2.factorize(sort=True)
    self.assert_numpy_array_equal(arr, exp_arr)
    self.assertTrue(idx.equals(exp_idx))

    # unordered input, default sort=False: uniques in order of appearance
    exp_arr = np.array([0, 0, 1, 2, 0, 2])
    exp_idx = PeriodIndex(['2014-03', '2014-02', '2014-01'], freq='M')
    arr, idx = idx2.factorize()
    self.assert_numpy_array_equal(arr, exp_arr)
    self.assertTrue(idx.equals(exp_idx))


def _permute(obj):
return obj.take(np.random.permutation(len(obj)))
Expand Down
45 changes: 45 additions & 0 deletions pandas/tseries/tests/test_timeseries.py
Original file line number Diff line number Diff line change
Expand Up @@ -2189,6 +2189,51 @@ def test_join_with_period_index(self):
'PeriodIndex-ed objects'):
df.columns.join(s.index, how=join)

def test_factorize(self):
    # DatetimeIndex.factorize must return integer labels plus a
    # DatetimeIndex of uniques, honouring ``sort`` and keeping tz/freq.
    idx1 = DatetimeIndex(['2014-01', '2014-01', '2014-02',
                          '2014-02', '2014-03', '2014-03'])

    exp_arr = np.array([0, 0, 1, 1, 2, 2])
    exp_idx = DatetimeIndex(['2014-01', '2014-02', '2014-03'])

    # input is already ascending, so sort=False and sort=True agree
    arr, idx = idx1.factorize()
    self.assert_numpy_array_equal(arr, exp_arr)
    self.assertTrue(idx.equals(exp_idx))

    arr, idx = idx1.factorize(sort=True)
    self.assert_numpy_array_equal(arr, exp_arr)
    self.assertTrue(idx.equals(exp_idx))

    # tz must be preserved
    idx1 = idx1.tz_localize('Asia/Tokyo')
    exp_idx = exp_idx.tz_localize('Asia/Tokyo')

    arr, idx = idx1.factorize()
    self.assert_numpy_array_equal(arr, exp_arr)
    self.assertTrue(idx.equals(exp_idx))

    idx2 = DatetimeIndex(['2014-03', '2014-03', '2014-02', '2014-01',
                          '2014-03', '2014-01'])

    # unordered input, sort=True: uniques ascending, labels re-mapped
    exp_arr = np.array([2, 2, 1, 0, 2, 0])
    exp_idx = DatetimeIndex(['2014-01', '2014-02', '2014-03'])
    arr, idx = idx2.factorize(sort=True)
    self.assert_numpy_array_equal(arr, exp_arr)
    self.assertTrue(idx.equals(exp_idx))

    # unordered input, default sort=False: uniques in order of appearance
    exp_arr = np.array([0, 0, 1, 2, 0, 2])
    exp_idx = DatetimeIndex(['2014-03', '2014-02', '2014-01'])
    arr, idx = idx2.factorize()
    self.assert_numpy_array_equal(arr, exp_arr)
    self.assertTrue(idx.equals(exp_idx))

    # freq must be preserved
    idx3 = date_range('2000-01', periods=4, freq='M', tz='Asia/Tokyo')
    exp_arr = np.array([0, 1, 2, 3])
    arr, idx = idx3.factorize()
    self.assert_numpy_array_equal(arr, exp_arr)
    self.assertTrue(idx.equals(idx3))


class TestDatetime64(tm.TestCase):
"""
Expand Down