Skip to content

Commit

Permalink
ENH: add docs and add match function to API, close pandas-dev#502
Browse files Browse the repository at this point in the history
  • Loading branch information
wesm committed May 12, 2012
1 parent 7353202 commit 59f0ee7
Show file tree
Hide file tree
Showing 5 changed files with 63 additions and 7 deletions.
1 change: 1 addition & 0 deletions RELEASE.rst
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ pandas 0.8.0
- Add support for indexes (dates or otherwise) with duplicates and common
sense indexing/selection functionality
- Series/DataFrame.update methods, in-place variant of combine_first (#961)
- Add ``match`` function to API (#502)

**Improvements to existing features**

Expand Down
30 changes: 23 additions & 7 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,29 +8,45 @@
import pandas.core.common as com
import pandas._tseries as lib

def match(to_match, values, na_sentinel=-1):
    """
    Compute locations of to_match into values

    Parameters
    ----------
    to_match : array-like
        values to find positions of
    values : array-like
        Unique set of values
    na_sentinel : int, default -1
        Value to mark "not found"

    Returns
    -------
    match : ndarray of integers
        For each element of ``to_match``, its position in ``values``, or
        ``na_sentinel`` when the element does not occur in ``values``

    Examples
    --------
    >>> match([0, 1, 2, 2, 0, 1, 3, 0], [0, 2, 1])
    array([ 0,  2,  1,  1,  0,  2, -1,  0])
    """
    values = np.asarray(values)
    # Fixed-width string arrays are re-boxed as object arrays before hashing.
    if issubclass(values.dtype.type, basestring):
        values = np.array(values, dtype='O')

    f = lambda htype, caster: _match_generic(to_match, values, htype, caster)
    result = _hashtable_algo(f, values.dtype)

    # The lookup marks misses with -1; honour a caller-supplied sentinel
    # instead of silently ignoring the ``na_sentinel`` argument.
    if na_sentinel != -1:
        result = np.where(result == -1, na_sentinel, result)
    return result

def unique(values):
    """
    Return the distinct entries of ``values``; the result is not guaranteed
    to be sorted

    Parameters
    ----------
    values : array-like
        Must expose a ``dtype`` attribute, which is handed to
        ``_hashtable_algo`` together with the worker function

    Returns
    -------
    uniques
    """
    def _compute(htype, caster):
        # Closes over ``values`` so the helper only supplies the hashtable
        # type and caster it selects.
        return _unique_generic(values, htype, caster)

    return _hashtable_algo(_compute, values.dtype)
Expand Down Expand Up @@ -98,7 +114,7 @@ def factorize(values, sort=False, order=None, na_sentinel=-1):
labels, counts = table.get_labels(values, uniques, 0, na_sentinel)

labels = com._ensure_platform_int(labels)

uniques = com._asarray_tuplesafe(uniques)
if sort and len(counts) > 0:
sorter = uniques.argsort()
Expand Down
2 changes: 2 additions & 0 deletions pandas/core/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@

import numpy as np

from pandas.core.algorithms import factorize, match, unique

from pandas.core.common import isnull, notnull, save, load
from pandas.core.factor import Factor
from pandas.core.format import set_printoptions
Expand Down
25 changes: 25 additions & 0 deletions pandas/tests/test_algos.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,31 @@

import numpy as np


import pandas.core.algorithms as algos
import pandas.util.testing as tm


class TestMatch(unittest.TestCase):
    """Checks ``algos.match`` against hand-computed positions."""

    def test_ints(self):
        """Integer lookups; 3 is absent from ``values`` and maps to -1."""
        values = np.array([0, 2, 1])
        to_match = np.array([0, 1, 2, 2, 0, 1, 3, 0])

        result = algos.match(to_match, values)
        expected = np.array([0, 2, 1, 1, 0, 2, -1, 0])
        # assertTrue rather than the deprecated ``assert_`` alias.
        self.assertTrue(np.array_equal(result, expected))

    def test_strings(self):
        """String lookups from plain lists; 'qux' is absent and maps to -1."""
        values = ['foo', 'bar', 'baz']
        to_match = ['bar', 'foo', 'qux', 'foo', 'bar', 'baz', 'qux']

        result = algos.match(to_match, values)
        expected = np.array([1, 0, -1, 0, 1, 2, -1])
        self.assertTrue(np.array_equal(result, expected))

if __name__ == '__main__':
    # Allow running this test module directly through nose; exit=False keeps
    # the interpreter alive after the run.
    import nose
    nose_argv = [__file__, '-vvs', '-x', '--pdb', '--pdb-failure']
    nose.runmodule(argv=nose_argv, exit=False)

12 changes: 12 additions & 0 deletions vb_suite/miscellaneous.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,15 @@ def prop(self):
misc_cache_readonly = Benchmark("obj.prop", setup, name="misc_cache_readonly",
ncalls=2000000)

#----------------------------------------------------------------------
# match
#
# Benchmarks ``match`` on object-dtype strings: ``uniques`` holds 1000 values
# built with ``rands(10)`` and ``all`` repeats each of them 10 times, so the
# timed call looks up 10,000 entries against 1000 candidates.
# NOTE(review): ``all`` shadows the builtin inside the benchmark namespace,
# and ``match`` is not imported in this setup -- presumably ``common_setup``
# brings it into scope; confirm.

setup = common_setup + """
from pandas.util.testing import rands
uniques = np.array([rands(10) for _ in xrange(1000)], dtype='O')
all = uniques.repeat(10)
"""

match_strings = Benchmark("match(all, uniques)", setup,
start_date=datetime(2012, 5, 12))

0 comments on commit 59f0ee7

Please sign in to comment.