From 2c5e3d325ca104289cce038c4e5dee806ac9eddc Mon Sep 17 00:00:00 2001 From: Andy Hayden Date: Mon, 27 Jan 2014 10:00:44 -0800 Subject: [PATCH 1/2] ENH get_dummies str method FIX/DOC py3 and add docstrings DOC add in get_dummies to release and docs --- doc/source/basics.rst | 16 +++++++++++- doc/source/v0.13.1.txt | 8 ++++++ pandas/core/reshape.py | 2 ++ pandas/core/strings.py | 48 ++++++++++++++++++++++++++++++++++-- pandas/tests/test_strings.py | 16 ++++++++++-- vb_suite/strings.py | 5 ++++ 6 files changed, 90 insertions(+), 5 deletions(-) diff --git a/doc/source/basics.rst b/doc/source/basics.rst index 638c8451bf8db..f1e0013dbc920 100644 --- a/doc/source/basics.rst +++ b/doc/source/basics.rst @@ -1155,7 +1155,6 @@ can also be used. Testing for Strings that Match or Contain a Pattern ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - You can check whether elements contain a pattern: .. ipython:: python @@ -1221,6 +1220,21 @@ Methods like ``match``, ``contains``, ``startswith``, and ``endswith`` take ``lower``,Equivalent to ``str.lower`` ``upper``,Equivalent to ``str.upper`` + +Getting indicator variables from separated strings +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +You can extract dummy variables from string columns. +For example if they are separated by a ``'|'``: + + .. ipython:: python + + s = pd.Series(['a', 'a|b', np.nan, 'a|c']) + s.str.get_dummies(sep='|') + +See also ``pd.get_dummies``. + + .. _basics.sorting: Sorting by index and value diff --git a/doc/source/v0.13.1.txt b/doc/source/v0.13.1.txt index e3e06357cea72..0d3bf839f5776 100644 --- a/doc/source/v0.13.1.txt +++ b/doc/source/v0.13.1.txt @@ -43,6 +43,14 @@ API changes - Add ``-NaN`` and ``-nan`` to the default set of NA values (:issue:`5952`). See :ref:`NA Values `. +- Added ``Series.str.get_dummies`` vectorized string method (:issue:`6021`), to extract + dummy/indicator variables for separated string columns: + + .. 
ipython:: python + + s = Series(['a', 'a|b', np.nan, 'a|c']) + s.str.get_dummies(sep='|') + - Added the ``NDFrame.equals()`` method to compare if two NDFrames are equal have equal axes, dtypes, and values. Added the ``array_equivalent`` function to compare if two ndarrays are diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index 1244d0140a01b..f5ca96e2d827e 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -941,6 +941,8 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False): 1 0 1 0 2 0 0 1 + See also ``Series.str.get_dummies``. + """ # Series avoids inconsistent NaN handling cat = Categorical.from_array(Series(data)) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 588a81e3cf80d..fc0cc7fda9fa8 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -187,7 +187,6 @@ def str_contains(arr, pat, case=True, flags=0, na=np.nan, regex=True): else: f = lambda x: pat in x return _na_map(f, arr, na) - def str_startswith(arr, pat, na=np.nan): @@ -460,6 +459,46 @@ def f(x): return result +def str_get_dummies(arr, sep='|'): + """ + Split each string by sep and return a frame of dummy/indicator variables. + + Examples + -------- + >>> Series(['a|b', 'a', 'a|c']).str.get_dummies() + a b c + 0 1 1 0 + 1 1 0 0 + 2 1 0 1 + + >>> pd.Series(['a|b', np.nan, 'a|c']).str.get_dummies() + a b c + 0 1 1 0 + 1 NaN NaN NaN + 2 1 0 1 + + See also ``pd.get_dummies``. + + """ + def na_setunion(x, y): + try: + return x.union(y) + except TypeError: + return x + + # TODO remove this hack? 
+ arr = sep + arr.fillna('').astype(str) + sep + + from functools import reduce + tags = sorted(reduce(na_setunion, arr.str.split(sep), set()) + - set([''])) + dummies = np.empty((len(arr), len(tags)), dtype=int) + + for i, t in enumerate(tags): + pat = sep + t + sep + dummies[:, i] = _na_map(lambda x: pat in x, arr) + return DataFrame(dummies, arr.index, tags) + def str_join(arr, sep): """ @@ -843,7 +882,7 @@ def contains(self, pat, case=True, flags=0, na=np.nan, regex=True): result = str_contains(self.series, pat, case=case, flags=flags, na=na, regex=regex) return self._wrap_result(result) - + @copy(str_replace) def replace(self, pat, repl, n=-1, case=True, flags=0): result = str_replace(self.series, pat, repl, n=n, case=case, @@ -899,6 +938,11 @@ def rstrip(self, to_strip=None): result = str_rstrip(self.series, to_strip) return self._wrap_result(result) + @copy(str_get_dummies) + def get_dummies(self, sep='|'): + result = str_get_dummies(self.series, sep) + return self._wrap_result(result) + count = _pat_wrapper(str_count, flags=True) startswith = _pat_wrapper(str_startswith, na=True) endswith = _pat_wrapper(str_endswith, na=True) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 797e415ab9c31..6c9832ebc5c2b 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -366,7 +366,6 @@ def test_replace(self): result = values.str.replace("(?<=\w),(?=\w)", ", ", flags=re.UNICODE) tm.assert_series_equal(result, exp) - def test_repeat(self): values = Series(['a', 'b', NA, 'c', NA, 'd']) @@ -465,7 +464,7 @@ def test_extract(self): # Contains tests like those in test_match and some others. 
values = Series(['fooBAD__barBAD', NA, 'foo']) - er = [NA, NA] # empty row + er = [NA, NA] # empty row result = values.str.extract('.*(BAD[_]+).*(BAD)') exp = DataFrame([['BAD__', 'BAD'], er, er]) @@ -549,6 +548,19 @@ def test_extract(self): exp = DataFrame([['A', '1'], ['B', '2'], ['C', NA]], columns=['letter', 'number']) tm.assert_frame_equal(result, exp) + def test_get_dummies(self): + s = Series(['a|b', 'a|c', np.nan]) + result = s.str.get_dummies('|') + expected = DataFrame([[1, 1, 0], [1, 0, 1], [0, 0, 0]], + columns=list('abc')) + tm.assert_frame_equal(result, expected) + + s = Series(['a;b', 'a', 7]) + result = s.str.get_dummies(';') + expected = DataFrame([[0, 1, 1], [0, 1, 0], [1, 0, 0]], + columns=list('7ab')) + tm.assert_frame_equal(result, expected) + def test_join(self): values = Series(['a_b_c', 'c_d_e', np.nan, 'f_g_h']) result = values.str.split('_').str.join('_') diff --git a/vb_suite/strings.py b/vb_suite/strings.py index 287fd6d5bf2e2..459684ec0e435 100644 --- a/vb_suite/strings.py +++ b/vb_suite/strings.py @@ -45,6 +45,11 @@ def make_series(letters, strlen, size): strings_rstrip = Benchmark("many.str.rstrip('matchthis')", setup) strings_get = Benchmark("many.str.get(0)", setup) +setup = setup + """ +make_series(string.uppercase, strlen=10, size=10000).str.join('|') +""" +strings_get_dummies = Benchmark("s.str.get_dummies('|')", setup) + setup = common_setup + """ import pandas.util.testing as testing ser = pd.Series(testing.makeUnicodeIndex()) From d8f94e9c756e3c8f9b470945a18403fe9dd53217 Mon Sep 17 00:00:00 2001 From: Andy Hayden Date: Mon, 27 Jan 2014 16:25:28 -0800 Subject: [PATCH 2/2] PERF speed up str.get_dummies --- pandas/core/strings.py | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index fc0cc7fda9fa8..a41c06a6ad0b6 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -472,31 +472,31 @@ def str_get_dummies(arr, sep='|'): 2 
1 0 1 >>> pd.Series(['a|b', np.nan, 'a|c']).str.get_dummies() - a b c - 0 1 1 0 - 1 NaN NaN NaN - 2 1 0 1 + a b c + 0 1 1 0 + 1 0 0 0 + 2 1 0 1 See also ``pd.get_dummies``. """ - def na_setunion(x, y): - try: - return x.union(y) - except TypeError: - return x - # TODO remove this hack? - arr = sep + arr.fillna('').astype(str) + sep + arr = arr.fillna('') + try: + arr = sep + arr + sep + except TypeError: + arr = sep + arr.astype(str) + sep + + tags = set() + for ts in arr.str.split(sep): + tags.update(ts) + tags = sorted(tags - set([""])) - from functools import reduce - tags = sorted(reduce(na_setunion, arr.str.split(sep), set()) - - set([''])) dummies = np.empty((len(arr), len(tags)), dtype=int) for i, t in enumerate(tags): pat = sep + t + sep - dummies[:, i] = _na_map(lambda x: pat in x, arr) + dummies[:, i] = lib.map_infer(arr.values, lambda x: pat in x) return DataFrame(dummies, arr.index, tags)