From 2c5e3d325ca104289cce038c4e5dee806ac9eddc Mon Sep 17 00:00:00 2001
From: Andy Hayden <andyhayden1@gmail.com>
Date: Mon, 27 Jan 2014 10:00:44 -0800
Subject: [PATCH 1/2] ENH get_dummies str method

FIX/DOC py3 and add docstrings

DOC add in get_dummies to release and docs
---
 doc/source/basics.rst        | 16 +++++++++++-
 doc/source/v0.13.1.txt       |  8 ++++++
 pandas/core/reshape.py       |  2 ++
 pandas/core/strings.py       | 48 ++++++++++++++++++++++++++++++++++--
 pandas/tests/test_strings.py | 16 ++++++++++--
 vb_suite/strings.py          |  5 ++++
 6 files changed, 90 insertions(+), 5 deletions(-)

diff --git a/doc/source/basics.rst b/doc/source/basics.rst
index 638c8451bf8db..f1e0013dbc920 100644
--- a/doc/source/basics.rst
+++ b/doc/source/basics.rst
@@ -1155,7 +1155,6 @@ can also be used.
 Testing for Strings that Match or Contain a Pattern
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-
 You can check whether elements contain a pattern:
 
 .. ipython:: python
@@ -1221,6 +1220,21 @@ Methods like ``match``, ``contains``, ``startswith``, and ``endswith`` take
     ``lower``,Equivalent to ``str.lower``
     ``upper``,Equivalent to ``str.upper``
 
+
+Getting indicator variables from separated strings
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+You can extract dummy variables from string columns.
+For example, if they are separated by a ``'|'``:
+
+  .. ipython:: python
+
+      s = pd.Series(['a', 'a|b', np.nan, 'a|c'])
+      s.str.get_dummies(sep='|')
+
+See also ``pd.get_dummies``.
+
+
 .. _basics.sorting:
 
 Sorting by index and value
diff --git a/doc/source/v0.13.1.txt b/doc/source/v0.13.1.txt
index e3e06357cea72..0d3bf839f5776 100644
--- a/doc/source/v0.13.1.txt
+++ b/doc/source/v0.13.1.txt
@@ -43,6 +43,14 @@ API changes
 - Add ``-NaN`` and ``-nan`` to the default set of NA values (:issue:`5952`).
   See :ref:`NA Values <io.na_values>`.
 
+- Added ``Series.str.get_dummies`` vectorized string method (:issue:`6021`), to extract
+  dummy/indicator variables for separated string columns:
+
+  .. ipython:: python
+
+      s = Series(['a', 'a|b', np.nan, 'a|c'])
+      s.str.get_dummies(sep='|')
+
 - Added the ``NDFrame.equals()`` method to compare if two NDFrames are
   equal have equal axes, dtypes, and values. Added the
   ``array_equivalent`` function to compare if two ndarrays are
diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py
index 1244d0140a01b..f5ca96e2d827e 100644
--- a/pandas/core/reshape.py
+++ b/pandas/core/reshape.py
@@ -941,6 +941,8 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False):
     1  0  1    0
     2  0  0    1
 
+    See also ``Series.str.get_dummies``.
+
     """
     # Series avoids inconsistent NaN handling
     cat = Categorical.from_array(Series(data))
diff --git a/pandas/core/strings.py b/pandas/core/strings.py
index 588a81e3cf80d..fc0cc7fda9fa8 100644
--- a/pandas/core/strings.py
+++ b/pandas/core/strings.py
@@ -187,7 +187,6 @@ def str_contains(arr, pat, case=True, flags=0, na=np.nan, regex=True):
     else:
         f = lambda x: pat in x
     return _na_map(f, arr, na)
-        
 
 
 def str_startswith(arr, pat, na=np.nan):
@@ -460,6 +459,46 @@ def f(x):
     return result
 
 
+def str_get_dummies(arr, sep='|'):
+    """
+    Split each string by sep and return a frame of dummy/indicator variables.
+
+    Examples
+    --------
+    >>> Series(['a|b', 'a', 'a|c']).str.get_dummies()
+       a  b  c
+    0  1  1  0
+    1  1  0  0
+    2  1  0  1
+
+    >>> pd.Series(['a|b', np.nan, 'a|c']).str.get_dummies()
+        a   b   c
+    0   1   1   0
+    1 NaN NaN NaN
+    2   1   0   1
+
+    See also ``pd.get_dummies``.
+
+    """
+    def na_setunion(x, y):
+        try:
+            return x.union(y)
+        except TypeError:
+            return x
+
+    # TODO remove this hack?
+    arr = sep + arr.fillna('').astype(str) + sep
+
+    from functools import reduce
+    tags = sorted(reduce(na_setunion, arr.str.split(sep), set())
+                  - set(['']))
+    dummies = np.empty((len(arr), len(tags)), dtype=int)
+
+    for i, t in enumerate(tags):
+        pat = sep + t + sep
+        dummies[:, i] = _na_map(lambda x: pat in x, arr)
+    return DataFrame(dummies, arr.index, tags)
+
 
 def str_join(arr, sep):
     """
@@ -843,7 +882,7 @@ def contains(self, pat, case=True, flags=0, na=np.nan, regex=True):
         result = str_contains(self.series, pat, case=case, flags=flags,
                               na=na, regex=regex)
         return self._wrap_result(result)
-            
+
     @copy(str_replace)
     def replace(self, pat, repl, n=-1, case=True, flags=0):
         result = str_replace(self.series, pat, repl, n=n, case=case,
@@ -899,6 +938,11 @@ def rstrip(self, to_strip=None):
         result = str_rstrip(self.series, to_strip)
         return self._wrap_result(result)
 
+    @copy(str_get_dummies)
+    def get_dummies(self, sep='|'):
+        result = str_get_dummies(self.series, sep)
+        return self._wrap_result(result)
+
     count = _pat_wrapper(str_count, flags=True)
     startswith = _pat_wrapper(str_startswith, na=True)
     endswith = _pat_wrapper(str_endswith, na=True)
diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py
index 797e415ab9c31..6c9832ebc5c2b 100644
--- a/pandas/tests/test_strings.py
+++ b/pandas/tests/test_strings.py
@@ -366,7 +366,6 @@ def test_replace(self):
         result = values.str.replace("(?<=\w),(?=\w)", ", ", flags=re.UNICODE)
         tm.assert_series_equal(result, exp)
 
-
     def test_repeat(self):
         values = Series(['a', 'b', NA, 'c', NA, 'd'])
 
@@ -465,7 +464,7 @@ def test_extract(self):
         # Contains tests like those in test_match and some others.
 
         values = Series(['fooBAD__barBAD', NA, 'foo'])
-        er = [NA, NA] # empty row
+        er = [NA, NA]  # empty row
 
         result = values.str.extract('.*(BAD[_]+).*(BAD)')
         exp = DataFrame([['BAD__', 'BAD'], er, er])
@@ -549,6 +548,19 @@ def test_extract(self):
         exp = DataFrame([['A', '1'], ['B', '2'], ['C', NA]], columns=['letter', 'number'])
         tm.assert_frame_equal(result, exp)
 
+    def test_get_dummies(self):
+        s = Series(['a|b', 'a|c', np.nan])
+        result = s.str.get_dummies('|')
+        expected = DataFrame([[1, 1, 0], [1, 0, 1], [0, 0, 0]],
+                             columns=list('abc'))
+        tm.assert_frame_equal(result, expected)
+
+        s = Series(['a;b', 'a', 7])
+        result = s.str.get_dummies(';')
+        expected = DataFrame([[0, 1, 1], [0, 1, 0], [1, 0, 0]],
+                             columns=list('7ab'))
+        tm.assert_frame_equal(result, expected)
+
     def test_join(self):
         values = Series(['a_b_c', 'c_d_e', np.nan, 'f_g_h'])
         result = values.str.split('_').str.join('_')
diff --git a/vb_suite/strings.py b/vb_suite/strings.py
index 287fd6d5bf2e2..459684ec0e435 100644
--- a/vb_suite/strings.py
+++ b/vb_suite/strings.py
@@ -45,6 +45,11 @@ def make_series(letters, strlen, size):
 strings_rstrip = Benchmark("many.str.rstrip('matchthis')", setup)
 strings_get = Benchmark("many.str.get(0)", setup)
 
+setup = setup + """
+s = make_series(string.uppercase, strlen=10, size=10000).str.join('|')
+"""
+strings_get_dummies = Benchmark("s.str.get_dummies('|')", setup)
+
 setup = common_setup + """
 import pandas.util.testing as testing
 ser = pd.Series(testing.makeUnicodeIndex())

From d8f94e9c756e3c8f9b470945a18403fe9dd53217 Mon Sep 17 00:00:00 2001
From: Andy Hayden <andyhayden1@gmail.com>
Date: Mon, 27 Jan 2014 16:25:28 -0800
Subject: [PATCH 2/2] PERF speed up str.get_dummies

---
 pandas/core/strings.py | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/pandas/core/strings.py b/pandas/core/strings.py
index fc0cc7fda9fa8..a41c06a6ad0b6 100644
--- a/pandas/core/strings.py
+++ b/pandas/core/strings.py
@@ -472,31 +472,31 @@ def str_get_dummies(arr, sep='|'):
     2  1  0  1
 
     >>> pd.Series(['a|b', np.nan, 'a|c']).str.get_dummies()
-        a   b   c
-    0   1   1   0
-    1 NaN NaN NaN
-    2   1   0   1
+       a  b  c
+    0  1  1  0
+    1  0  0  0
+    2  1  0  1
 
     See also ``pd.get_dummies``.
 
     """
-    def na_setunion(x, y):
-        try:
-            return x.union(y)
-        except TypeError:
-            return x
-
     # TODO remove this hack?
-    arr = sep + arr.fillna('').astype(str) + sep
+    arr = arr.fillna('')
+    try:
+        arr = sep + arr + sep
+    except TypeError:
+        arr = sep + arr.astype(str) + sep
+
+    tags = set()
+    for ts in arr.str.split(sep):
+        tags.update(ts)
+    tags = sorted(tags - set([""]))
 
-    from functools import reduce
-    tags = sorted(reduce(na_setunion, arr.str.split(sep), set())
-                  - set(['']))
     dummies = np.empty((len(arr), len(tags)), dtype=int)
 
     for i, t in enumerate(tags):
         pat = sep + t + sep
-        dummies[:, i] = _na_map(lambda x: pat in x, arr)
+        dummies[:, i] = lib.map_infer(arr.values, lambda x: pat in x)
     return DataFrame(dummies, arr.index, tags)