diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index 64e5eec43a5c1..4cff6358f8b59 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -2744,15 +2744,6 @@ def _str_map(
         result = NumpyExtensionArray(categories.to_numpy())._str_map(f, na_value, dtype)
         return take_nd(result, codes, fill_value=na_value)
 
-    def _str_get_dummies(self, sep: str = "|"):
-        # sep may not be in categories. Just bail on this.
-        from pandas.core.arrays import NumpyExtensionArray
-
-        return NumpyExtensionArray(self.astype(str))._str_get_dummies(sep)
-
-    # ------------------------------------------------------------------------
-    # GroupBy Methods
-
     def _groupby_op(
         self,
         *,
diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py
index 7494a43caf004..327f8fb6d6ecc 100644
--- a/pandas/core/strings/accessor.py
+++ b/pandas/core/strings/accessor.py
@@ -2356,8 +2356,22 @@ def wrap(
         )
         return self._wrap_result(result)
 
+    from collections.abc import Iterable
+    from typing import TYPE_CHECKING
+
+    if TYPE_CHECKING:
+        from pandas._typing import NpDtype
+
     @forbid_nonstring_types(["bytes"])
-    def get_dummies(self, sep: str = "|"):
+    def get_dummies(
+        self,
+        sep: str = "|",
+        prefix: str | Iterable[str] | dict[str, str] | None = None,
+        prefix_sep: str | None = "_",
+        dummy_na: bool = False,
+        sparse: bool = False,
+        dtype: NpDtype | None = np.int64,
+    ):
         """
         Return DataFrame of dummy/indicator variables for Series.
 
@@ -2368,6 +2382,21 @@ def get_dummies(self, sep: str = "|"):
         ----------
         sep : str, default "|"
             String to split on.
+        prefix : str, list of str, or dict of str, default None
+            String to append DataFrame column names.
+            Pass a list with length equal to the number of columns
+            when calling get_dummies on a DataFrame. Alternatively, `prefix`
+            can be a dictionary mapping column names to prefixes.
+        prefix_sep : str, default '_'
+            If appending prefix, separator/delimiter to use.
+            Or pass a list or dictionary as with `prefix`.
+        dummy_na : bool, default False
+            Add a column to indicate NaNs, if False NaNs are ignored.
+        sparse : bool, default False
+            Whether the dummy-encoded columns should be backed by
+            a :class:`SparseArray` (True) or a regular NumPy array (False).
+        dtype : dtype, default np.int64
+            Data type for new columns. Only a single dtype is allowed.
 
         Returns
         -------
@@ -2395,13 +2424,73 @@ def get_dummies(self, sep: str = "|"):
         """
         # we need to cast to Series of strings as only that has all
         # methods available for making the dummies...
-        result, name = self._data.array._str_get_dummies(sep)
-        return self._wrap_result(
-            result,
-            name=name,
-            expand=True,
-            returns_string=False,
-        )
+        from pandas import (
+            MultiIndex,
+            Series,
+        )
+        from pandas.core.reshape.encoding import get_dummies
+
+        input_series = (
+            Series(self._data) if isinstance(self._data, ABCIndex) else self._data
+        )
+        if isinstance(self._data.dtype, ArrowDtype):
+            import pyarrow as pa
+
+            dtype = ArrowDtype(pa.bool_())
+        string_series = input_series.apply(lambda x: str(x) if not isna(x) else x)
+        split_series = string_series.str.split(sep, expand=True).stack()
+        valid_split_series = split_series[
+            (split_series.astype(str) != "None")
+            & ~(
+                split_series.index.get_level_values(0).duplicated(keep="first")
+                & split_series.isna()
+            )
+        ]
+
+        dummy_df = get_dummies(
+            valid_split_series, None, None, dummy_na, None, sparse, False, dtype
+        )
+        grouped_dummies = dummy_df.groupby(level=0)
+        if dtype == bool:
+            result_df = grouped_dummies.any()
+        else:
+            result_df = grouped_dummies.sum()
+
+        if isinstance(prefix, str):
+            result_df.columns = [
+                f"{prefix}{prefix_sep}{col}" for col in result_df.columns
+            ]
+        elif isinstance(prefix, dict):
+            if len(prefix) != len(result_df.columns):
+                len_msg = (
+                    f"Length of 'prefix' ({len(prefix)}) did not match the "
+                    "length of the columns being encoded "
+                    f"({len(result_df.columns)})."
+                )
+                raise ValueError(len_msg)
+            result_df.columns = [
+                f"{prefix[col]}{prefix_sep}{col}" for col in result_df.columns
+            ]
+        elif isinstance(prefix, list):
+            if len(prefix) != len(result_df.columns):
+                len_msg = (
+                    f"Length of 'prefix' ({len(prefix)}) did not match the "
+                    "length of the columns being encoded "
+                    f"({len(result_df.columns)})."
+                )
+                raise ValueError(len_msg)
+            result_df.columns = [
+                f"{prefix[i]}{prefix_sep}{col}"
+                for i, col in enumerate(result_df.columns)
+            ]
+
+        if isinstance(self._data, ABCIndex):
+            return MultiIndex.from_frame(result_df)
+
+        result_df.attrs = self._data.attrs
+        if dtype is not None and not sparse:
+            return result_df.astype(dtype)
+        return result_df
 
     @forbid_nonstring_types(["bytes"])
     def translate(self, table):
diff --git a/pandas/core/strings/base.py b/pandas/core/strings/base.py
index c1f94abff428a..fab90b5c5baf9 100644
--- a/pandas/core/strings/base.py
+++ b/pandas/core/strings/base.py
@@ -160,10 +160,6 @@ def _str_translate(self, table):
     def _str_wrap(self, width: int, **kwargs):
         pass
 
-    @abc.abstractmethod
-    def _str_get_dummies(self, sep: str = "|"):
-        pass
-
     @abc.abstractmethod
     def _str_isalnum(self):
         pass
diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py
index bdcf55e61d2d1..f10863f847a4c 100644
--- a/pandas/core/strings/object_array.py
+++ b/pandas/core/strings/object_array.py
@@ -1,6 +1,5 @@
 from __future__ import annotations
 
-import functools
 import re
 import textwrap
 from typing import (
@@ -372,32 +371,6 @@ def _str_wrap(self, width: int, **kwargs):
         tw = textwrap.TextWrapper(**kwargs)
         return self._str_map(lambda s: "\n".join(tw.wrap(s)))
 
-    def _str_get_dummies(self, sep: str = "|"):
-        from pandas import Series
-
-        arr = Series(self).fillna("")
-        try:
-            arr = sep + arr + sep
-        except (TypeError, NotImplementedError):
-            arr = sep + arr.astype(str) + sep
-
-        tags: set[str] = set()
-        for ts in Series(arr, copy=False).str.split(sep):
-            tags.update(ts)
-        tags2 = sorted(tags - {""})
-
-        dummies = np.empty((len(arr), len(tags2)), dtype=np.int64)
-
-        def _isin(test_elements: str, element: str) -> bool:
-            return element in test_elements
-
-        for i, t in enumerate(tags2):
-            pat = sep + t + sep
-            dummies[:, i] = lib.map_infer(
-                arr.to_numpy(), functools.partial(_isin, element=pat)
-            )
-        return dummies, tags2
-
     def _str_upper(self):
         return self._str_map(lambda x: x.upper())
diff --git a/pandas/tests/strings/conftest.py b/pandas/tests/strings/conftest.py
index 92b7b16da3c1f..b796a80ff6a23 100644
--- a/pandas/tests/strings/conftest.py
+++ b/pandas/tests/strings/conftest.py
@@ -96,7 +96,10 @@
     )
 )
 ids, _, _ = zip(*_any_string_method)  # use method name as fixture-id
-missing_methods = {f for f in dir(StringMethods) if not f.startswith("_")} - set(ids)
+NON_METHODS = {"TYPE_CHECKING", "Iterable"}
+missing_methods = (
+    {f for f in dir(StringMethods) if not f.startswith("_")} - set(ids) - NON_METHODS
+)
 
 # test that the above list captures all methods of StringMethods
 assert not missing_methods
diff --git a/pandas/tests/strings/test_get_dummies.py b/pandas/tests/strings/test_get_dummies.py
index 31386e4e342ae..61c232d3634a3 100644
--- a/pandas/tests/strings/test_get_dummies.py
+++ b/pandas/tests/strings/test_get_dummies.py
@@ -5,6 +5,7 @@
     Index,
     MultiIndex,
     Series,
+    SparseDtype,
     _testing as tm,
 )
 
@@ -51,3 +52,71 @@ def test_get_dummies_with_name_dummy_index():
         [(1, 1, 0, 0), (0, 0, 1, 1), (0, 1, 0, 1)], names=("a", "b", "c", "name")
     )
     tm.assert_index_equal(result, expected)
+
+
+def test_get_dummies_with_prefix(any_string_dtype):
+    s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)
+    result = s.str.get_dummies(sep="|", prefix="prefix")
+    expected = DataFrame(
+        [[1, 1, 0], [1, 0, 1], [0, 0, 0]],
+        columns=["prefix_a", "prefix_b", "prefix_c"],
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_get_dummies_with_prefix_sep(any_string_dtype):
+    s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)
+    result = s.str.get_dummies(sep="|", prefix=None, prefix_sep="__")
+    expected = DataFrame([[1, 1, 0], [1, 0, 1], [0, 0, 0]], columns=["a", "b", "c"])
+    tm.assert_frame_equal(result, expected)
+
+    result = s.str.get_dummies(sep="|", prefix="col", prefix_sep="__")
+    expected = DataFrame(
+        [[1, 1, 0], [1, 0, 1], [0, 0, 0]],
+        columns=["col__a", "col__b", "col__c"],
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_get_dummies_with_dummy_na(any_string_dtype):
+    s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)
+    result = s.str.get_dummies(sep="|", dummy_na=True)
+    expected = DataFrame(
+        [[1, 1, 0, 0], [1, 0, 1, 0], [0, 0, 0, 1]],
+        columns=["a", "b", "c", np.nan],
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_get_dummies_with_sparse(any_string_dtype):
+    s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)
+    result = s.str.get_dummies(sep="|", sparse=True)
+    expected = DataFrame(
+        [[1, 1, 0], [1, 0, 1], [0, 0, 0]],
+        columns=["a", "b", "c"],
+        dtype="Sparse[int]",
+    )
+    tm.assert_frame_equal(result, expected)
+    assert all(isinstance(dtype, SparseDtype) for dtype in result.dtypes)
+
+
+def test_get_dummies_with_dtype(any_string_dtype):
+    s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)
+    result = s.str.get_dummies(sep="|", dtype=bool)
+    expected = DataFrame(
+        [[True, True, False], [True, False, True], [False, False, False]],
+        columns=["a", "b", "c"],
+    )
+    tm.assert_frame_equal(result, expected)
+    assert (result.dtypes == bool).all()
+
+
+def test_get_dummies_with_prefix_dict(any_string_dtype):
+    s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)
+    prefix = {"a": "alpha", "b": "beta", "c": "gamma"}
+    result = s.str.get_dummies(sep="|", prefix=prefix)
+    expected = DataFrame(
+        [[1, 1, 0], [1, 0, 1], [0, 0, 0]],
+        columns=["alpha_a", "beta_b", "gamma_c"],
+    )
+    tm.assert_frame_equal(result, expected)