From 46ce29b2ded65be02790dd7bca2b03c6ee499806 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 9 Sep 2024 14:09:18 +0200 Subject: [PATCH 1/2] String dtype: fix isin() values handling for python storage --- pandas/conftest.py | 9 +++++++- pandas/core/arrays/string_.py | 18 ++++++++++++++++ pandas/tests/arrays/string_/test_string.py | 24 +++++++++++++++++----- 3 files changed, 45 insertions(+), 6 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index d11213f1164bc..ca707454b528d 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1310,7 +1310,13 @@ def string_storage(request): pytest.param(("pyarrow", pd.NA), marks=td.skip_if_no("pyarrow")), pytest.param(("pyarrow", np.nan), marks=td.skip_if_no("pyarrow")), ("python", np.nan), - ] + ], + ids=[ + "string=string[python]", + "string=string[pyarrow]", + "string=str[pyarrow]", + "string=str[python]", + ], ) def string_dtype_arguments(request): """ @@ -1341,6 +1347,7 @@ def dtype_backend(request): # Alias so we can test with cartesian product of string_storage string_storage2 = string_storage +string_dtype_arguments2 = string_dtype_arguments @pytest.fixture(params=tm.BYTES_DTYPES) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 88fd1481031f8..9aee1a3be1975 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -46,6 +46,7 @@ nanops, ops, ) +from pandas.core.algorithms import isin from pandas.core.array_algos import masked_reductions from pandas.core.arrays.base import ExtensionArray from pandas.core.arrays.floating import ( @@ -65,6 +66,7 @@ import pyarrow from pandas._typing import ( + ArrayLike, AxisInt, Dtype, DtypeObj, @@ -731,6 +733,22 @@ def _putmask(self, mask: npt.NDArray[np.bool_], value) -> None: # base class implementation that uses __setitem__ ExtensionArray._putmask(self, mask, value) + def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: + if not isinstance(values, BaseStringArray): + if not 
lib.is_string_array(np.asarray(values), skipna=True): + values = np.array( + [val for val in values if isinstance(val, str) or isna(val)], + dtype=object, + ) + if not len(values): + return np.zeros(self.shape, dtype=bool) + + values = self._from_sequence(values, dtype=self.dtype) + else: + values = values.astype(self.dtype, copy=False) + + return isin(np.asarray(self), np.asarray(values)) + def astype(self, dtype, copy: bool = True): dtype = pandas_dtype(dtype) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index dd87dbf8e9a43..af45b98d46695 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -30,6 +30,12 @@ def dtype(string_dtype_arguments): return pd.StringDtype(storage=storage, na_value=na_value) +@pytest.fixture +def dtype2(string_dtype_arguments2): + storage, na_value = string_dtype_arguments2 + return pd.StringDtype(storage=storage, na_value=na_value) + + @pytest.fixture def cls(dtype): """Fixture giving array type from parametrized 'dtype'""" @@ -665,11 +671,7 @@ def test_isin(dtype, fixed_now_ts): tm.assert_series_equal(result, expected) result = s.isin(["a", pd.NA]) - if dtype.storage == "python" and dtype.na_value is np.nan: - # TODO(infer_string) we should make this consistent - expected = pd.Series([True, False, False]) - else: - expected = pd.Series([True, False, True]) + expected = pd.Series([True, False, True]) tm.assert_series_equal(result, expected) result = s.isin([]) @@ -681,6 +683,18 @@ def test_isin(dtype, fixed_now_ts): tm.assert_series_equal(result, expected) +def test_isin_string_array(dtype, dtype2): + s = pd.Series(["a", "b", None], dtype=dtype) + + result = s.isin(pd.array(["a", "c"], dtype=dtype2)) + expected = pd.Series([True, False, False]) + tm.assert_series_equal(result, expected) + + result = s.isin(pd.array(["a", None], dtype=dtype2)) + expected = pd.Series([True, False, True]) + tm.assert_series_equal(result, expected) + 
+ def test_setitem_scalar_with_mask_validation(dtype): # https://github.com/pandas-dev/pandas/issues/47628 # setting None with a boolean mask (through _putmaks) should still result From 5a2d4e42c2e733bef6648bbba9ac2ae8624fc0f0 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 10 Sep 2024 21:05:54 +0200 Subject: [PATCH 2/2] address review feedback: treat string-dtype ExtensionArrays as string values in isin(); add tests --- pandas/core/arrays/string_.py | 8 +++++--- pandas/tests/arrays/string_/test_string.py | 17 +++++++++++++++++ 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 9aee1a3be1975..f7a63a0b4dcfd 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -734,7 +734,11 @@ def _putmask(self, mask: npt.NDArray[np.bool_], value) -> None: ExtensionArray._putmask(self, mask, value) def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: - if not isinstance(values, BaseStringArray): + if isinstance(values, BaseStringArray) or ( + isinstance(values, ExtensionArray) and is_string_dtype(values.dtype) + ): + values = values.astype(self.dtype, copy=False) + else: if not lib.is_string_array(np.asarray(values), skipna=True): values = np.array( [val for val in values if isinstance(val, str) or isna(val)], @@ -744,8 +748,6 @@ def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: return np.zeros(self.shape, dtype=bool) values = self._from_sequence(values, dtype=self.dtype) - else: - values = values.astype(self.dtype, copy=False) return isin(np.asarray(self), np.asarray(values)) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index af45b98d46695..57ab0e008013f 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -682,6 +682,10 @@ def test_isin(dtype, fixed_now_ts): expected = pd.Series([True, False, False]) tm.assert_series_equal(result, expected) + result = s.isin([fixed_now_ts]) + expected = pd.Series([False, False, 
False]) + tm.assert_series_equal(result, expected) + def test_isin_string_array(dtype, dtype2): s = pd.Series(["a", "b", None], dtype=dtype) @@ -695,6 +699,19 @@ def test_isin_string_array(dtype, dtype2): tm.assert_series_equal(result, expected) +def test_isin_arrow_string_array(dtype): + pa = pytest.importorskip("pyarrow") + s = pd.Series(["a", "b", None], dtype=dtype) + + result = s.isin(pd.array(["a", "c"], dtype=pd.ArrowDtype(pa.string()))) + expected = pd.Series([True, False, False]) + tm.assert_series_equal(result, expected) + + result = s.isin(pd.array(["a", None], dtype=pd.ArrowDtype(pa.string()))) + expected = pd.Series([True, False, True]) + tm.assert_series_equal(result, expected) + + def test_setitem_scalar_with_mask_validation(dtype): # https://github.com/pandas-dev/pandas/issues/47628 # setting None with a boolean mask (through _putmaks) should still result