From 7ab6443a3dcc131b781d75578d9bdca98c88d719 Mon Sep 17 00:00:00 2001 From: tp Date: Sun, 15 Nov 2020 17:12:06 +0000 Subject: [PATCH 01/26] BUG: membership checks on ExtensionArray containing NA values --- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/core/arrays/base.py | 14 +++++++++++++- pandas/tests/arrays/categorical/test_operators.py | 12 ++++++++++++ 3 files changed, 26 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 6f046d3a9379d..19e322c8ea75d 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -752,6 +752,7 @@ ExtensionArray - Fixed bug when applying a NumPy ufunc with multiple outputs to an :class:`.IntegerArray` returning None (:issue:`36913`) - Fixed an inconsistency in :class:`.PeriodArray`'s ``__init__`` signature to those of :class:`.DatetimeArray` and :class:`.TimedeltaArray` (:issue:`37289`) - Reductions for :class:`.BooleanArray`, :class:`.Categorical`, :class:`.DatetimeArray`, :class:`.FloatingArray`, :class:`.IntegerArray`, :class:`.PeriodArray`, :class:`.TimedeltaArray`, and :class:`.PandasArray` are now keyword-only methods (:issue:`37541`) +- Fixed a bug where a `ValueError` was wrongly raised if a membership check was made on an `ExtensionArray` containing nan-like values (:issue:`37867`) Other ^^^^^ diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 448025e05422d..9ecd7cf952100 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -41,7 +41,7 @@ ) from pandas.core.dtypes.dtypes import ExtensionDtype from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries -from pandas.core.dtypes.missing import isna +from pandas.core.dtypes.missing import isna, notna from pandas.core import ops from pandas.core.algorithms import factorize_array, unique @@ -354,6 +354,18 @@ def __iter__(self): for i in range(len(self)): yield self[i] + def __contains__(self, item) -> bool: + """ + Return for `item in self`. + """ + # comparisons of any item to pd.NA always return pd.NA, so e.g. "a" in [pd.NA] + # would raise a TypeError. The implementation below works around that. + if isna(item): + return isna(self).any() if self._can_hold_na else False + + arr = self[notna(self)] if self._can_hold_na else self + return item in iter(arr) + def __eq__(self, other: Any) -> ArrayLike: """ Return for `self == other` (element-wise equality). diff --git a/pandas/tests/arrays/categorical/test_operators.py b/pandas/tests/arrays/categorical/test_operators.py index 328b5771e617c..7769dd53c0d99 100644 --- a/pandas/tests/arrays/categorical/test_operators.py +++ b/pandas/tests/arrays/categorical/test_operators.py @@ -395,3 +395,15 @@ def test_numeric_like_ops(self): msg = "Object with dtype category cannot perform the numpy op log" with pytest.raises(TypeError, match=msg): np.log(s) + + def test_contains(self, ordered): + # GH-xxxxx + cat = Categorical(["a", "b"], ordered=ordered) + assert "a" in cat + assert "x" not in cat + assert pd.NA not in cat + + cat = Categorical([np.nan, "a"], ordered=ordered) + assert "a" in cat + assert "x" not in cat + assert pd.NA in cat From 7986bc4002b0a3eb09626bd11081520178641847 Mon Sep 17 00:00:00 2001 From: tp Date: Sun, 15 Nov 2020 17:29:38 +0000 Subject: [PATCH 02/26] add GH-number, simplify implementation --- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/core/arrays/base.py | 5 ++--- pandas/tests/arrays/categorical/test_operators.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 19e322c8ea75d..62045c050fb8f 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -753,6 +753,7 @@ ExtensionArray - Fixed an inconsistency in :class:`.PeriodArray`'s ``__init__`` signature to those of :class:`.DatetimeArray` and :class:`.TimedeltaArray` (:issue:`37289`) - Reductions for :class:`.BooleanArray`, :class:`.Categorical`, :class:`.DatetimeArray`, :class:`.FloatingArray`, :class:`.IntegerArray`, :class:`.PeriodArray`, :class:`.TimedeltaArray`, and :class:`.PandasArray` are now keyword-only methods (:issue:`37541`) - Fixed a bug where a `ValueError` was wrongly raised if a membership check was made on an `ExtensionArray` containing nan-like values (:issue:`37867`) +- Bug, where a `ValueError` was wrongly raised if a membership check was made on an `ExtensionArray` with :class:`NA` values, but without a custom ``__contains__`` method (:issue:`37867`) Other ^^^^^ diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 9ecd7cf952100..837b230a9dd8a 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -362,9 +362,8 @@ def __contains__(self, item) -> bool: # would raise a TypeError. The implementation below works around that. if isna(item): return isna(self).any() if self._can_hold_na else False - - arr = self[notna(self)] if self._can_hold_na else self - return item in iter(arr) + else: + return (item == self).any() def __eq__(self, other: Any) -> ArrayLike: """ diff --git a/pandas/tests/arrays/categorical/test_operators.py b/pandas/tests/arrays/categorical/test_operators.py index 7769dd53c0d99..2eb35909a0b4e 100644 --- a/pandas/tests/arrays/categorical/test_operators.py +++ b/pandas/tests/arrays/categorical/test_operators.py @@ -397,7 +397,7 @@ def test_numeric_like_ops(self): np.log(s) def test_contains(self, ordered): - # GH-xxxxx + # GH-37867 cat = Categorical(["a", "b"], ordered=ordered) assert "a" in cat assert "x" not in cat From 466f7cc415af761cd80c3eaa0b0f66585286cd75 Mon Sep 17 00:00:00 2001 From: tp Date: Sun, 15 Nov 2020 17:32:49 +0000 Subject: [PATCH 03/26] flake8 fix --- pandas/core/arrays/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 837b230a9dd8a..f4476795e9ece 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -41,7 +41,7 @@ ) from pandas.core.dtypes.dtypes import ExtensionDtype from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries -from pandas.core.dtypes.missing import isna, notna +from pandas.core.dtypes.missing import isna from pandas.core import ops from pandas.core.algorithms import factorize_array, unique From a9b75dd14cbdcfa5af3b962e208c086c49abc50e Mon Sep 17 00:00:00 2001 From: tp Date: Sun, 15 Nov 2020 17:44:37 +0000 Subject: [PATCH 04/26] NA membership should return False --- pandas/core/arrays/base.py | 2 +- pandas/tests/arrays/categorical/test_operators.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index f4476795e9ece..b7124a9866a07 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -361,7 +361,7 @@ def __contains__(self, item) -> bool: # comparisons of any item to pd.NA always return pd.NA, so e.g. "a" in [pd.NA] # would raise a TypeError. The implementation below works around that. if isna(item): - return isna(self).any() if self._can_hold_na else False + return False else: return (item == self).any() diff --git a/pandas/tests/arrays/categorical/test_operators.py b/pandas/tests/arrays/categorical/test_operators.py index 2eb35909a0b4e..efbdf221078cd 100644 --- a/pandas/tests/arrays/categorical/test_operators.py +++ b/pandas/tests/arrays/categorical/test_operators.py @@ -406,4 +406,4 @@ def test_contains(self, ordered): cat = Categorical([np.nan, "a"], ordered=ordered) assert "a" in cat assert "x" not in cat - assert pd.NA in cat + assert pd.NA not in cat From 9d0eca10926e4add1e19e845b7072973098310c3 Mon Sep 17 00:00:00 2001 From: tp Date: Sun, 15 Nov 2020 20:01:53 +0000 Subject: [PATCH 05/26] NA in arr should return True if arr contains NA --- pandas/core/arrays/base.py | 2 +- pandas/tests/arrays/categorical/test_operators.py | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index b7124a9866a07..f4476795e9ece 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -361,7 +361,7 @@ def __contains__(self, item) -> bool: # comparisons of any item to pd.NA always return pd.NA, so e.g. "a" in [pd.NA] # would raise a TypeError. The implementation below works around that. if isna(item): - return False + return isna(self).any() if self._can_hold_na else False else: return (item == self).any() diff --git a/pandas/tests/arrays/categorical/test_operators.py b/pandas/tests/arrays/categorical/test_operators.py index efbdf221078cd..3229ea1533b4a 100644 --- a/pandas/tests/arrays/categorical/test_operators.py +++ b/pandas/tests/arrays/categorical/test_operators.py @@ -406,4 +406,9 @@ def test_contains(self, ordered): cat = Categorical([np.nan, "a"], ordered=ordered) assert "a" in cat assert "x" not in cat - assert pd.NA not in cat + assert pd.NA in cat + + cat = cat[::-1] + assert "a" in cat + assert "x" not in cat + assert pd.NA in cat From b0b32ab1397059a651d816300c29e7c9d19a6155 Mon Sep 17 00:00:00 2001 From: tp Date: Tue, 24 Nov 2020 18:49:31 +0000 Subject: [PATCH 06/26] various --- pandas/core/arrays/base.py | 4 ++-- .../tests/arrays/categorical/test_operators.py | 3 +++ pandas/tests/arrays/string_/test_string.py | 16 ++++++++++++++++ 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index f4476795e9ece..ead945f9174c9 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -41,7 +41,7 @@ ) from pandas.core.dtypes.dtypes import ExtensionDtype from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries -from pandas.core.dtypes.missing import isna +from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna from pandas.core import ops from pandas.core.algorithms import factorize_array, unique @@ -360,7 +360,7 @@ def __contains__(self, item) -> bool: """ # comparisons of any item to pd.NA always return pd.NA, so e.g. "a" in [pd.NA] # would raise a TypeError. The implementation below works around that. - if isna(item): + if is_valid_nat_for_dtype(item, self.dtype): return isna(self).any() if self._can_hold_na else False else: return (item == self).any() diff --git a/pandas/tests/arrays/categorical/test_operators.py b/pandas/tests/arrays/categorical/test_operators.py index 3229ea1533b4a..216fd73be5775 100644 --- a/pandas/tests/arrays/categorical/test_operators.py +++ b/pandas/tests/arrays/categorical/test_operators.py @@ -401,14 +401,17 @@ def test_contains(self, ordered): cat = Categorical(["a", "b"], ordered=ordered) assert "a" in cat assert "x" not in cat + assert np.nan not in cat assert pd.NA not in cat cat = Categorical([np.nan, "a"], ordered=ordered) assert "a" in cat assert "x" not in cat + assert np.nan in cat assert pd.NA in cat cat = cat[::-1] assert "a" in cat assert "x" not in cat + assert np.nan in cat assert pd.NA in cat diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 07e9484994c26..33b54019bf86d 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -524,3 +524,19 @@ def test_to_numpy_na_value(dtype, nulls_fixture): result = arr.to_numpy(na_value=na_value) expected = np.array(["a", na_value, "b"], dtype=object) tm.assert_numpy_array_equal(result, expected) + + +def test_contains(): + # GH-xxxxx + arr = pd.array(np.array(["a", "b"], dtype="string")) + + assert "a" in arr + assert "x" not in arr + assert np.nan not in arr + assert pd.NA not in arr + + arr = pd.arrays.StringArray(np.array(["a", pd.NA])) + assert "a" in arr + assert "x" not in arr + assert np.nan in arr + assert pd.NA in arr From c6e42d2c0c4ada7f1e2bd31790528575e1962274 Mon Sep 17 00:00:00 2001 From: tp Date: Tue, 24 Nov 2020 19:35:21 +0000 Subject: [PATCH 07/26] small fixes --- pandas/tests/arrays/string_/test_string.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 33b54019bf86d..5ebc2f0891a05 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -528,14 +528,14 @@ def test_to_numpy_na_value(dtype, nulls_fixture): def test_contains(): # GH-xxxxx - arr = pd.array(np.array(["a", "b"], dtype="string")) + arr = pd.array(["a", "b"], dtype="string") assert "a" in arr assert "x" not in arr assert np.nan not in arr assert pd.NA not in arr - arr = pd.arrays.StringArray(np.array(["a", pd.NA])) + arr = pd.array(["a", pd.NA], dtype="string") assert "a" in arr assert "x" not in arr assert np.nan in arr From 75c45bc50b47f066b8146832589a2e0316cacfb0 Mon Sep 17 00:00:00 2001 From: tp Date: Wed, 25 Nov 2020 19:33:37 +0000 Subject: [PATCH 08/26] stricter na_value requirements --- pandas/core/arrays/base.py | 5 ++++- pandas/tests/arrays/string_/test_string.py | 4 ++-- pandas/tests/extension/base/interface.py | 17 +++++++++++++++++ pandas/tests/extension/json/test_json.py | 5 +++++ 4 files changed, 28 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index ead945f9174c9..58f441e581545 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -37,6 +37,7 @@ is_array_like, is_dtype_equal, is_list_like, + is_scalar, pandas_dtype, ) from pandas.core.dtypes.dtypes import ExtensionDtype @@ -360,8 +361,10 @@ def __contains__(self, item) -> bool: """ # comparisons of any item to pd.NA always return pd.NA, so e.g. "a" in [pd.NA] # would raise a TypeError. The implementation below works around that. - if is_valid_nat_for_dtype(item, self.dtype): + if item is self.dtype.na_value: return isna(self).any() if self._can_hold_na else False + elif is_scalar(item) and isna(item): + return False else: return (item == self).any() diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 5ebc2f0891a05..f987a153c5b0c 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -527,7 +527,7 @@ def test_to_numpy_na_value(dtype, nulls_fixture): def test_contains(): - # GH-xxxxx + # GH-37867 arr = pd.array(["a", "b"], dtype="string") assert "a" in arr @@ -538,5 +538,5 @@ def test_contains(): arr = pd.array(["a", pd.NA], dtype="string") assert "a" in arr assert "x" not in arr - assert np.nan in arr + assert np.nan not in arr assert pd.NA in arr diff --git a/pandas/tests/extension/base/interface.py b/pandas/tests/extension/base/interface.py index 9ae4b01508d79..f099138403421 100644 --- a/pandas/tests/extension/base/interface.py +++ b/pandas/tests/extension/base/interface.py @@ -29,6 +29,23 @@ def test_can_hold_na_valid(self, data): # GH-20761 assert data._can_hold_na is True + def test_contains(self, data): + # GH-37867 + scalar = data[~data.isna()][0] + + assert scalar in data + + na_value = data.dtype.na_value + other_na_value_types = {np.nan, pd.NA, pd.NaT}.difference({na_value}) + if data.isna().any(): + assert na_value in data + for na_value_type in other_na_value_types: + assert na_value_type not in data + else: + assert na_value not in data + for na_value_type in other_na_value_types: + assert na_value_type not in data + def test_memory_usage(self, data): s = pd.Series(data) result = s.memory_usage(index=False) diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index 74ca341e27bf8..ac065a9245dec 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -143,6 +143,11 @@ def test_custom_asserts(self): with pytest.raises(AssertionError, match=msg): self.assert_frame_equal(a.to_frame(), b.to_frame()) + @pytest.mark.xfail(reason="comparison method not implemented on JSONArray") + def test_contains(self, data): + # GH-37867 + super().test_contains(data) + class TestConstructors(BaseJSON, base.BaseConstructorsTests): @pytest.mark.skip(reason="not implemented constructor from dtype") From 83c9fe4744ffd225f82f2f808674a2aeeabae8d8 Mon Sep 17 00:00:00 2001 From: tp Date: Wed, 25 Nov 2020 19:43:11 +0000 Subject: [PATCH 09/26] flake8 cleanup --- pandas/core/arrays/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 58f441e581545..80ef701325f85 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -42,7 +42,7 @@ ) from pandas.core.dtypes.dtypes import ExtensionDtype from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries -from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna +from pandas.core.dtypes.missing import isna from pandas.core import ops from pandas.core.algorithms import factorize_array, unique From 08c4c984f1bf4f81906b598dcb8768f533f15ecf Mon Sep 17 00:00:00 2001 From: tp Date: Wed, 25 Nov 2020 20:03:16 +0000 Subject: [PATCH 10/26] add None to BaseInterfaceTests.test_contains --- pandas/tests/extension/base/interface.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/extension/base/interface.py b/pandas/tests/extension/base/interface.py index f099138403421..0ec0503c531dc 100644 --- a/pandas/tests/extension/base/interface.py +++ b/pandas/tests/extension/base/interface.py @@ -36,7 +36,7 @@ def test_contains(self, data): assert scalar in data na_value = data.dtype.na_value - other_na_value_types = {np.nan, pd.NA, pd.NaT}.difference({na_value}) + other_na_value_types = {None, np.nan, pd.NA, pd.NaT}.difference({na_value}) if data.isna().any(): assert na_value in data for na_value_type in other_na_value_types: From 5a23b1dc73744994a8f424291722838bde46811d Mon Sep 17 00:00:00 2001 From: tp Date: Wed, 25 Nov 2020 20:06:14 +0000 Subject: [PATCH 11/26] simpify tests --- pandas/tests/extension/base/interface.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/tests/extension/base/interface.py b/pandas/tests/extension/base/interface.py index 0ec0503c531dc..2e998d7ee3c0c 100644 --- a/pandas/tests/extension/base/interface.py +++ b/pandas/tests/extension/base/interface.py @@ -37,14 +37,14 @@ def test_contains(self, data): na_value = data.dtype.na_value other_na_value_types = {None, np.nan, pd.NA, pd.NaT}.difference({na_value}) + if data.isna().any(): assert na_value in data - for na_value_type in other_na_value_types: - assert na_value_type not in data else: assert na_value not in data - for na_value_type in other_na_value_types: - assert na_value_type not in data + + for na_value_type in other_na_value_types: + assert na_value_type not in data def test_memory_usage(self, data): s = pd.Series(data) From 4b0c2008fc9174366af4feb6e7085870b3f7b328 Mon Sep 17 00:00:00 2001 From: tp Date: Wed, 25 Nov 2020 22:44:06 +0000 Subject: [PATCH 12/26] fix pyarrow issue --- doc/source/whatsnew/v1.2.0.rst | 1 - pandas/tests/extension/base/interface.py | 5 +++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 62045c050fb8f..19e322c8ea75d 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -753,7 +753,6 @@ ExtensionArray - Fixed an inconsistency in :class:`.PeriodArray`'s ``__init__`` signature to those of :class:`.DatetimeArray` and :class:`.TimedeltaArray` (:issue:`37289`) - Reductions for :class:`.BooleanArray`, :class:`.Categorical`, :class:`.DatetimeArray`, :class:`.FloatingArray`, :class:`.IntegerArray`, :class:`.PeriodArray`, :class:`.TimedeltaArray`, and :class:`.PandasArray` are now keyword-only methods (:issue:`37541`) - Fixed a bug where a `ValueError` was wrongly raised if a membership check was made on an `ExtensionArray` containing nan-like values (:issue:`37867`) -- Bug, where a `ValueError` was wrongly raised if a membership check was made on an `ExtensionArray` with :class:`NA` values, but without a custom ``__contains__`` method (:issue:`37867`) Other ^^^^^ diff --git a/pandas/tests/extension/base/interface.py b/pandas/tests/extension/base/interface.py index 2e998d7ee3c0c..16abb78ca4cdd 100644 --- a/pandas/tests/extension/base/interface.py +++ b/pandas/tests/extension/base/interface.py @@ -36,14 +36,15 @@ def test_contains(self, data): assert scalar in data na_value = data.dtype.na_value - other_na_value_types = {None, np.nan, pd.NA, pd.NaT}.difference({na_value}) if data.isna().any(): assert na_value in data else: assert na_value not in data - for na_value_type in other_na_value_types: + for na_value_type in {None, np.nan, pd.NA, pd.NaT}: + if na_value_type is na_value: + continue assert na_value_type not in data def test_memory_usage(self, data): From 92604e99151761e47251e4fa1fe0f6c0d3c365ce Mon Sep 17 00:00:00 2001 From: tp Date: Wed, 25 Nov 2020 23:14:25 +0000 Subject: [PATCH 13/26] rst-backticks --- doc/source/whatsnew/v1.2.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 19e322c8ea75d..49e2f7f5fc484 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -752,7 +752,7 @@ ExtensionArray - Fixed bug when applying a NumPy ufunc with multiple outputs to an :class:`.IntegerArray` returning None (:issue:`36913`) - Fixed an inconsistency in :class:`.PeriodArray`'s ``__init__`` signature to those of :class:`.DatetimeArray` and :class:`.TimedeltaArray` (:issue:`37289`) - Reductions for :class:`.BooleanArray`, :class:`.Categorical`, :class:`.DatetimeArray`, :class:`.FloatingArray`, :class:`.IntegerArray`, :class:`.PeriodArray`, :class:`.TimedeltaArray`, and :class:`.PandasArray` are now keyword-only methods (:issue:`37541`) -- Fixed a bug where a `ValueError` was wrongly raised if a membership check was made on an `ExtensionArray` containing nan-like values (:issue:`37867`) +- Fixed a bug where a ``ValueError`` was wrongly raised if a membership check was made on an ``ExtensionArray`` containing nan-like values (:issue:`37867`) Other ^^^^^ From 8a24f0d10e861c15ef8f362928354d86026e6143 Mon Sep 17 00:00:00 2001 From: tp Date: Thu, 26 Nov 2020 15:54:12 +0000 Subject: [PATCH 14/26] add tests for nan-likes --- pandas/tests/extension/base/interface.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/pandas/tests/extension/base/interface.py b/pandas/tests/extension/base/interface.py index 16abb78ca4cdd..2154a43a5d6c8 100644 --- a/pandas/tests/extension/base/interface.py +++ b/pandas/tests/extension/base/interface.py @@ -31,6 +31,25 @@ def test_can_hold_na_valid(self, data): def test_contains(self, data): # GH-37867 + + data = data[~data.isna()] + + scalar = data[0] + + assert scalar in data + assert "124jhujbhjhb5" not in data + + na_value = data.dtype.na_value + + assert na_value not in data + + for na_value_type in {None, np.nan, pd.NA, pd.NaT}: + assert na_value_type not in data + + def test_contains_nan(self, data_missing): + # GH-37867 + data = data_missing + scalar = data[~data.isna()][0] assert scalar in data From fdb9deb8bd7ff3cb1778a9a349a46a074d0d3a8b Mon Sep 17 00:00:00 2001 From: tp Date: Thu, 26 Nov 2020 16:13:56 +0000 Subject: [PATCH 15/26] minor changes --- pandas/core/arrays/base.py | 2 +- pandas/tests/extension/base/interface.py | 48 +++++++++--------------- 2 files changed, 19 insertions(+), 31 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 80ef701325f85..4ef9be4eccf8c 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -362,7 +362,7 @@ def __contains__(self, item) -> bool: # comparisons of any item to pd.NA always return pd.NA, so e.g. "a" in [pd.NA] # would raise a TypeError. The implementation below works around that. if item is self.dtype.na_value: - return isna(self).any() if self._can_hold_na else False + return self.isna().any() if self._can_hold_na else False elif is_scalar(item) and isna(item): return False else: diff --git a/pandas/tests/extension/base/interface.py b/pandas/tests/extension/base/interface.py index 2154a43a5d6c8..36b65bb2c67bf 100644 --- a/pandas/tests/extension/base/interface.py +++ b/pandas/tests/extension/base/interface.py @@ -29,42 +29,30 @@ def test_can_hold_na_valid(self, data): # GH-20761 assert data._can_hold_na is True - def test_contains(self, data): + def test_contains(self, data, data_missing): # GH-37867 + # Tests for membership checks. Membership checks for nan-likes is tricky and + # the settled on rule is: `nan_like in arr` is True if nan_like is + # arr.dtype.na_value and arr.isna().any() is True. Else the check returns False. - data = data[~data.isna()] + for this_data in [data, data_missing]: + scalar = this_data[~this_data.isna()][0] - scalar = data[0] + assert scalar in this_data + assert "124jhujbhjhb5" not in data - assert scalar in data - assert "124jhujbhjhb5" not in data + na_value = this_data.dtype.na_value - na_value = data.dtype.na_value + if this_data.isna().any(): + assert na_value in this_data + else: + assert na_value not in this_data - assert na_value not in data - - for na_value_type in {None, np.nan, pd.NA, pd.NaT}: - assert na_value_type not in data - - def test_contains_nan(self, data_missing): - # GH-37867 - data = data_missing - - scalar = data[~data.isna()][0] - - assert scalar in data - - na_value = data.dtype.na_value - - if data.isna().any(): - assert na_value in data - else: - assert na_value not in data - - for na_value_type in {None, np.nan, pd.NA, pd.NaT}: - if na_value_type is na_value: - continue - assert na_value_type not in data + # this_data can never contain other nan-likes than na_value + for na_value_type in {None, np.nan, pd.NA, pd.NaT}: + if na_value_type is na_value: + continue + assert na_value_type not in this_data def test_memory_usage(self, data): s = pd.Series(data) From 52e2b43370dd1885cf54e917eaa147b2435787b3 Mon Sep 17 00:00:00 2001 From: tp Date: Thu, 26 Nov 2020 23:32:01 +0000 Subject: [PATCH 16/26] minor issues --- pandas/core/arrays/base.py | 1 + pandas/tests/extension/base/interface.py | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 4ef9be4eccf8c..76bc0b7b3eaf6 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -359,6 +359,7 @@ def __contains__(self, item) -> bool: """ Return for `item in self`. """ + # GH37867 # comparisons of any item to pd.NA always return pd.NA, so e.g. "a" in [pd.NA] # would raise a TypeError. The implementation below works around that. if item is self.dtype.na_value: diff --git a/pandas/tests/extension/base/interface.py b/pandas/tests/extension/base/interface.py index 36b65bb2c67bf..43fc981f8d9eb 100644 --- a/pandas/tests/extension/base/interface.py +++ b/pandas/tests/extension/base/interface.py @@ -39,7 +39,6 @@ def test_contains(self, data, data_missing): scalar = this_data[~this_data.isna()][0] assert scalar in this_data - assert "124jhujbhjhb5" not in data na_value = this_data.dtype.na_value From f21890eec1e9591b3f1755139e6d3b754dc54a09 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Fri, 27 Nov 2020 08:29:40 +0000 Subject: [PATCH 17/26] Update pandas/tests/extension/base/interface.py Co-authored-by: Joris Van den Bossche --- pandas/tests/extension/base/interface.py | 35 ++++++++++++------------ 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/pandas/tests/extension/base/interface.py b/pandas/tests/extension/base/interface.py index 43fc981f8d9eb..ced3d44e9a37d 100644 --- a/pandas/tests/extension/base/interface.py +++ b/pandas/tests/extension/base/interface.py @@ -35,23 +35,24 @@ def test_contains(self, data, data_missing): # the settled on rule is: `nan_like in arr` is True if nan_like is # arr.dtype.na_value and arr.isna().any() is True. Else the check returns False. - for this_data in [data, data_missing]: - scalar = this_data[~this_data.isna()][0] - - assert scalar in this_data - - na_value = this_data.dtype.na_value - - if this_data.isna().any(): - assert na_value in this_data - else: - assert na_value not in this_data - - # this_data can never contain other nan-likes than na_value - for na_value_type in {None, np.nan, pd.NA, pd.NaT}: - if na_value_type is na_value: - continue - assert na_value_type not in this_data + na_value = data.dtype.na_value + # ensure data without missing values + data = data[~data.isna()] + + # first elements are non-missing + assert data[0] in data + assert data_missing[0] in data_missing + + # check the presence of na_value + assert na_value in data_missing + assert na_value not in data + + # the data can never contain other nan-likes than na_value + for na_value_type in {None, np.nan, pd.NA, pd.NaT}: + if na_value_type is na_value: + continue + assert na_value_type not in data + assert na_value_type not in data_missing def test_memory_usage(self, data): s = pd.Series(data) From 6f633c7d0ee4b741f13ea8a4757d3ff9fa5dd884 Mon Sep 17 00:00:00 2001 From: tp Date: Sat, 28 Nov 2020 07:51:08 +0000 Subject: [PATCH 18/26] Allow for na values that are of same type as the data --- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/core/arrays/base.py | 11 +++++---- pandas/core/arrays/categorical.py | 8 +++++++ pandas/core/arrays/numpy_.py | 7 ++++++ pandas/tests/extension/arrow/test_bool.py | 4 ++++ pandas/tests/extension/decimal/array.py | 8 +++++++ pandas/tests/extension/test_categorical.py | 26 ++++++++++++++++++++++ 7 files changed, 61 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 49e2f7f5fc484..c78e7b510ad53 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -484,6 +484,7 @@ Deprecations - Deprecated :meth:`Index.asi8` for :class:`Index` subclasses other than :class:`.DatetimeIndex`, :class:`.TimedeltaIndex`, and :class:`PeriodIndex` (:issue:`37877`) - The ``inplace`` parameter of :meth:`Categorical.remove_unused_categories` is deprecated and will be removed in a future version (:issue:`37643`) - The ``null_counts`` parameter of :meth:`DataFrame.info` is deprecated and replaced by ``show_counts``. It will be removed in a future version (:issue:`37999`) +- Deprecated membership checks for nan-likes in :class:`Categorical`. In the future the membership check will only return True if the nan-like is ``nan`` or of the same dtype as the underlying categories (:issue:`37867`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 76bc0b7b3eaf6..76b7877b0ac70 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -362,10 +362,13 @@ def __contains__(self, item) -> bool: # GH37867 # comparisons of any item to pd.NA always return pd.NA, so e.g. "a" in [pd.NA] # would raise a TypeError. The implementation below works around that. - if item is self.dtype.na_value: - return self.isna().any() if self._can_hold_na else False - elif is_scalar(item) and isna(item): - return False + if is_scalar(item) and isna(item): + if not self._can_hold_na: + return False + elif item is self.dtype.na_value or isinstance(item, self.dtype.type): + return self.isna().any() + else: + return False else: return (item == self).any() diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 62e508c491740..0c4827137eb5a 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1745,8 +1745,16 @@ def __contains__(self, key) -> bool: """ Returns True if `key` is in this Categorical. """ + # in pandas 2.0, remove this method. + # if key is a NaN, check if any NaN is in self. if is_valid_nat_for_dtype(key, self.categories.dtype): + if key is not self.dtype.na_value and not isinstance(key, self.dtype.type): + warn(f"Membership check with {key} will return False in the future. " + f"Consider using {self.dtype.na_value} instead", + FutureWarning, + stacklevel=2, + ) return self.isna().any() return contains(self, key, container=self._codes) diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 0cdce1eabccc6..7997eaf88a8b5 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -51,6 +51,13 @@ def numpy_dtype(self) -> np.dtype: """ return self._dtype + @property + def na_value(self) -> object: + if issubclass(self.type, np.floating): + return self.type("nan") + else: + return super().na_value + @property def name(self) -> str: """ diff --git a/pandas/tests/extension/arrow/test_bool.py b/pandas/tests/extension/arrow/test_bool.py index 12426a0c92c55..b731859a761a4 100644 --- a/pandas/tests/extension/arrow/test_bool.py +++ b/pandas/tests/extension/arrow/test_bool.py @@ -50,6 +50,10 @@ def test_view(self, data): # __setitem__ does not work, so we only have a smoke-test data.view() + @pytest.mark.xfail(raises=AssertionError, reason="Not implemented yet") + def test_contains(self, data, data_missing): + super().test_contains(data, data_missing) + class TestConstructors(BaseArrowTests, base.BaseConstructorsTests): def test_from_dtype(self, data): diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index 9ede9c7fbd0fd..a713550dafa5c 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -155,6 +155,14 @@ def __setitem__(self, key, value): def __len__(self) -> int: return len(self._data) + def __contains__(self, item) -> bool: + if not isinstance(item, decimal.Decimal): + return False + elif item.is_nan(): + return self.isna().any() + else: + return super().__contains__(item) + @property def nbytes(self) -> int: n = len(self) diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index 95f338cbc3240..196911a566adc 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -87,6 +87,32 @@ def test_memory_usage(self, data): # Is this deliberate? super().test_memory_usage(data) + def test_contains(self, data, data_missing): + # GH-37867 + # na value handling in Categorical.__contains__ is deprecated. + # See base.BaseInterFaceTests.test_contains for more details. + + na_value = data.dtype.na_value + # ensure data without missing values + data = data[~data.isna()] + + # first elements are non-missing + assert data[0] in data + assert data_missing[0] in data_missing + + # check the presence of na_value + assert na_value in data_missing + assert na_value not in data + + # the data can never contain other nan-likes than na_value + for na_value_type in {None, np.nan, pd.NA, pd.NaT}: + if na_value_type is na_value: + continue + + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + assert na_value_type not in data + assert na_value_type in data_missing + class TestConstructors(base.BaseConstructorsTests): pass From d8bdb2e5bfc408d8f139caccb64df4a415655c00 Mon Sep 17 00:00:00 2001 From: tp Date: Sat, 28 Nov 2020 08:01:02 +0000 Subject: [PATCH 19/26] cleanups --- pandas/core/arrays/categorical.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 0c4827137eb5a..3448778cf2c3d 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1750,11 +1750,12 @@ def __contains__(self, key) -> bool: # if key is a NaN, check if any NaN is in self. if is_valid_nat_for_dtype(key, self.categories.dtype): if key is not self.dtype.na_value and not isinstance(key, self.dtype.type): - warn(f"Membership check with {key} will return False in the future. " - f"Consider using {self.dtype.na_value} instead", - FutureWarning, - stacklevel=2, - ) + warn( + f"Membership check with {key} will return False in the future. " + f"Consider using {self.dtype.na_value} instead", + FutureWarning, + stacklevel=2, + ) return self.isna().any() return contains(self, key, container=self._codes) From 4e4dbc4118611262084b51eaea89de1b04bc9c34 Mon Sep 17 00:00:00 2001 From: tp Date: Sat, 28 Nov 2020 08:54:27 +0000 Subject: [PATCH 20/26] Fixes --- doc/source/whatsnew/v1.2.0.rst | 1 - pandas/core/arrays/categorical.py | 7 ------- pandas/core/arrays/numpy_.py | 7 ------- pandas/tests/extension/test_categorical.py | 5 ++--- 4 files changed, 2 insertions(+), 18 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index c78e7b510ad53..49e2f7f5fc484 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -484,7 +484,6 @@ Deprecations - Deprecated :meth:`Index.asi8` for :class:`Index` subclasses other than :class:`.DatetimeIndex`, :class:`.TimedeltaIndex`, and :class:`PeriodIndex` (:issue:`37877`) - The ``inplace`` parameter of :meth:`Categorical.remove_unused_categories` is deprecated and will be removed in a future version (:issue:`37643`) - The ``null_counts`` parameter of :meth:`DataFrame.info` is deprecated and replaced by ``show_counts``. It will be removed in a future version (:issue:`37999`) -- Deprecated membership checks for nan-likes in :class:`Categorical`. In the future the membership check will only return True if the nan-like is ``nan`` or of the same dtype as the underlying categories (:issue:`37867`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 3448778cf2c3d..142fc2ed61fd1 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1749,13 +1749,6 @@ def __contains__(self, key) -> bool: # if key is a NaN, check if any NaN is in self. if is_valid_nat_for_dtype(key, self.categories.dtype): - if key is not self.dtype.na_value and not isinstance(key, self.dtype.type): - warn( - f"Membership check with {key} will return False in the future. " - f"Consider using {self.dtype.na_value} instead", - FutureWarning, - stacklevel=2, - ) return self.isna().any() return contains(self, key, container=self._codes) diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 7997eaf88a8b5..0cdce1eabccc6 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -51,13 +51,6 @@ def numpy_dtype(self) -> np.dtype: """ return self._dtype - @property - def na_value(self) -> object: - if issubclass(self.type, np.floating): - return self.type("nan") - else: - return super().na_value - @property def name(self) -> str: """ diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index 196911a566adc..971f2c4beddca 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -109,9 +109,8 @@ def test_contains(self, data, data_missing): if na_value_type is na_value: continue - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - assert na_value_type not in data - assert na_value_type in data_missing + assert na_value_type not in data + assert na_value_type in data_missing class TestConstructors(base.BaseConstructorsTests): From a1583e704e43e41d48c2cff6fc7c778ccabf02bf Mon Sep 17 00:00:00 2001 From: tp Date: Sat, 28 Nov 2020 18:05:10 +0000 Subject: [PATCH 21/26] remove text in categorical.py --- pandas/core/arrays/categorical.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 142fc2ed61fd1..62e508c491740 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1745,8 +1745,6 @@ def __contains__(self, key) -> bool: """ Returns True if `key` is in this Categorical. """ - # in pandas 2.0, remove this method. - # if key is a NaN, check if any NaN is in self. if is_valid_nat_for_dtype(key, self.categories.dtype): return self.isna().any() From 3c2c2b05c5ddbba29bc67dbfe2732695508b7c3a Mon Sep 17 00:00:00 2001 From: tp Date: Sat, 28 Nov 2020 18:12:10 +0000 Subject: [PATCH 22/26] doc fix --- doc/source/whatsnew/v1.2.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 49e2f7f5fc484..825c5367a8b37 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -752,7 +752,7 @@ ExtensionArray - Fixed bug when applying a NumPy ufunc with multiple outputs to an :class:`.IntegerArray` returning None (:issue:`36913`) - Fixed an inconsistency in :class:`.PeriodArray`'s ``__init__`` signature to those of :class:`.DatetimeArray` and :class:`.TimedeltaArray` (:issue:`37289`) - Reductions for :class:`.BooleanArray`, :class:`.Categorical`, :class:`.DatetimeArray`, :class:`.FloatingArray`, :class:`.IntegerArray`, :class:`.PeriodArray`, :class:`.TimedeltaArray`, and :class:`.PandasArray` are now keyword-only methods (:issue:`37541`) -- Fixed a bug where a ``ValueError`` was wrongly raised if a membership check was made on an ``ExtensionArray`` containing nan-like values (:issue:`37867`) +- Fixed a bug where a ``TypeError`` was wrongly raised if a membership check was made on an ``ExtensionArray`` containing nan-like values (:issue:`37867`) Other ^^^^^ From 237fe4541032af630ec4626cfb890a6d53d3c2b8 Mon Sep 17 00:00:00 2001 From: tp Date: Sat, 28 Nov 2020 18:21:14 +0000 Subject: [PATCH 23/26] add gh number --- pandas/tests/extension/json/test_json.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index ac065a9245dec..7cc75d658a026 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -143,7 +143,7 @@ def test_custom_asserts(self): with pytest.raises(AssertionError, match=msg): self.assert_frame_equal(a.to_frame(), b.to_frame()) - @pytest.mark.xfail(reason="comparison method not implemented on JSONArray") + @pytest.mark.xfail(reason="comparison method not implemented JSONArray (GH-37867)") def test_contains(self, data): # GH-37867 super().test_contains(data) From 37219c3dbfa2e4cec7b378650c4234f0f68d834a Mon Sep 17 00:00:00 2001 From: tp Date: Sat, 28 Nov 2020 18:22:34 +0000 Subject: [PATCH 24/26] linting --- pandas/tests/extension/json/test_json.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index 7cc75d658a026..3a5e49796c53b 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -143,7 +143,9 @@ def test_custom_asserts(self): with pytest.raises(AssertionError, match=msg): self.assert_frame_equal(a.to_frame(), b.to_frame()) - @pytest.mark.xfail(reason="comparison method not implemented JSONArray (GH-37867)") + @pytest.mark.xfail( + reason="comparison method not implemented for JSONArray (GH-37867)" + ) def test_contains(self, data): # GH-37867 super().test_contains(data) From c4a6c3661ca0d86449b6c0e267e0a99bc39f2e7e Mon Sep 17 00:00:00 2001 From: tp Date: Sat, 28 Nov 2020 20:13:41 +0000 Subject: [PATCH 25/26] clean tests --- .../arrays/categorical/test_operators.py | 20 ------------------- pandas/tests/arrays/string_/test_string.py | 16 --------------- 2 files changed, 36 deletions(-) diff --git a/pandas/tests/arrays/categorical/test_operators.py b/pandas/tests/arrays/categorical/test_operators.py index 216fd73be5775..328b5771e617c 100644 --- a/pandas/tests/arrays/categorical/test_operators.py +++ b/pandas/tests/arrays/categorical/test_operators.py @@ -395,23 +395,3 @@ def test_numeric_like_ops(self): msg = "Object with dtype category cannot perform the numpy op log" with pytest.raises(TypeError, match=msg): np.log(s) - - def test_contains(self, ordered): - # GH-37867 - cat = Categorical(["a", "b"], ordered=ordered) - assert "a" in cat - assert "x" not in cat - assert np.nan not in cat - assert pd.NA not in cat - - cat = Categorical([np.nan, "a"], ordered=ordered) - assert "a" in cat - assert "x" not in cat - assert np.nan in cat - assert pd.NA in cat - - cat = cat[::-1] - assert "a" in cat - assert "x" not in cat - assert np.nan in cat - assert pd.NA in cat diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index f987a153c5b0c..07e9484994c26 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -524,19 +524,3 @@ def test_to_numpy_na_value(dtype, nulls_fixture): result = arr.to_numpy(na_value=na_value) expected = np.array(["a", na_value, "b"], dtype=object) tm.assert_numpy_array_equal(result, expected) - - -def test_contains(): - # GH-37867 - arr = pd.array(["a", "b"], dtype="string") - - assert "a" in arr - assert "x" not in arr - assert np.nan not in arr - assert pd.NA not in arr - - arr = pd.array(["a", pd.NA], dtype="string") - assert "a" in arr - assert "x" not in arr - assert np.nan not in arr - assert pd.NA in arr From 245c99ae37b1fbeffaa1e27a14d26bf91f5fbdf9 Mon Sep 17 00:00:00 2001 From: tp Date: Sun, 29 Nov 2020 17:36:06 +0000 Subject: [PATCH 26/26] use nulls_fixture --- pandas/tests/extension/arrow/test_bool.py | 4 ++-- pandas/tests/extension/base/interface.py | 12 +++++------- pandas/tests/extension/test_categorical.py | 13 +++++-------- 3 files changed, 12 insertions(+), 17 deletions(-) diff --git a/pandas/tests/extension/arrow/test_bool.py b/pandas/tests/extension/arrow/test_bool.py index b731859a761a4..922b3b94c16c1 100644 --- a/pandas/tests/extension/arrow/test_bool.py +++ b/pandas/tests/extension/arrow/test_bool.py @@ -51,8 +51,8 @@ def test_view(self, data): data.view() @pytest.mark.xfail(raises=AssertionError, reason="Not implemented yet") - def test_contains(self, data, data_missing): - super().test_contains(data, data_missing) + def test_contains(self, data, data_missing, nulls_fixture): + super().test_contains(data, data_missing, nulls_fixture) class TestConstructors(BaseArrowTests, base.BaseConstructorsTests): diff --git a/pandas/tests/extension/base/interface.py b/pandas/tests/extension/base/interface.py index ced3d44e9a37d..d7997310dde3d 100644 --- a/pandas/tests/extension/base/interface.py +++ b/pandas/tests/extension/base/interface.py @@ -29,7 +29,7 @@ def test_can_hold_na_valid(self, data): # GH-20761 assert data._can_hold_na is True - def test_contains(self, data, data_missing): + def test_contains(self, data, data_missing, nulls_fixture): # GH-37867 # Tests for membership checks. Membership checks for nan-likes is tricky and # the settled on rule is: `nan_like in arr` is True if nan_like is @@ -47,12 +47,10 @@ def test_contains(self, data, data_missing): assert na_value in data_missing assert na_value not in data - # the data can never contain other nan-likes than na_value - for na_value_type in {None, np.nan, pd.NA, pd.NaT}: - if na_value_type is na_value: - continue - assert na_value_type not in data - assert na_value_type not in data_missing + if nulls_fixture is not na_value: + # the data can never contain other nan-likes than na_value + assert nulls_fixture not in data + assert nulls_fixture not in data_missing def test_memory_usage(self, data): s = pd.Series(data) diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index 971f2c4beddca..d03a9ab6b2588 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -87,7 +87,7 @@ def test_memory_usage(self, data): # Is this deliberate? super().test_memory_usage(data) - def test_contains(self, data, data_missing): + def test_contains(self, data, data_missing, nulls_fixture): # GH-37867 # na value handling in Categorical.__contains__ is deprecated. # See base.BaseInterFaceTests.test_contains for more details. @@ -104,13 +104,10 @@ def test_contains(self, data, data_missing): assert na_value in data_missing assert na_value not in data - # the data can never contain other nan-likes than na_value - for na_value_type in {None, np.nan, pd.NA, pd.NaT}: - if na_value_type is na_value: - continue - - assert na_value_type not in data - assert na_value_type in data_missing + # Categoricals can contain other nan-likes than na_value + if nulls_fixture is not na_value: + assert nulls_fixture not in data + assert nulls_fixture in data_missing # this line differs from super method class TestConstructors(base.BaseConstructorsTests):