From fc73f9058b74d29e87a80a417e36e5b74357b1e3 Mon Sep 17 00:00:00 2001 From: phofl Date: Mon, 8 Nov 2021 23:09:15 +0100 Subject: [PATCH 1/2] Fixed regression in Series.duplicated for categorical dtype with bool categories --- doc/source/whatsnew/v1.3.5.rst | 1 + pandas/core/algorithms.py | 2 +- pandas/tests/series/methods/test_duplicated.py | 18 +++++++++++++++++- 3 files changed, 19 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.3.5.rst b/doc/source/whatsnew/v1.3.5.rst index 589092c0dd7e3..d00c400b63b4f 100644 --- a/doc/source/whatsnew/v1.3.5.rst +++ b/doc/source/whatsnew/v1.3.5.rst @@ -16,6 +16,7 @@ Fixed regressions ~~~~~~~~~~~~~~~~~ - Fixed regression in :meth:`Series.equals` when comparing floats with dtype object to None (:issue:`44190`) - Fixed performance regression in :func:`read_csv` (:issue:`44106`) +- Fixed regression in :meth:`Series.duplicated` when Series has :class:`Categorical` dtype with boolean categories (:issue:`44351`) - .. --------------------------------------------------------------------------- diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index c1b587ce3a6b2..8c2c01b6aedc8 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -148,7 +148,7 @@ def _ensure_data(values: ArrayLike) -> np.ndarray: # i.e. all-bool Categorical, BooleanArray try: return np.asarray(values).astype("uint8", copy=False) - except TypeError: + except (TypeError, ValueError): # GH#42107 we have pd.NAs present return np.asarray(values) diff --git a/pandas/tests/series/methods/test_duplicated.py b/pandas/tests/series/methods/test_duplicated.py index 5cc297913e851..8e5f81d1326cb 100644 --- a/pandas/tests/series/methods/test_duplicated.py +++ b/pandas/tests/series/methods/test_duplicated.py @@ -1,7 +1,11 @@ import numpy as np import pytest -from pandas import Series +from pandas import ( + NA, + Categorical, + Series, +) import pandas._testing as tm @@ -33,3 +37,15 @@ def test_duplicated_nan_none(keep, expected): result = ser.duplicated(keep=keep) tm.assert_series_equal(result, expected) + + +def test_duplicated_categorical_bool(): + # GH#44351 + ser = Series( + Categorical( + [True, False, True, False, NA], categories=[True, False], ordered=True + ) + ) + result = ser.duplicated() + expected = Series([False, False, True, True, False]) + tm.assert_series_equal(result, expected) From eeba436eab7eef433c3baedda30c1d0be6073790 Mon Sep 17 00:00:00 2001 From: phofl Date: Tue, 9 Nov 2021 20:50:29 +0100 Subject: [PATCH 2/2] Add test --- doc/source/whatsnew/v1.3.5.rst | 2 +- .../tests/series/methods/test_drop_duplicates.py | 15 +++++++++++++++ pandas/tests/series/methods/test_duplicated.py | 2 +- 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.3.5.rst b/doc/source/whatsnew/v1.3.5.rst index d00c400b63b4f..951b05b65c81b 100644 --- a/doc/source/whatsnew/v1.3.5.rst +++ b/doc/source/whatsnew/v1.3.5.rst @@ -16,7 +16,7 @@ Fixed regressions ~~~~~~~~~~~~~~~~~ - Fixed regression in :meth:`Series.equals` when comparing floats with dtype object to None (:issue:`44190`) - Fixed performance regression in :func:`read_csv` (:issue:`44106`) -- Fixed regression in :meth:`Series.duplicated` when Series has :class:`Categorical` dtype with boolean categories (:issue:`44351`) +- Fixed regression in :meth:`Series.duplicated` and :meth:`Series.drop_duplicates` when Series has :class:`Categorical` dtype with boolean categories (:issue:`44351`) - .. --------------------------------------------------------------------------- diff --git a/pandas/tests/series/methods/test_drop_duplicates.py b/pandas/tests/series/methods/test_drop_duplicates.py index 7eb51f8037792..f72d85337df8e 100644 --- a/pandas/tests/series/methods/test_drop_duplicates.py +++ b/pandas/tests/series/methods/test_drop_duplicates.py @@ -2,6 +2,7 @@ import pytest from pandas import ( + NA, Categorical, Series, ) @@ -224,6 +225,20 @@ def test_drop_duplicates_categorical_bool(self, ordered): assert return_value is None tm.assert_series_equal(sc, tc[~expected]) + def test_drop_duplicates_categorical_bool_na(self): + # GH#44351 + ser = Series( + Categorical( + [True, False, True, False, NA], categories=[True, False], ordered=True + ) + ) + result = ser.drop_duplicates() + expected = Series( + Categorical([True, False, np.nan], categories=[True, False], ordered=True), + index=[0, 1, 4], + ) + tm.assert_series_equal(result, expected) + def test_drop_duplicates_pos_args_deprecation(): # GH#41485 diff --git a/pandas/tests/series/methods/test_duplicated.py b/pandas/tests/series/methods/test_duplicated.py index 8e5f81d1326cb..c61492168da63 100644 --- a/pandas/tests/series/methods/test_duplicated.py +++ b/pandas/tests/series/methods/test_duplicated.py @@ -39,7 +39,7 @@ def test_duplicated_nan_none(keep, expected): tm.assert_series_equal(result, expected) -def test_duplicated_categorical_bool(): +def test_duplicated_categorical_bool_na(): # GH#44351 ser = Series( Categorical(