From 466a9f30c65832bea077daa64ee036b562357209 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Fri, 9 Feb 2024 20:03:40 -0500 Subject: [PATCH 1/2] fix CategoricalIndex.difference regression --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/core/indexes/base.py | 8 ++++++-- .../tests/indexes/categorical/test_setops.py | 19 +++++++++++++++++++ 3 files changed, 26 insertions(+), 2 deletions(-) create mode 100644 pandas/tests/indexes/categorical/test_setops.py diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 69e14a9028dd3..335dada439029 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -20,6 +20,7 @@ Fixed regressions - Fixed regression in :func:`wide_to_long` raising an ``AttributeError`` for string columns (:issue:`57066`) - Fixed regression in :meth:`.DataFrameGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.SeriesGroupBy.idxmin`, :meth:`.SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`) - Fixed regression in :meth:`.DataFrameGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.SeriesGroupBy.idxmin`, :meth:`.SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`) +- Fixed regression in :meth:`CategoricalIndex.difference` raising ``KeyError`` when other contains null values other than NaN (:issue:`57318`) - Fixed regression in :meth:`DataFrame.loc` raising ``IndexError`` for non-unique, masked dtype indexes where result has more than 10,000 rows (:issue:`57027`) - Fixed regression in :meth:`DataFrame.sort_index` not producing a stable sort for a index with duplicates (:issue:`57151`) - Fixed regression in :meth:`DataFrame.to_dict` with ``orient='list'`` and datetime or timedelta types returning integers (:issue:`54824`) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 9f0dac05c9355..b00d444114ace 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3272,9 +3272,13 @@ def difference(self, other, sort=None): def _difference(self, other, sort): # overridden by RangeIndex + if isinstance(self, ABCCategoricalIndex) and self.hasnans and other.hasnans: + this = self.dropna() + else: + this = self other = other.unique() - the_diff = self[other.get_indexer_for(self) == -1] - the_diff = the_diff if self.is_unique else the_diff.unique() + the_diff = this[other.get_indexer_for(this) == -1] + the_diff = the_diff if this.is_unique else the_diff.unique() the_diff = _maybe_try_sort(the_diff, sort) return the_diff diff --git a/pandas/tests/indexes/categorical/test_setops.py b/pandas/tests/indexes/categorical/test_setops.py new file mode 100644 index 0000000000000..3817001940926 --- /dev/null +++ b/pandas/tests/indexes/categorical/test_setops.py @@ -0,0 +1,19 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + CategoricalIndex, + Index, +) +import pandas._testing as tm + + +@pytest.mark.parametrize("na_value", [None, np.nan, pd.NA]) +def test_difference_with_na(na_value): + # GH 57318 + ci = CategoricalIndex(["a", "b", "c", None]) + other = Index(["c", na_value]) + result = ci.difference(other) + expected = CategoricalIndex(["a", "b"], categories=["a", "b", "c"]) + tm.assert_index_equal(result, expected) From 366c97b98dd5ed4e983d4fa72ce9ade1ca355dc9 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Fri, 9 Feb 2024 20:58:17 -0500 Subject: [PATCH 2/2] fix --- pandas/core/indexes/base.py | 5 ++--- pandas/tests/indexes/categorical/test_setops.py | 3 +-- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index b00d444114ace..e2b3666ea9d85 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3272,10 +3272,9 @@ def difference(self, other, sort=None): def _difference(self, other, sort): # overridden by RangeIndex + this = self if isinstance(self, ABCCategoricalIndex) and self.hasnans and other.hasnans: - this = self.dropna() - else: - this = self + this = this.dropna() other = other.unique() the_diff = this[other.get_indexer_for(this) == -1] the_diff = the_diff if this.is_unique else the_diff.unique() diff --git a/pandas/tests/indexes/categorical/test_setops.py b/pandas/tests/indexes/categorical/test_setops.py index 3817001940926..2e87b90efd54c 100644 --- a/pandas/tests/indexes/categorical/test_setops.py +++ b/pandas/tests/indexes/categorical/test_setops.py @@ -1,7 +1,6 @@ import numpy as np import pytest -import pandas as pd from pandas import ( CategoricalIndex, Index, @@ -9,7 +8,7 @@ import pandas._testing as tm -@pytest.mark.parametrize("na_value", [None, np.nan, pd.NA]) +@pytest.mark.parametrize("na_value", [None, np.nan]) def test_difference_with_na(na_value): # GH 57318 ci = CategoricalIndex(["a", "b", "c", None])