Skip to content

BUG: MultiIndex.get_indexer not matching nan values #49442

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Nov 7, 2022
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -534,6 +534,7 @@ Missing

MultiIndex
^^^^^^^^^^
- Bug in :meth:`MultiIndex.get_indexer` not matching ``NaN`` values (:issue:`37222`)
- Bug in :meth:`MultiIndex.argsort` raising ``TypeError`` when index contains :attr:`NA` (:issue:`48495`)
- Bug in :meth:`MultiIndex.difference` losing extension array dtype (:issue:`48606`)
- Bug in :meth:`MultiIndex.set_levels` raising ``IndexError`` when setting empty level (:issue:`48636`)
Expand Down
25 changes: 18 additions & 7 deletions pandas/_libs/index.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@ from pandas._libs.missing cimport (
is_matching_na,
)

# Defines shift of MultiIndex codes to avoid negative codes (missing values)
multiindex_nulls_shift = 2
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

comment about what this is

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done



cdef inline bint is_definitely_invalid_key(object val):
try:
Expand Down Expand Up @@ -648,10 +651,13 @@ cdef class BaseMultiIndexCodesEngine:
self.levels = levels
self.offsets = offsets

# Transform labels in a single array, and add 1 so that we are working
# with positive integers (-1 for NaN becomes 0):
codes = (np.array(labels, dtype='int64').T + 1).astype('uint64',
copy=False)
# Transform labels in a single array, and add 2 so that we are working
# with positive integers (-1 for NaN becomes 1). This lets us distinguish
# target values that are absent from this index from NaNs that match a
# NaN in this index. Values that are not found are set to 0 later:
labels_arr = np.array(labels, dtype='int64').T + multiindex_nulls_shift
codes = labels_arr.astype('uint64', copy=False)
self.level_has_nans = [-1 in lab for lab in labels]

# Map each codes combination in the index to an integer unambiguously
# (no collisions possible), based on the "offsets", which describe the
Expand Down Expand Up @@ -680,8 +686,13 @@ cdef class BaseMultiIndexCodesEngine:
Integers representing one combination each
"""
zt = [target._get_level_values(i) for i in range(target.nlevels)]
level_codes = [lev.get_indexer_for(codes) + 1 for lev, codes
in zip(self.levels, zt)]
level_codes = []
for i, (lev, codes) in enumerate(zip(self.levels, zt)):
result = lev.get_indexer_for(codes) + 1
result[result > 0] += 1
if self.level_has_nans[i] and codes.hasnans:
result[codes.isna()] += 1
level_codes.append(result)
return self._codes_to_ints(np.array(level_codes, dtype='uint64').T)

def get_indexer(self, target: np.ndarray) -> np.ndarray:
Expand Down Expand Up @@ -792,7 +803,7 @@ cdef class BaseMultiIndexCodesEngine:
if not isinstance(key, tuple):
raise KeyError(key)
try:
indices = [0 if checknull(v) else lev.get_loc(v) + 1
indices = [1 if checknull(v) else lev.get_loc(v) + multiindex_nulls_shift
for lev, v in zip(self.levels, key)]
except KeyError:
raise KeyError(key)
Expand Down
14 changes: 12 additions & 2 deletions pandas/core/indexes/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -1087,8 +1087,18 @@ def set_codes(self, codes, *, level=None, verify_integrity: bool = True):
@cache_readonly
def _engine(self):
# Calculate the number of bits needed to represent labels in each
# level, as log2 of their sizes (including -1 for NaN):
sizes = np.ceil(np.log2([len(level) + 1 for level in self.levels]))
# level, as log2 of their sizes:
# NaN values are shifted to 1, and values of the other index that are
# not found while calculating the indexer are shifted to 0
sizes = np.ceil(
np.log2(
[
len(level)
+ libindex.multiindex_nulls_shift # type: ignore[attr-defined]
for level in self.levels
]
)
)

# Sum bit counts, starting from the _right_....
lev_bits = np.cumsum(sizes[::-1])[::-1]
Expand Down
6 changes: 3 additions & 3 deletions pandas/tests/indexes/multi/test_drop.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,16 +32,16 @@ def test_drop(idx):
tm.assert_index_equal(dropped, expected)

index = MultiIndex.from_tuples([("bar", "two")])
with pytest.raises(KeyError, match=r"^10$"):
with pytest.raises(KeyError, match=r"^15$"):
idx.drop([("bar", "two")])
with pytest.raises(KeyError, match=r"^10$"):
with pytest.raises(KeyError, match=r"^15$"):
idx.drop(index)
with pytest.raises(KeyError, match=r"^'two'$"):
idx.drop(["foo", "two"])

# partially correct argument
mixed_index = MultiIndex.from_tuples([("qux", "one"), ("bar", "two")])
with pytest.raises(KeyError, match=r"^10$"):
with pytest.raises(KeyError, match=r"^15$"):
idx.drop(mixed_index)

# error='ignore'
Expand Down
12 changes: 11 additions & 1 deletion pandas/tests/indexes/multi/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -471,6 +471,16 @@ def test_get_indexer_kwarg_validation(self):
with pytest.raises(ValueError, match=msg):
mi.get_indexer(mi[:-1], tolerance="piano")

def test_get_indexer_nan(self):
    """Check ``get_indexer`` between an index with NaN and one without.

    GH#37222: the two indexes share only the ``("A", 2.0)`` entry, so in
    both lookup directions the first position is reported as not found
    (-1) and the second position maps to 1.
    """
    level_names = ["id1", "id2"]
    without_nan = MultiIndex.from_product([["A"], [1.0, 2.0]], names=level_names)
    with_nan = MultiIndex.from_product([["A"], [np.nan, 2.0]], names=level_names)
    expected = np.array([-1, 1])

    # Same expectation whichever index does the lookup.
    for index, target in ((with_nan, without_nan), (without_nan, with_nan)):
        result = index.get_indexer(target)
        tm.assert_numpy_array_equal(result, expected, check_dtype=False)


def test_getitem(idx):
# scalar
Expand Down Expand Up @@ -527,7 +537,7 @@ class TestGetLoc:
def test_get_loc(self, idx):
assert idx.get_loc(("foo", "two")) == 1
assert idx.get_loc(("baz", "two")) == 3
with pytest.raises(KeyError, match=r"^10$"):
with pytest.raises(KeyError, match=r"^15$"):
idx.get_loc(("bar", "two"))
with pytest.raises(KeyError, match=r"^'quux'$"):
idx.get_loc("quux")
Expand Down
3 changes: 1 addition & 2 deletions pandas/tests/indexes/multi/test_setops.py
Original file line number Diff line number Diff line change
Expand Up @@ -659,9 +659,8 @@ def test_union_keep_ea_dtype_with_na(any_numeric_ea_dtype):
midx = MultiIndex.from_arrays([arr1, [2, 1]], names=["a", None])
midx2 = MultiIndex.from_arrays([arr2, [1, 2]])
result = midx.union(midx2)
# Expected is actually off and should contain (1, 1) too. See GH#37222
expected = MultiIndex.from_arrays(
[Series([4, pd.NA, pd.NA], dtype=any_numeric_ea_dtype), [2, 1, 2]]
[Series([1, 4, pd.NA, pd.NA], dtype=any_numeric_ea_dtype), [1, 2, 1, 2]]
)
tm.assert_index_equal(result, expected)

Expand Down
29 changes: 9 additions & 20 deletions pandas/tests/indexing/multiindex/test_multiindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,10 +162,10 @@ def test_rename_multiindex_with_duplicates(self):
[1, 2],
],
[
[(81.0, np.nan), (np.nan, np.nan)],
[(81.0, np.nan), (np.nan, np.nan)],
[1, 2],
[1, 1],
[[81, 82.0, np.nan], Series([np.nan, np.nan, np.nan])],
[[81, 82.0, np.nan], Series([np.nan, np.nan, np.nan])],
[1, np.nan, 2],
[np.nan, 2, 1],
],
),
(
Expand All @@ -176,8 +176,8 @@ def test_rename_multiindex_with_duplicates(self):
[1, 2],
],
[
[(81.0, np.nan), (np.nan, np.nan)],
[(81.0, np.nan), (np.nan, np.nan)],
[[81.0, np.nan], Series([np.nan, np.nan])],
[[81.0, np.nan], Series([np.nan, np.nan])],
[1, 2],
[2, 1],
],
Expand All @@ -188,28 +188,17 @@ def test_subtracting_two_series_with_unordered_index_and_all_nan_index(
self, data_result, data_expected
):
# GH 38439
# TODO: Refactor. This is impossible to understand GH#49443
a_index_result = MultiIndex.from_tuples(data_result[0])
b_index_result = MultiIndex.from_tuples(data_result[1])
a_series_result = Series(data_result[2], index=a_index_result)
b_series_result = Series(data_result[3], index=b_index_result)
result = a_series_result.align(b_series_result)

a_index_expected = MultiIndex.from_tuples(data_expected[0])
b_index_expected = MultiIndex.from_tuples(data_expected[1])
a_index_expected = MultiIndex.from_arrays(data_expected[0])
b_index_expected = MultiIndex.from_arrays(data_expected[1])
a_series_expected = Series(data_expected[2], index=a_index_expected)
b_series_expected = Series(data_expected[3], index=b_index_expected)
a_series_expected.index = a_series_expected.index.set_levels(
[
a_series_expected.index.levels[0].astype("float"),
a_series_expected.index.levels[1].astype("float"),
]
)
b_series_expected.index = b_series_expected.index.set_levels(
[
b_series_expected.index.levels[0].astype("float"),
b_series_expected.index.levels[1].astype("float"),
]
)

tm.assert_series_equal(result[0], a_series_expected)
tm.assert_series_equal(result[1], b_series_expected)
Expand Down