pandas-dev · mroeschke · Nov 7, 2022 · Nov 1, 2022 · Nov 1, 2022 · Nov 1, 2022
diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
@@ -534,6 +534,7 @@ Missing
 
 MultiIndex
 ^^^^^^^^^^
+- Bug in :meth:`MultiIndex.get_indexer` not matching ``NaN`` values (:issue:`37222`)
 - Bug in :meth:`MultiIndex.argsort` raising ``TypeError`` when index contains :attr:`NA` (:issue:`48495`)
 - Bug in :meth:`MultiIndex.difference` losing extension array dtype (:issue:`48606`)
 - Bug in :class:`MultiIndex.set_levels` raising ``IndexError`` when setting empty level (:issue:`48636`)

@@ -36,6 +36,9 @@ from pandas._libs.missing cimport (
     is_matching_na,
 )
 
+# Shift added to MultiIndex codes so missing values (-1) become positive
+multiindex_nulls_shift = 2
+
 
 cdef inline bint is_definitely_invalid_key(object val):
     try:
@@ -648,10 +651,13 @@ cdef class BaseMultiIndexCodesEngine:
         self.levels = levels
         self.offsets = offsets
 
-        # Transform labels in a single array, and add 1 so that we are working
-        # with positive integers (-1 for NaN becomes 0):
-        codes = (np.array(labels, dtype='int64').T + 1).astype('uint64',
-                                                               copy=False)
+        # Transform labels into a single array, and add 2 so that we are working
+        # with positive integers (-1 for NaN becomes 1). This lets us distinguish
+        # NaNs that match from values that are absent from the other index;
+        # codes for values that are not found are set to 0 later:
+        labels_arr = np.array(labels, dtype='int64').T + multiindex_nulls_shift
+        codes = labels_arr.astype('uint64', copy=False)
+        self.level_has_nans = [-1 in lab for lab in labels]
 
         # Map each codes combination in the index to an integer unambiguously
         # (no collisions possible), based on the "offsets", which describe the
@@ -680,8 +686,13 @@ cdef class BaseMultiIndexCodesEngine:
             Integers representing one combination each
         """
         zt = [target._get_level_values(i) for i in range(target.nlevels)]
-        level_codes = [lev.get_indexer_for(codes) + 1 for lev, codes
-                       in zip(self.levels, zt)]
+        level_codes = []
+        for i, (lev, codes) in enumerate(zip(self.levels, zt)):
+            result = lev.get_indexer_for(codes) + 1
+            result[result > 0] += 1
+            if self.level_has_nans[i] and codes.hasnans:
+                result[codes.isna()] += 1
+            level_codes.append(result)
         return self._codes_to_ints(np.array(level_codes, dtype='uint64').T)
 
     def get_indexer(self, target: np.ndarray) -> np.ndarray:
@@ -792,7 +803,7 @@ cdef class BaseMultiIndexCodesEngine:
         if not isinstance(key, tuple):
             raise KeyError(key)
         try:
-            indices = [0 if checknull(v) else lev.get_loc(v) + 1
+            indices = [1 if checknull(v) else lev.get_loc(v) + multiindex_nulls_shift
                        for lev, v in zip(self.levels, key)]
         except KeyError:
             raise KeyError(key)

diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
@@ -1087,8 +1087,18 @@ def set_codes(self, codes, *, level=None, verify_integrity: bool = True):
     @cache_readonly
     def _engine(self):
         # Calculate the number of bits needed to represent labels in each
-        # level, as log2 of their sizes (including -1 for NaN):
-        sizes = np.ceil(np.log2([len(level) + 1 for level in self.levels]))
+        # level, as log2 of their sizes:
+        # NaN values are shifted to 1, and target values that are not found
+        # while calculating the indexer are mapped to 0
+        sizes = np.ceil(
+            np.log2(
+                [
+                    len(level)
+                    + libindex.multiindex_nulls_shift  # type: ignore[attr-defined]
+                    for level in self.levels
+                ]
+            )
+        )
 
         # Sum bit counts, starting from the _right_....
         lev_bits = np.cumsum(sizes[::-1])[::-1]

diff --git a/pandas/tests/indexes/multi/test_drop.py b/pandas/tests/indexes/multi/test_drop.py
@@ -32,16 +32,16 @@ def test_drop(idx):
     tm.assert_index_equal(dropped, expected)
 
     index = MultiIndex.from_tuples([("bar", "two")])
-    with pytest.raises(KeyError, match=r"^10$"):
+    with pytest.raises(KeyError, match=r"^15$"):
         idx.drop([("bar", "two")])
-    with pytest.raises(KeyError, match=r"^10$"):
+    with pytest.raises(KeyError, match=r"^15$"):
         idx.drop(index)
     with pytest.raises(KeyError, match=r"^'two'$"):
         idx.drop(["foo", "two"])
 
     # partially correct argument
     mixed_index = MultiIndex.from_tuples([("qux", "one"), ("bar", "two")])
-    with pytest.raises(KeyError, match=r"^10$"):
+    with pytest.raises(KeyError, match=r"^15$"):
         idx.drop(mixed_index)
 
     # error='ignore'

diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py
@@ -471,6 +471,16 @@ def test_get_indexer_kwarg_validation(self):
         with pytest.raises(ValueError, match=msg):
             mi.get_indexer(mi[:-1], tolerance="piano")
 
+    def test_get_indexer_nan(self):
+        # GH#37222
+        idx1 = MultiIndex.from_product([["A"], [1.0, 2.0]], names=["id1", "id2"])
+        idx2 = MultiIndex.from_product([["A"], [np.nan, 2.0]], names=["id1", "id2"])
+        expected = np.array([-1, 1])
+        result = idx2.get_indexer(idx1)
+        tm.assert_numpy_array_equal(result, expected, check_dtype=False)
+        result = idx1.get_indexer(idx2)
+        tm.assert_numpy_array_equal(result, expected, check_dtype=False)
+
 
 def test_getitem(idx):
     # scalar
@@ -527,7 +537,7 @@ class TestGetLoc:
     def test_get_loc(self, idx):
         assert idx.get_loc(("foo", "two")) == 1
         assert idx.get_loc(("baz", "two")) == 3
-        with pytest.raises(KeyError, match=r"^10$"):
+        with pytest.raises(KeyError, match=r"^15$"):
             idx.get_loc(("bar", "two"))
         with pytest.raises(KeyError, match=r"^'quux'$"):
             idx.get_loc("quux")

diff --git a/pandas/tests/indexes/multi/test_setops.py b/pandas/tests/indexes/multi/test_setops.py
@@ -659,9 +659,8 @@ def test_union_keep_ea_dtype_with_na(any_numeric_ea_dtype):
     midx = MultiIndex.from_arrays([arr1, [2, 1]], names=["a", None])
     midx2 = MultiIndex.from_arrays([arr2, [1, 2]])
     result = midx.union(midx2)
-    # Expected is actually off and should contain (1, 1) too. See GH#37222
     expected = MultiIndex.from_arrays(
-        [Series([4, pd.NA, pd.NA], dtype=any_numeric_ea_dtype), [2, 1, 2]]
+        [Series([1, 4, pd.NA, pd.NA], dtype=any_numeric_ea_dtype), [1, 2, 1, 2]]
     )
     tm.assert_index_equal(result, expected)
 

diff --git a/pandas/tests/indexing/multiindex/test_multiindex.py b/pandas/tests/indexing/multiindex/test_multiindex.py
@@ -162,10 +162,10 @@ def test_rename_multiindex_with_duplicates(self):
                     [1, 2],
                 ],
                 [
-                    [(81.0, np.nan), (np.nan, np.nan)],
-                    [(81.0, np.nan), (np.nan, np.nan)],
-                    [1, 2],
-                    [1, 1],
+                    [[81, 82.0, np.nan], Series([np.nan, np.nan, np.nan])],
+                    [[81, 82.0, np.nan], Series([np.nan, np.nan, np.nan])],
+                    [1, np.nan, 2],
+                    [np.nan, 2, 1],
                 ],
             ),
             (
@@ -176,8 +176,8 @@ def test_rename_multiindex_with_duplicates(self):
                     [1, 2],
                 ],
                 [
-                    [(81.0, np.nan), (np.nan, np.nan)],
-                    [(81.0, np.nan), (np.nan, np.nan)],
+                    [[81.0, np.nan], Series([np.nan, np.nan])],
+                    [[81.0, np.nan], Series([np.nan, np.nan])],
                     [1, 2],
                     [2, 1],
                 ],
@@ -188,28 +188,17 @@ def test_subtracting_two_series_with_unordered_index_and_all_nan_index(
         self, data_result, data_expected
     ):
         # GH 38439
+        # TODO(GH#49443): Refactor; this test setup is very hard to follow
         a_index_result = MultiIndex.from_tuples(data_result[0])
         b_index_result = MultiIndex.from_tuples(data_result[1])
         a_series_result = Series(data_result[2], index=a_index_result)
         b_series_result = Series(data_result[3], index=b_index_result)
         result = a_series_result.align(b_series_result)
 
-        a_index_expected = MultiIndex.from_tuples(data_expected[0])
-        b_index_expected = MultiIndex.from_tuples(data_expected[1])
+        a_index_expected = MultiIndex.from_arrays(data_expected[0])
+        b_index_expected = MultiIndex.from_arrays(data_expected[1])
         a_series_expected = Series(data_expected[2], index=a_index_expected)
         b_series_expected = Series(data_expected[3], index=b_index_expected)
-        a_series_expected.index = a_series_expected.index.set_levels(
-            [
-                a_series_expected.index.levels[0].astype("float"),
-                a_series_expected.index.levels[1].astype("float"),
-            ]
-        )
-        b_series_expected.index = b_series_expected.index.set_levels(
-            [
-                b_series_expected.index.levels[0].astype("float"),
-                b_series_expected.index.levels[1].astype("float"),
-            ]
-        )
 
         tm.assert_series_equal(result[0], a_series_expected)
         tm.assert_series_equal(result[1], b_series_expected)