Skip to content

REF: codes-based MultiIndex engine #19074

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Jan 28, 2018
7 changes: 6 additions & 1 deletion doc/source/whatsnew/v0.23.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -380,6 +380,7 @@ Performance Improvements
- Improved performance of ``DatetimeIndex`` and ``Series`` arithmetic operations with Business-Month and Business-Quarter frequencies (:issue:`18489`)
- :func:`Series` / :func:`DataFrame` tab completion limits to 100 values, for better performance. (:issue:`18587`)
- Improved performance of :func:`DataFrame.median` with ``axis=1`` when bottleneck is not installed (:issue:`16468`)
- Improved performance of :func:`MultiIndex.get_loc` for large indexes, at the cost of a reduction in performance for small ones (:issue:`18519`)


.. _whatsnew_0230.docs:
Expand Down Expand Up @@ -476,7 +477,11 @@ MultiIndex
- Bug in :func:`MultiIndex.get_level_values` which would return an invalid index on level of ints with missing values (:issue:`17924`)
- Bug in :func:`MultiIndex.remove_unused_levels` which would fill nan values (:issue:`18417`)
- Bug in :func:`MultiIndex.from_tuples` which would fail to take zipped tuples in python3 (:issue:`18434`)
-
- Bug in :func:`MultiIndex.get_loc` which would fail to automatically cast values between float and int (:issue:`18818`, :issue:`15994`)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i count 5 issues here, but 6 in the top of the PR?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Already replied twice

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

and I still want it listed appropriately.

Copy link
Member Author

@toobaz toobaz Jan 26, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, but it's not a bug, where shall I list it?

- Bug in :func:`MultiIndex.get_loc` which would cast boolean to integer labels (:issue:`19086`)
- Bug in :func:`MultiIndex.get_loc` which would fail to locate keys containing ``NaN`` (:issue:`18485`)
- Bug in :func:`MultiIndex.get_loc` in large :class:`MultiIndex`, would fail when levels had different dtypes (:issue:`18520`)


I/O
^^^
Expand Down
9 changes: 0 additions & 9 deletions pandas/_libs/hashtable.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -31,15 +31,6 @@ cdef class PyObjectHashTable(HashTable):
cpdef get_item(self, object val)
cpdef set_item(self, object key, Py_ssize_t val)

cdef class MultiIndexHashTable(HashTable):
    # Hash table keyed by the uint64 hash of each MultiIndex entry,
    # mapping it to the entry's integer location in the index.
    cdef:
        kh_uint64_t *table  # khash uint64 -> location table
        object mi           # the MultiIndex being indexed (set in map_locations)

    cpdef get_item(self, object val)
    cpdef set_item(self, object key, Py_ssize_t val)
    # single-key collision check; see _check_for_collisions for the vector form
    cdef inline void _check_for_collision(self, Py_ssize_t loc, object label)


cdef class StringHashTable(HashTable):
cdef kh_str_t *table
Expand Down
136 changes: 0 additions & 136 deletions pandas/_libs/hashtable_class_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -899,139 +899,3 @@ cdef class PyObjectHashTable(HashTable):
count += 1

return np.asarray(labels)


cdef class MultiIndexHashTable(HashTable):
    """
    Hash table for a MultiIndex.

    Stores the uint64 hash of each index entry (as produced by the
    MultiIndex's own hashing, e.g. ``mi._hashed_values``) in a khash
    uint64 table, mapping it to the entry's integer location. Because
    only hashes are stored, every lookup is verified against the actual
    labels to detect hash collisions.
    """

    def __init__(self, size_hint=1):
        # Allocate the khash table up front; the MultiIndex itself is
        # attached later, in map_locations.
        self.table = kh_init_uint64()
        self.mi = None
        kh_resize_uint64(self.table, size_hint)

    def __dealloc__(self):
        # Free the C-allocated khash table exactly once.
        if self.table is not NULL:
            kh_destroy_uint64(self.table)
            self.table = NULL

    def __len__(self):
        # Number of occupied entries in the underlying khash table.
        return self.table.size

    def sizeof(self, deep=False):
        """ return the size of my table in bytes """
        return self.table.n_buckets * (sizeof(uint64_t) + # keys
                                       sizeof(size_t) + # vals
                                       sizeof(uint32_t)) # flags

    def _check_for_collisions(self, int64_t[:] locs, object mi):
        # validate that the locs map to the actual values
        # provided in the mi
        # we can only check if we *don't* have any missing values
        # :<
        cdef:
            ndarray[int64_t] alocs

        alocs = np.asarray(locs)
        if (alocs != -1).all():
            # all keys were found; re-fetch them from self.mi and compare
            # label-by-label with what the caller asked for
            result = self.mi.take(locs)
            if isinstance(mi, tuple):
                from pandas import Index
                mi = Index([mi])
            if not result.equals(mi):
                raise AssertionError(
                    "hash collision\nlocs:\n{}\n"
                    "result:\n{}\nmi:\n{}".format(alocs, result, mi))

    cdef inline void _check_for_collision(self, Py_ssize_t loc, object label):
        # validate that the loc maps to the actual value
        # version of _check_for_collisions above for single label (tuple)

        result = self.mi[loc]

        # null datetimelike values (e.g. NaT) hash consistently but do not
        # compare equal, so a pair of nulls is treated as a match
        if not all(l == r or (is_null_datetimelike(l)
                              and is_null_datetimelike(r))
                   for l, r in zip(result, label)):
            raise AssertionError(
                "hash collision\nloc:\n{}\n"
                "result:\n{}\nmi:\n{}".format(loc, result, label))

    def __contains__(self, object key):
        # EAFP: membership is just "get_item does not raise";
        # ValueError/TypeError cover unhashable or malformed keys
        try:
            self.get_item(key)
            return True
        except (KeyError, ValueError, TypeError):
            return False

    cpdef get_item(self, object key):
        """
        Return the integer location of `key` (a tuple of labels).

        Raises KeyError if the key's hash is not in the table; raises
        AssertionError on a verified hash collision.
        """
        cdef:
            khiter_t k
            uint64_t value
            int64_t[:] locs
            Py_ssize_t loc

        value = self.mi._hashed_indexing_key(key)
        k = kh_get_uint64(self.table, value)
        if k != self.table.n_buckets:
            # hash found; confirm the stored location really holds `key`
            loc = self.table.vals[k]
            self._check_for_collision(loc, key)
            return loc
        else:
            raise KeyError(key)

    cpdef set_item(self, object key, Py_ssize_t val):
        # single-item insertion is not supported; use map_locations
        raise NotImplementedError

    @cython.boundscheck(False)
    def map_locations(self, object mi):
        """
        Populate the table from MultiIndex `mi`: hash of entry -> position.

        NOTE(review): for duplicate hashes the later position wins —
        presumably callers only use this on unique indexes; confirm.
        """
        cdef:
            Py_ssize_t i, n
            ndarray[uint64_t] values
            uint64_t val
            int ret = 0
            khiter_t k

        self.mi = mi
        n = len(mi)
        values = mi._hashed_values

        with nogil:
            for i in range(n):
                val = values[i]
                k = kh_put_uint64(self.table, val, &ret)
                self.table.vals[k] = i

    @cython.boundscheck(False)
    def lookup(self, object mi):
        # look up with a target mi
        # Returns an int64 array of locations, -1 where a key is missing.
        cdef:
            Py_ssize_t i, n
            ndarray[uint64_t] values
            int ret = 0
            uint64_t val
            khiter_t k
            int64_t[:] locs

        n = len(mi)
        values = mi._hashed_values

        locs = np.empty(n, dtype=np.int64)

        with nogil:
            for i in range(n):
                val = values[i]
                k = kh_get_uint64(self.table, val)
                if k != self.table.n_buckets:
                    locs[i] = self.table.vals[k]
                else:
                    locs[i] = -1

        # verify hashes actually matched the requested labels
        self._check_for_collisions(locs, mi)
        return np.asarray(locs)

    def unique(self, object mi):
        # not meaningful for this table; only location lookup is supported
        raise NotImplementedError

    def get_labels(self, object mi, ObjectVector uniques,
                   Py_ssize_t count_prior, int64_t na_sentinel,
                   bint check_null=True):
        # factorization is not supported for MultiIndex hash tables
        raise NotImplementedError
168 changes: 118 additions & 50 deletions pandas/_libs/index.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,12 @@ from hashtable cimport HashTable
from pandas._libs import algos, hashtable as _hash
from pandas._libs.tslibs import period as periodlib
from pandas._libs.tslib import Timestamp, Timedelta
from pandas._libs.missing import checknull

cdef int64_t iNaT = util.get_nat()


cdef inline is_definitely_invalid_key(object val):
cdef inline bint is_definitely_invalid_key(object val):
if PyTuple_Check(val):
try:
hash(val)
Expand Down Expand Up @@ -585,70 +586,137 @@ cpdef convert_scalar(ndarray arr, object value):
return value


cdef class MultiIndexObjectEngine(ObjectEngine):
cdef class BaseMultiIndexCodesEngine:
"""
provide the same interface as the MultiIndexEngine
but use the IndexEngine for computation

This provides good performance with smaller MI's
Base class for MultiIndexUIntEngine and MultiIndexPyIntEngine, which
represent each label in a MultiIndex as an integer, by juxtaposing the bits
encoding each level, with appropriate offsets.

For instance: if 3 levels have respectively 3, 6 and 1 possible values,
then their labels can be represented using respectively 2, 3 and 1 bits,
as follows:
_ _ _ _____ _ __ __ __
|0|0|0| ... |0| 0|a1|a0| -> offset 0 (first level)
— — — ————— — —— —— ——
|0|0|0| ... |0|b2|b1|b0| -> offset 2 (bits required for first level)
— — — ————— — —— —— ——
|0|0|0| ... |0| 0| 0|c0| -> offset 5 (bits required for first two levels)
‾ ‾ ‾ ‾‾‾‾‾ ‾ ‾‾ ‾‾ ‾‾
and the resulting unsigned integer representation will be:
_ _ _ _____ _ __ __ __ __ __ __
|0|0|0| ... |0|c0|b2|b1|b0|a1|a0|
‾ ‾ ‾ ‾‾‾‾‾ ‾ ‾‾ ‾‾ ‾‾ ‾‾ ‾‾ ‾‾

Offsets are calculated at initialization, labels are transformed by method
_codes_to_ints.

Keys are located by first locating each component against the respective
level, then locating (the integer representation of) codes.
"""
def get_indexer(self, values):
# convert a MI to an ndarray
if hasattr(values, 'values'):
values = values.values
return super(MultiIndexObjectEngine, self).get_indexer(values)
def __init__(self, object levels, object labels,
ndarray[uint64_t, ndim=1] offsets):
"""
Parameters
----------
levels : list-like of numpy arrays
Levels of the MultiIndex
labels : list-like of numpy arrays of integer dtype
Labels of the MultiIndex
offsets : numpy array of uint64 dtype
Pre-calculated offsets, one for each level of the index
"""

cpdef get_loc(self, object val):
self.levels = levels
self.offsets = offsets

# convert a MI to an ndarray
if hasattr(val, 'values'):
val = val.values
return super(MultiIndexObjectEngine, self).get_loc(val)
# Transform labels in a single array, and add 1 so that we are working
# with positive integers (-1 for NaN becomes 0):
codes = (np.array(labels, dtype='int64').T + 1).astype('uint64',
copy=False)

# Map each codes combination in the index to an integer unambiguously
# (no collisions possible), based on the "offsets", which describe the
# number of bits to switch labels for each level:
lab_ints = self._codes_to_ints(codes)

cdef class MultiIndexHashEngine(ObjectEngine):
"""
Use a hashing based MultiIndex impl
but use the IndexEngine for computation
# Initialize underlying index (e.g. libindex.UInt64Engine) with
# integers representing labels: we will use its get_loc and get_indexer
self._base.__init__(self, lambda: lab_ints, len(lab_ints))

This provides good performance with larger MI's
"""
def _extract_level_codes(self, object target, object method=None):
"""
Map the requested list of (tuple) keys to their integer representations
for searching in the underlying integer index.

Parameters
----------
target : list-like of keys
Each key is a tuple, with a label for each level of the index.

Returns
-------
int_keys : 1-dimensional array of dtype uint64 or object
Integers representing one combination each
"""

def _call_monotonic(self, object mi):
# defer these back to the mi itself
return (mi.is_monotonic_increasing,
mi.is_monotonic_decreasing,
mi.is_unique)
level_codes = [lev.get_indexer(codes) + 1 for lev, codes
in zip(self.levels, zip(*target))]
return self._codes_to_ints(np.array(level_codes, dtype='uint64').T)

def get_indexer(self, object target, object method=None,
object limit=None):
lab_ints = self._extract_level_codes(target)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

doc-string would be nice here

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Already replied (not in this PR)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

and since its being re-written let's take this opportunity to fix these.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nope. They clearly must be inherited/set generally.


# All methods (exact, backfill, pad) directly map to the respective
# methods of the underlying (integers) index...
if method is not None:
# but underlying backfill and pad methods require index and keys
# to be sorted. The index already is (checked in
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

pls run/show asv results for reindinxing with methods

# Index._get_fill_indexer), sort (integer representations of) keys:
order = np.argsort(lab_ints)
lab_ints = lab_ints[order]
indexer = (getattr(self._base, 'get_{}_indexer'.format(method))
(self, lab_ints, limit=limit))
indexer = indexer[order]
else:
indexer = self._base.get_indexer(self, lab_ints)

def get_backfill_indexer(self, other, limit=None):
# we coerce to ndarray-of-tuples
values = np.array(self._get_index_values())
return algos.backfill_object(values, other, limit=limit)
return indexer

def get_pad_indexer(self, other, limit=None):
# we coerce to ndarray-of-tuples
values = np.array(self._get_index_values())
return algos.pad_object(values, other, limit=limit)
def get_loc(self, object key):
if is_definitely_invalid_key(key):
raise TypeError("'{key}' is an invalid key".format(key=key))
if not PyTuple_Check(key):
raise KeyError(key)
try:
indices = [0 if checknull(v) else lev.get_loc(v) + 1
for lev, v in zip(self.levels, key)]
except KeyError:
raise KeyError(key)

cpdef get_loc(self, object val):
if is_definitely_invalid_key(val):
raise TypeError("'{val}' is an invalid key".format(val=val))
# Transform indices into single integer:
lab_int = self._codes_to_ints(np.array(indices, dtype='uint64'))

self._ensure_mapping_populated()
if not self.unique:
return self._get_loc_duplicates(val)
return self._base.get_loc(self, lab_int)

try:
return self.mapping.get_item(val)
except TypeError:
raise KeyError(val)
def get_indexer_non_unique(self, object target):
# This needs to be overridden just because the default one works on
# target._values, and target can be itself a MultiIndex.

def get_indexer(self, values):
self._ensure_mapping_populated()
return self.mapping.lookup(values)
lab_ints = self._extract_level_codes(target)
indexer = self._base.get_indexer_non_unique(self, lab_ints)

return indexer

def __contains__(self, object val):
# Default __contains__ looks in the underlying mapping, which in this
# case only contains integer representations.
try:
self.get_loc(val)
return True
except (KeyError, TypeError, ValueError):
return False

cdef _make_hash_table(self, n):
return _hash.MultiIndexHashTable(n)

# Generated from template.
include "index_class_helper.pxi"
Loading