Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: ExtensionEngine #45514

Merged
merged 6 commits into from
Jan 25, 2022
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions pandas/_libs/index.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -63,3 +63,21 @@ class BaseMultiIndexCodesEngine:
method: str,
limit: int | None,
) -> npt.NDArray[np.intp]: ...

class ExtensionEngine:
    # Type stub for the Cython engine in pandas/_libs/index.pyx that backs
    # Index objects whose values are stored in an ExtensionArray.
    def __init__(self, values: "ExtensionArray") -> None: ...
    def __contains__(self, val: object) -> bool: ...
    # Returns an integer position, a slice (contiguous duplicates), or a
    # boolean ndarray mask (scattered duplicates).
    def get_loc(self, val: object) -> int | slice | np.ndarray: ...
    def get_indexer(self, values: np.ndarray) -> npt.NDArray[np.intp]: ...
    def get_indexer_non_unique(
        self,
        targets: np.ndarray,
    ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ...
    @property
    def is_unique(self) -> bool: ...
    @property
    def is_monotonic_increasing(self) -> bool: ...
    @property
    def is_monotonic_decreasing(self) -> bool: ...
    def sizeof(self, deep: bool = ...) -> int: ...
    def clear_mapping(self) -> None: ...
271 changes: 271 additions & 0 deletions pandas/_libs/index.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -797,3 +797,274 @@ cdef class BaseMultiIndexCodesEngine:

# Generated from template.
include "index_class_helper.pxi"


@cython.internal
@cython.freelist(32)
cdef class SharedEngine:
    """
    Base class for engines whose ``values`` are an ExtensionArray rather
    than an ndarray backed by a hashtable.

    Subclasses must implement ``_do_monotonic_check``, ``_get_bool_indexer``
    and ``_check_type``.
    """
    cdef readonly:
        object values  # ExtensionArray
        bint over_size_threshold

    cdef:
        # lazily-computed flags; the need_* flags mark them as stale
        bint unique, monotonic_inc, monotonic_dec
        bint need_monotonic_check, need_unique_check

    def __contains__(self, val: object) -> bool:
        # We assume before we get here:
        #  - val is hashable
        try:
            self.get_loc(val)
            return True
        except KeyError:
            return False

    def clear_mapping(self):
        # for compat with IndexEngine; we hold no hashtable state to clear
        pass

    @property
    def is_unique(self) -> bool:
        if self.need_unique_check:
            arr = self.values.unique()
            self.unique = len(arr) == len(self.values)

            self.need_unique_check = False
        return self.unique

    cdef _do_monotonic_check(self):
        # Subclass responsibility: set monotonic_inc/monotonic_dec (and
        #  possibly unique) and clear need_monotonic_check.
        raise NotImplementedError

    @property
    def is_monotonic_increasing(self) -> bool:
        if self.need_monotonic_check:
            self._do_monotonic_check()

        return self.monotonic_inc == 1

    @property
    def is_monotonic_decreasing(self) -> bool:
        if self.need_monotonic_check:
            self._do_monotonic_check()

        return self.monotonic_dec == 1

    cdef _call_monotonic(self, values):
        return algos.is_monotonic(values, timelike=False)

    def sizeof(self, deep: bool = False) -> int:
        """ return the sizeof our mapping """
        # no hashtable is maintained, so there is nothing to measure
        return 0

    def __sizeof__(self) -> int:
        return self.sizeof()

    cdef _check_type(self, object obj):
        # Subclass responsibility: raise if obj cannot be a key.
        raise NotImplementedError

    cpdef get_loc(self, object val):
        # -> Py_ssize_t | slice | ndarray[bool]
        cdef:
            Py_ssize_t loc

        if is_definitely_invalid_key(val):
            raise TypeError(f"'{val}' is an invalid key")

        self._check_type(val)

        if self.over_size_threshold and self.is_monotonic_increasing:
            if not self.is_unique:
                return self._get_loc_duplicates(val)

            # unique + monotonic: a single binary search suffices
            values = self.values

            loc = self._searchsorted_left(val)
            if loc >= len(values):
                raise KeyError(val)
            if values[loc] != val:
                raise KeyError(val)
            return loc

        # _get_loc_duplicates handles the unique case too (on the monotonic
        #  path a width-1 match returns the integer position), so no separate
        #  uniqueness branch is needed here.
        return self._get_loc_duplicates(val)

    cdef inline _get_loc_duplicates(self, object val):
        # -> Py_ssize_t | slice | ndarray[bool]
        cdef:
            Py_ssize_t diff

        if self.is_monotonic_increasing:
            values = self.values
            try:
                left = values.searchsorted(val, side='left')
                right = values.searchsorted(val, side='right')
            except TypeError:
                # e.g. GH#29189 get_loc(None) with a Float64Index
                raise KeyError(val)

            diff = right - left
            if diff == 0:
                raise KeyError(val)
            elif diff == 1:
                return left
            else:
                return slice(left, right)

        return self._maybe_get_bool_indexer(val)

    cdef Py_ssize_t _searchsorted_left(self, val) except? -1:
        """
        See ObjectEngine._searchsorted_left.__doc__.
        """
        try:
            loc = self.values.searchsorted(val, side="left")
        except TypeError as err:
            # GH#35788 e.g. val=None with float64 values
            raise KeyError(val) from err
        return loc

    cdef ndarray _get_bool_indexer(self, val):
        # Subclass responsibility: boolean mask of positions equal to val.
        raise NotImplementedError

    cdef _maybe_get_bool_indexer(self, object val):
        # Returns ndarray[bool] or int
        cdef:
            ndarray[uint8_t, ndim=1, cast=True] indexer

        indexer = self._get_bool_indexer(val)
        return _unpack_bool_indexer(indexer, val)

    def get_indexer(self, values) -> np.ndarray:
        # values : type(self.values)
        # Note: we only get here with self.is_unique
        cdef:
            Py_ssize_t i, N = len(values)

        res = np.empty(N, dtype=np.intp)

        for i in range(N):
            val = values[i]
            try:
                loc = self.get_loc(val)
                # Because we are unique, loc should always be an integer
            except KeyError:
                loc = -1
            else:
                assert util.is_integer_object(loc), (loc, val)
            res[i] = loc

        return res

    def get_indexer_non_unique(self, targets):
        """
        Return an indexer suitable for taking from a non unique index
        return the labels in the same order as the target
        and a missing indexer into the targets (which correspond
        to the -1 indices in the results)

        Parameters
        ----------
        targets : type(self.values)

        Returns
        -------
        indexer : np.ndarray[np.intp]
        missing : np.ndarray[np.intp]
        """
        cdef:
            Py_ssize_t i, N = len(targets)

        indexer = []
        missing = []

        # See also IntervalIndex.get_indexer_pointwise
        for i in range(N):
            val = targets[i]

            try:
                locs = self.get_loc(val)
            except KeyError:
                locs = np.array([-1], dtype=np.intp)
                missing.append(i)
            else:
                if isinstance(locs, slice):
                    # Only needed for get_indexer_non_unique
                    locs = np.arange(locs.start, locs.stop, locs.step, dtype=np.intp)
                elif util.is_integer_object(locs):
                    locs = np.array([locs], dtype=np.intp)
                else:
                    assert locs.dtype.kind == "b"
                    # convert boolean mask to integer positions
                    locs = locs.nonzero()[0]

            indexer.append(locs)

        try:
            indexer = np.concatenate(indexer, dtype=np.intp)
        except TypeError:
            # numpy<1.20 doesn't accept dtype keyword
            indexer = np.concatenate(indexer).astype(np.intp, copy=False)
        missing = np.array(missing, dtype=np.intp)

        return indexer, missing


cdef class ExtensionEngine(SharedEngine):
    """
    Engine for Index objects backed by a generic ExtensionArray.
    """
    def __init__(self, values: "ExtensionArray"):
        self.values = values

        self.over_size_threshold = len(values) >= _SIZE_CUTOFF
        # both cached properties start out stale
        self.need_monotonic_check = True
        self.need_unique_check = True

    cdef _do_monotonic_check(self):
        cdef:
            bint is_unique

        values = self.values
        if values._hasnans:
            # the presence of NAs precludes monotonicity; fall back to a
            #  direct uniqueness check while we are here
            self.monotonic_inc = 0
            self.monotonic_dec = 0

            nunique = len(values.unique())
            self.unique = nunique == len(values)
            self.need_unique_check = 0
            return

        try:
            ranks = values._rank()

        except TypeError:
            # values are not orderable -> neither monotonic nor known-unique
            self.monotonic_inc = 0
            self.monotonic_dec = 0
            is_unique = 0
        else:
            self.monotonic_inc, self.monotonic_dec, is_unique = \
                self._call_monotonic(ranks)

        self.need_monotonic_check = 0

        # we can only be sure of uniqueness if is_unique=1
        if is_unique:
            self.unique = 1
            self.need_unique_check = 0

    cdef ndarray _get_bool_indexer(self, val):
        # boolean mask of positions matching val; NA-valued keys match NAs
        if checknull(val):
            return self.values.isna().view("uint8")

        try:
            return self.values == val
        except TypeError:
            # e.g. if __eq__ returns a BooleanArray instead of ndarray[bool]
            try:
                return (self.values == val).to_numpy(dtype=bool, na_value=False)
            except (TypeError, AttributeError) as err:
                # e.g. (self.values == val) returned a bool
                #  see test_get_loc_generator[string[pyarrow]]
                # e.g. self.values == val raises TypeError bc generator has no len
                #  see test_get_loc_generator[string[python]]
                raise KeyError(val) from err

    cdef _check_type(self, object val):
        # any hashable object is a potentially valid key
        hash(val)
32 changes: 9 additions & 23 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,6 @@
tz_to_dtype,
validate_tz_from_dtype,
)
from pandas.core.arrays.masked import BaseMaskedArray
from pandas.core.arrays.sparse import SparseDtype
from pandas.core.base import (
IndexOpsMixin,
Expand Down Expand Up @@ -853,15 +852,11 @@ def _engine(
) -> libindex.IndexEngine:
# For base class (object dtype) we get ObjectEngine

if isinstance(self._values, BaseMaskedArray):
# TODO(ExtensionIndex): use libindex.NullableEngine(self._values)
return libindex.ObjectEngine(self._get_engine_target())
elif (
if (
isinstance(self._values, ExtensionArray)
and self._engine_type is libindex.ObjectEngine
):
# TODO(ExtensionIndex): use libindex.ExtensionEngine(self._values)
return libindex.ObjectEngine(self._get_engine_target())
return libindex.ExtensionEngine(self._values)

# to avoid a reference cycle, bind `target_values` to a local variable, so
# `self` is not passed into the lambda.
Expand Down Expand Up @@ -3316,12 +3311,6 @@ def _wrap_setop_result(self, other: Index, result) -> Index:
result = result.rename(name)
else:
result = self._shallow_copy(result, name=name)

if type(self) is Index and self.dtype != _dtype_obj:
# i.e. ExtensionArray-backed
# TODO(ExtensionIndex): revert this astype; it is a kludge to make
# it possible to split ExtensionEngine from ExtensionIndex PR.
return result.astype(self.dtype, copy=False)
return result

@final
Expand Down Expand Up @@ -4879,8 +4868,9 @@ def _can_use_libjoin(self) -> bool:
"""
        Whether we can use the fastpaths implemented in _libs.join
"""
# Note: this will need to be updated when e.g. Nullable dtypes
# are supported in Indexes.
if type(self) is Index:
# excludes EAs
return isinstance(self.dtype, np.dtype)
return not is_interval_dtype(self.dtype)

# --------------------------------------------------------------------
Expand Down Expand Up @@ -4946,16 +4936,12 @@ def _values(self) -> ExtensionArray | np.ndarray:
"""
return self._data

def _get_engine_target(self) -> np.ndarray:
def _get_engine_target(self) -> ArrayLike:
"""
Get the ndarray that we can pass to the IndexEngine constructor.
Get the ndarray or ExtensionArray that we can pass to the IndexEngine
constructor.
"""
# error: Incompatible return value type (got "Union[ExtensionArray,
# ndarray]", expected "ndarray")
if type(self) is Index and isinstance(self._values, ExtensionArray):
# TODO(ExtensionIndex): remove special-case, just use self._values
return self._values.astype(object)
return self._values # type: ignore[return-value]
return self._values

def _from_join_target(self, result: np.ndarray) -> ArrayLike:
"""
Expand Down