Skip to content

Commit

Permalink
ENH: ExtensionEngine (#45514)
Browse files Browse the repository at this point in the history
  • Loading branch information
jbrockmendel authored Jan 25, 2022
1 parent 1fd31bd commit 4248b23
Show file tree
Hide file tree
Showing 3 changed files with 337 additions and 37 deletions.
19 changes: 19 additions & 0 deletions pandas/_libs/index.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ import numpy as np
from pandas._typing import npt

from pandas import MultiIndex
from pandas.core.arrays import ExtensionArray

class IndexEngine:
over_size_threshold: bool
Expand Down Expand Up @@ -63,3 +64,21 @@ class BaseMultiIndexCodesEngine:
method: str,
limit: int | None,
) -> npt.NDArray[np.intp]: ...

class ExtensionEngine:
def __init__(self, values: "ExtensionArray"): ...
def __contains__(self, val: object) -> bool: ...
def get_loc(self, val: object) -> int | slice | np.ndarray: ...
def get_indexer(self, values: np.ndarray) -> npt.NDArray[np.intp]: ...
def get_indexer_non_unique(
self,
targets: np.ndarray,
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ...
@property
def is_unique(self) -> bool: ...
@property
def is_monotonic_increasing(self) -> bool: ...
@property
def is_monotonic_decreasing(self) -> bool: ...
def sizeof(self, deep: bool = ...) -> int: ...
def clear_mapping(self): ...
271 changes: 271 additions & 0 deletions pandas/_libs/index.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -797,3 +797,274 @@ cdef class BaseMultiIndexCodesEngine:

# Generated from template.
include "index_class_helper.pxi"


@cython.internal
@cython.freelist(32)
cdef class SharedEngine:
    """
    Base class providing index-engine behavior (membership, get_loc,
    get_indexer, monotonicity/uniqueness caching) for engines whose
    ``values`` are not a plain ndarray.

    Unlike IndexEngine, no hash-table mapping is built: every lookup
    goes through ``values.searchsorted`` (monotonic case) or a boolean
    equality mask (general case), so ``clear_mapping`` is a no-op.

    Subclasses must implement ``_do_monotonic_check``,
    ``_get_bool_indexer`` and ``_check_type``.
    """
    cdef readonly:
        object values # ExtensionArray
        bint over_size_threshold

    cdef:
        # Cached flags; the need_* companions mark each cache as stale.
        bint unique, monotonic_inc, monotonic_dec
        bint need_monotonic_check, need_unique_check

    def __contains__(self, val: object) -> bool:
        # We assume before we get here:
        # - val is hashable
        try:
            self.get_loc(val)
            return True
        except KeyError:
            return False

    def clear_mapping(self):
        # for compat with IndexEngine
        pass

    @property
    def is_unique(self) -> bool:
        """Whether ``values`` has no duplicates (computed once, cached)."""
        if self.need_unique_check:
            arr = self.values.unique()
            self.unique = len(arr) == len(self.values)

        self.need_unique_check = False
        return self.unique

    cdef _do_monotonic_check(self):
        # Subclass hook: populate monotonic_inc/monotonic_dec (and
        # possibly unique) and clear need_monotonic_check.
        raise NotImplementedError

    @property
    def is_monotonic_increasing(self) -> bool:
        if self.need_monotonic_check:
            self._do_monotonic_check()

        return self.monotonic_inc == 1

    @property
    def is_monotonic_decreasing(self) -> bool:
        if self.need_monotonic_check:
            self._do_monotonic_check()

        return self.monotonic_dec == 1

    cdef _call_monotonic(self, values):
        # Returns (is_monotonic_inc, is_monotonic_dec, is_unique).
        return algos.is_monotonic(values, timelike=False)

    def sizeof(self, deep: bool = False) -> int:
        """ return the sizeof our mapping """
        # No mapping is kept, hence zero regardless of ``deep``.
        return 0

    def __sizeof__(self) -> int:
        return self.sizeof()

    cdef _check_type(self, object obj):
        # Subclass hook: raise TypeError/KeyError for keys of an
        # unsupported type before any lookup is attempted.
        raise NotImplementedError

    cpdef get_loc(self, object val):
        # -> Py_ssize_t | slice | ndarray[bool]
        # Integer position for a unique match, slice for a contiguous
        # run of duplicates, boolean mask otherwise; KeyError if absent.
        cdef:
            Py_ssize_t loc

        if is_definitely_invalid_key(val):
            raise TypeError(f"'{val}' is an invalid key")

        self._check_type(val)

        if self.over_size_threshold and self.is_monotonic_increasing:
            if not self.is_unique:
                return self._get_loc_duplicates(val)

            values = self.values

            # Unique + monotonic: a single binary search suffices.
            loc = self._searchsorted_left(val)
            if loc >= len(values):
                raise KeyError(val)
            if values[loc] != val:
                raise KeyError(val)
            return loc

        # NOTE(review): both of the remaining paths call
        # _get_loc_duplicates, so the ``if not self.unique`` test is
        # redundant (kept for parity with IndexEngine.get_loc, where
        # the branches differ) — candidate for simplification.
        if not self.unique:
            return self._get_loc_duplicates(val)

        return self._get_loc_duplicates(val)

    cdef inline _get_loc_duplicates(self, object val):
        # -> Py_ssize_t | slice | ndarray[bool]
        cdef:
            Py_ssize_t diff

        if self.is_monotonic_increasing:
            # Duplicates of val occupy a contiguous [left, right) run.
            values = self.values
            try:
                left = values.searchsorted(val, side='left')
                right = values.searchsorted(val, side='right')
            except TypeError:
                # e.g. GH#29189 get_loc(None) with a Float64Index
                raise KeyError(val)

            diff = right - left
            if diff == 0:
                raise KeyError(val)
            elif diff == 1:
                return left
            else:
                return slice(left, right)

        # Non-monotonic: fall back to a full boolean-mask scan.
        return self._maybe_get_bool_indexer(val)

    cdef Py_ssize_t _searchsorted_left(self, val) except? -1:
        """
        See ObjectEngine._searchsorted_left.__doc__.
        """
        try:
            loc = self.values.searchsorted(val, side="left")
        except TypeError as err:
            # GH#35788 e.g. val=None with float64 values
            raise KeyError(val)
        return loc

    cdef ndarray _get_bool_indexer(self, val):
        # Subclass hook: boolean/uint8 mask of positions equal to val.
        raise NotImplementedError

    cdef _maybe_get_bool_indexer(self, object val):
        # Returns ndarray[bool] or int
        # (collapses a single-True mask to its integer position).
        cdef:
            ndarray[uint8_t, ndim=1, cast=True] indexer

        indexer = self._get_bool_indexer(val)
        return _unpack_bool_indexer(indexer, val)

    def get_indexer(self, values) -> np.ndarray:
        # values : type(self.values)
        # Note: we only get here with self.is_unique
        cdef:
            Py_ssize_t i, N = len(values)

        res = np.empty(N, dtype=np.intp)

        for i in range(N):
            val = values[i]
            try:
                loc = self.get_loc(val)
                # Because we are unique, loc should always be an integer
            except KeyError:
                loc = -1
            else:
                assert util.is_integer_object(loc), (loc, val)
            res[i] = loc

        return res

    def get_indexer_non_unique(self, targets):
        """
        Return an indexer suitable for taking from a non unique index
        return the labels in the same order as the target
        and a missing indexer into the targets (which correspond
        to the -1 indices in the results
        Parameters
        ----------
        targets : type(self.values)
        Returns
        -------
        indexer : np.ndarray[np.intp]
        missing : np.ndarray[np.intp]
        """
        cdef:
            Py_ssize_t i, N = len(targets)

        indexer = []
        missing = []

        # See also IntervalIndex.get_indexer_pointwise
        for i in range(N):
            val = targets[i]

            try:
                locs = self.get_loc(val)
            except KeyError:
                # Absent target contributes a single -1 entry.
                locs = np.array([-1], dtype=np.intp)
                missing.append(i)
            else:
                # Normalize get_loc's int | slice | bool-mask result to
                # an intp array of positions.
                if isinstance(locs, slice):
                    # Only needed for get_indexer_non_unique
                    locs = np.arange(locs.start, locs.stop, locs.step, dtype=np.intp)
                elif util.is_integer_object(locs):
                    locs = np.array([locs], dtype=np.intp)
                else:
                    assert locs.dtype.kind == "b"
                    locs = locs.nonzero()[0]

            indexer.append(locs)

        try:
            indexer = np.concatenate(indexer, dtype=np.intp)
        except TypeError:
            # numpy<1.20 doesn't accept dtype keyword
            indexer = np.concatenate(indexer).astype(np.intp, copy=False)
        missing = np.array(missing, dtype=np.intp)

        return indexer, missing


cdef class ExtensionEngine(SharedEngine):
    """
    Index engine for an Index backed by a 1D ExtensionArray.

    Supplies the ExtensionArray-specific hooks SharedEngine requires:
    monotonicity via ``_rank``, NA-aware boolean masks, and a
    hashability check on lookup keys.
    """
    def __init__(self, values: "ExtensionArray"):
        self.values = values

        # Past this size, get_loc prefers binary search over a full
        # boolean scan when the values are monotonic (see SharedEngine).
        self.over_size_threshold = len(values) >= _SIZE_CUTOFF
        # Caches are computed lazily; mark both as stale.
        # (The original set need_unique_check twice; once suffices.)
        self.need_unique_check = True
        self.need_monotonic_check = True

    cdef _do_monotonic_check(self):
        # Populate monotonic_inc/monotonic_dec and, when determinable
        # as a by-product, the uniqueness cache.
        cdef:
            bint is_unique

        values = self.values
        if values._hasna:
            # NAs make the array non-monotonic by convention; uniqueness
            # must then be computed directly.
            self.monotonic_inc = 0
            self.monotonic_dec = 0

            nunique = len(values.unique())
            self.unique = nunique == len(values)
            self.need_unique_check = 0
            return

        try:
            # Ranks are a plain ndarray with the same ordering as the
            # values, suitable for algos.is_monotonic.
            ranks = values._rank()

        except TypeError:
            # Unorderable values: not monotonic; uniqueness unknown.
            self.monotonic_inc = 0
            self.monotonic_dec = 0
            is_unique = 0
        else:
            self.monotonic_inc, self.monotonic_dec, is_unique = \
                self._call_monotonic(ranks)

        self.need_monotonic_check = 0

        # we can only be sure of uniqueness if is_unique=1
        if is_unique:
            self.unique = 1
            self.need_unique_check = 0

    cdef ndarray _get_bool_indexer(self, val):
        # Boolean/uint8 mask of positions equal to ``val``; a null key
        # matches the array's NA positions.
        if checknull(val):
            return self.values.isna().view("uint8")

        try:
            return self.values == val
        except TypeError:
            # e.g. if __eq__ returns a BooleanArray instead of ndarray[bool]
            try:
                return (self.values == val).to_numpy(dtype=bool, na_value=False)
            except (TypeError, AttributeError) as err:
                # e.g. (self.values == val) returned a bool
                #  see test_get_loc_generator[string[pyarrow]]
                # e.g. self.value == val raises TypeError bc generator has no len
                #  see test_get_loc_generator[string[python]]
                # Include the key so the KeyError is informative,
                # matching the other raises in SharedEngine.
                raise KeyError(val) from err

    cdef _check_type(self, object val):
        # get_loc keys must be hashable; raises TypeError otherwise.
        hash(val)
Loading

0 comments on commit 4248b23

Please sign in to comment.