ENH: ExtensionEngine #45514

Merged · 6 commits · Jan 25, 2022
19 changes: 19 additions & 0 deletions pandas/_libs/index.pyi
@@ -3,6 +3,7 @@ import numpy as np
from pandas._typing import npt

from pandas import MultiIndex
from pandas.core.arrays import ExtensionArray

class IndexEngine:
over_size_threshold: bool
@@ -63,3 +64,21 @@ class BaseMultiIndexCodesEngine:
method: str,
limit: int | None,
) -> npt.NDArray[np.intp]: ...

class ExtensionEngine:
    def __init__(self, values: ExtensionArray) -> None: ...
def __contains__(self, val: object) -> bool: ...
def get_loc(self, val: object) -> int | slice | np.ndarray: ...
def get_indexer(self, values: np.ndarray) -> npt.NDArray[np.intp]: ...
def get_indexer_non_unique(
self,
targets: np.ndarray,
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ...
@property
def is_unique(self) -> bool: ...
@property
def is_monotonic_increasing(self) -> bool: ...
@property
def is_monotonic_decreasing(self) -> bool: ...
def sizeof(self, deep: bool = ...) -> int: ...
def clear_mapping(self): ...
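
For orientation, a minimal sketch of how the stubbed ExtensionEngine API could be exercised by hand. This is illustrative only; in practice pandas constructs the engine internally for ExtensionArray-backed indexes, and the results shown in the comments are what the implementation below is expected to produce for this small input.

import pandas as pd
from pandas._libs import index as libindex

arr = pd.array([1, 2, 2, 4], dtype="Int64")  # a nullable ExtensionArray
engine = libindex.ExtensionEngine(arr)

engine.is_unique                  # False: the value 2 appears twice
engine.is_monotonic_increasing    # True
engine.get_loc(4)                 # 3
engine.get_loc(2)                 # slice(1, 3) for the duplicated value
engine.get_indexer_non_unique(pd.array([2, 5], dtype="Int64"))
# expected: (array([1, 2, -1]), array([1]))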
271 changes: 271 additions & 0 deletions pandas/_libs/index.pyx
@@ -797,3 +797,274 @@ cdef class BaseMultiIndexCodesEngine:

# Generated from template.
include "index_class_helper.pxi"


@cython.internal
@cython.freelist(32)
cdef class SharedEngine:
cdef readonly:
object values # ExtensionArray
bint over_size_threshold

cdef:
bint unique, monotonic_inc, monotonic_dec
bint need_monotonic_check, need_unique_check

def __contains__(self, val: object) -> bool:
# We assume before we get here:
# - val is hashable
try:
self.get_loc(val)
return True
except KeyError:
return False

def clear_mapping(self):
# for compat with IndexEngine
pass

@property
def is_unique(self) -> bool:
if self.need_unique_check:
arr = self.values.unique()
self.unique = len(arr) == len(self.values)

self.need_unique_check = False
return self.unique

cdef _do_monotonic_check(self):
raise NotImplementedError

@property
def is_monotonic_increasing(self) -> bool:
if self.need_monotonic_check:
self._do_monotonic_check()

return self.monotonic_inc == 1

@property
def is_monotonic_decreasing(self) -> bool:
if self.need_monotonic_check:
self._do_monotonic_check()

return self.monotonic_dec == 1

cdef _call_monotonic(self, values):
return algos.is_monotonic(values, timelike=False)

def sizeof(self, deep: bool = False) -> int:
""" return the sizeof our mapping """
return 0

def __sizeof__(self) -> int:
return self.sizeof()

cdef _check_type(self, object obj):
raise NotImplementedError

cpdef get_loc(self, object val):
# -> Py_ssize_t | slice | ndarray[bool]
cdef:
Py_ssize_t loc

if is_definitely_invalid_key(val):
raise TypeError(f"'{val}' is an invalid key")

self._check_type(val)

if self.over_size_threshold and self.is_monotonic_increasing:
if not self.is_unique:
return self._get_loc_duplicates(val)

values = self.values

loc = self._searchsorted_left(val)
if loc >= len(values):
raise KeyError(val)
if values[loc] != val:
raise KeyError(val)
return loc

if not self.unique:
return self._get_loc_duplicates(val)

return self._get_loc_duplicates(val)

cdef inline _get_loc_duplicates(self, object val):
# -> Py_ssize_t | slice | ndarray[bool]
cdef:
Py_ssize_t diff

if self.is_monotonic_increasing:
values = self.values
try:
left = values.searchsorted(val, side='left')
right = values.searchsorted(val, side='right')
except TypeError:
# e.g. GH#29189 get_loc(None) with a Float64Index
raise KeyError(val)

diff = right - left
if diff == 0:
raise KeyError(val)
elif diff == 1:
return left
else:
return slice(left, right)

return self._maybe_get_bool_indexer(val)

cdef Py_ssize_t _searchsorted_left(self, val) except? -1:
"""
See ObjectEngine._searchsorted_left.__doc__.
"""
try:
loc = self.values.searchsorted(val, side="left")
except TypeError as err:
# GH#35788 e.g. val=None with float64 values
            raise KeyError(val) from err
return loc

cdef ndarray _get_bool_indexer(self, val):
raise NotImplementedError

cdef _maybe_get_bool_indexer(self, object val):
# Returns ndarray[bool] or int
cdef:
ndarray[uint8_t, ndim=1, cast=True] indexer

indexer = self._get_bool_indexer(val)
return _unpack_bool_indexer(indexer, val)

def get_indexer(self, values) -> np.ndarray:
# values : type(self.values)
# Note: we only get here with self.is_unique
cdef:
Py_ssize_t i, N = len(values)

res = np.empty(N, dtype=np.intp)

for i in range(N):
val = values[i]
try:
loc = self.get_loc(val)
# Because we are unique, loc should always be an integer
except KeyError:
loc = -1
else:
assert util.is_integer_object(loc), (loc, val)
res[i] = loc

return res

def get_indexer_non_unique(self, targets):
"""
Return an indexer suitable for taking from a non unique index
return the labels in the same order as the target
and a missing indexer into the targets (which correspond
to the -1 indices in the results
Parameters
----------
targets : type(self.values)
Returns
-------
indexer : np.ndarray[np.intp]
missing : np.ndarray[np.intp]
"""
cdef:
Py_ssize_t i, N = len(targets)

indexer = []
missing = []

# See also IntervalIndex.get_indexer_pointwise
for i in range(N):
val = targets[i]

try:
locs = self.get_loc(val)
except KeyError:
locs = np.array([-1], dtype=np.intp)
missing.append(i)
else:
if isinstance(locs, slice):
# Only needed for get_indexer_non_unique
locs = np.arange(locs.start, locs.stop, locs.step, dtype=np.intp)
elif util.is_integer_object(locs):
locs = np.array([locs], dtype=np.intp)
else:
assert locs.dtype.kind == "b"
locs = locs.nonzero()[0]

indexer.append(locs)

try:
indexer = np.concatenate(indexer, dtype=np.intp)
except TypeError:
# numpy<1.20 doesn't accept dtype keyword
indexer = np.concatenate(indexer).astype(np.intp, copy=False)
missing = np.array(missing, dtype=np.intp)

return indexer, missing


cdef class ExtensionEngine(SharedEngine):
def __init__(self, values: "ExtensionArray"):
self.values = values

self.over_size_threshold = len(values) >= _SIZE_CUTOFF
        self.need_unique_check = True
        self.need_monotonic_check = True

cdef _do_monotonic_check(self):
cdef:
bint is_unique

values = self.values
if values._hasna:
self.monotonic_inc = 0
self.monotonic_dec = 0

nunique = len(values.unique())
self.unique = nunique == len(values)
self.need_unique_check = 0
return

try:
ranks = values._rank()

except TypeError:
self.monotonic_inc = 0
self.monotonic_dec = 0
is_unique = 0
else:
self.monotonic_inc, self.monotonic_dec, is_unique = \
self._call_monotonic(ranks)

self.need_monotonic_check = 0

# we can only be sure of uniqueness if is_unique=1
if is_unique:
self.unique = 1
self.need_unique_check = 0

cdef ndarray _get_bool_indexer(self, val):
if checknull(val):
return self.values.isna().view("uint8")

try:
return self.values == val
except TypeError:
            # e.g. if __eq__ returns a BooleanArray instead of ndarray[bool]
try:
return (self.values == val).to_numpy(dtype=bool, na_value=False)
except (TypeError, AttributeError) as err:
# e.g. (self.values == val) returned a bool
# see test_get_loc_generator[string[pyarrow]]
                # e.g. self.values == val raises TypeError because generator has no len
# see test_get_loc_generator[string[python]]
raise KeyError from err

cdef _check_type(self, object val):
hash(val)
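
The _get_bool_indexer fallback above exists because comparing an ExtensionArray to a scalar may return another ExtensionArray (for example a BooleanArray containing pd.NA) rather than a plain ndarray of bools. A small sketch of the conversion it relies on, using a nullable integer array (my own example, not part of the PR):

import numpy as np
import pandas as pd

arr = pd.array([1, pd.NA, 1, 3], dtype="Int64")

mask = arr == 1                   # BooleanArray([True, <NA>, True, False])
# Mirror `(self.values == val).to_numpy(dtype=bool, na_value=False)`:
# NA entries are treated as non-matches.
indexer = mask.to_numpy(dtype=bool, na_value=False)
print(indexer)                    # [ True False  True False]
print(np.nonzero(indexer)[0])     # [0 2], the matching positions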