Skip to content

REF: codes-based MultiIndex engine #19074

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Jan 28, 2018
7 changes: 6 additions & 1 deletion doc/source/whatsnew/v0.23.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -380,6 +380,7 @@ Performance Improvements
- Improved performance of ``DatetimeIndex`` and ``Series`` arithmetic operations with Business-Month and Business-Quarter frequencies (:issue:`18489`)
- :func:`Series` / :func:`DataFrame` tab completion limits to 100 values, for better performance. (:issue:`18587`)
- Improved performance of :func:`DataFrame.median` with ``axis=1`` when bottleneck is not installed (:issue:`16468`)
- Improved performance of :func:`MultiIndex.get_loc` for large indexes, at the cost of a reduction in performance for small ones (:issue:`18519`)


.. _whatsnew_0230.docs:
Expand Down Expand Up @@ -476,7 +477,11 @@ MultiIndex
- Bug in :func:`MultiIndex.get_level_values` which would return an invalid index on level of ints with missing values (:issue:`17924`)
- Bug in :func:`MultiIndex.remove_unused_levels` which would fill nan values (:issue:`18417`)
- Bug in :func:`MultiIndex.from_tuples` which would fail to take zipped tuples in python3 (:issue:`18434`)
-
- Bug in :func:`MultiIndex.get_loc` which would fail to automatically cast values between float and int (:issue:`18818`, :issue:`15994`)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i count 5 issues here, but 6 in the top of the PR?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Already replied twice

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

and I still want it listed appropriately.

Copy link
Member Author

@toobaz toobaz Jan 26, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, but it's not a bug, where shall I list it?

- Bug in :func:`MultiIndex.get_loc` which would cast boolean to integer labels (:issue:`19086`)
- Bug in :func:`MultiIndex.get_loc` which would fail to locate keys containing ``NaN`` (:issue:`18485`)
- Bug in :func:`MultiIndex.get_loc` in large :class:`MultiIndex`, would fail when levels had different dtypes (:issue:`18520`)


I/O
^^^
Expand Down
9 changes: 0 additions & 9 deletions pandas/_libs/hashtable.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -31,15 +31,6 @@ cdef class PyObjectHashTable(HashTable):
cpdef get_item(self, object val)
cpdef set_item(self, object key, Py_ssize_t val)

cdef class MultiIndexHashTable(HashTable):
    # Hash table keyed by the uint64 hash of each MultiIndex entry,
    # mapping it to the entry's integer location in the index.
    cdef:
        kh_uint64_t *table  # khash uint64 -> location table
        object mi           # the MultiIndex being indexed (set in map_locations)

    cpdef get_item(self, object val)
    cpdef set_item(self, object key, Py_ssize_t val)
    # single-key collision check; see _check_for_collisions for the vector form
    cdef inline void _check_for_collision(self, Py_ssize_t loc, object label)


cdef class StringHashTable(HashTable):
cdef kh_str_t *table
Expand Down
136 changes: 0 additions & 136 deletions pandas/_libs/hashtable_class_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -899,139 +899,3 @@ cdef class PyObjectHashTable(HashTable):
count += 1

return np.asarray(labels)


cdef class MultiIndexHashTable(HashTable):
    """
    Hash table for a MultiIndex.

    Stores the uint64 hash of each index entry (as produced by the
    MultiIndex's own hashing, e.g. ``mi._hashed_values``) in a khash
    uint64 table, mapping it to the entry's integer location. Because
    only hashes are stored, every lookup is verified against the actual
    labels to detect hash collisions.
    """

    def __init__(self, size_hint=1):
        # Allocate the khash table up front; the MultiIndex itself is
        # attached later, in map_locations.
        self.table = kh_init_uint64()
        self.mi = None
        kh_resize_uint64(self.table, size_hint)

    def __dealloc__(self):
        # Free the C-allocated khash table exactly once.
        if self.table is not NULL:
            kh_destroy_uint64(self.table)
            self.table = NULL

    def __len__(self):
        # Number of occupied entries in the underlying khash table.
        return self.table.size

    def sizeof(self, deep=False):
        """ return the size of my table in bytes """
        return self.table.n_buckets * (sizeof(uint64_t) + # keys
                                       sizeof(size_t) + # vals
                                       sizeof(uint32_t)) # flags

    def _check_for_collisions(self, int64_t[:] locs, object mi):
        # validate that the locs map to the actual values
        # provided in the mi
        # we can only check if we *don't* have any missing values
        # :<
        cdef:
            ndarray[int64_t] alocs

        alocs = np.asarray(locs)
        if (alocs != -1).all():
            # all keys were found; re-fetch them from self.mi and compare
            # label-by-label with what the caller asked for
            result = self.mi.take(locs)
            if isinstance(mi, tuple):
                from pandas import Index
                mi = Index([mi])
            if not result.equals(mi):
                raise AssertionError(
                    "hash collision\nlocs:\n{}\n"
                    "result:\n{}\nmi:\n{}".format(alocs, result, mi))

    cdef inline void _check_for_collision(self, Py_ssize_t loc, object label):
        # validate that the loc maps to the actual value
        # version of _check_for_collisions above for single label (tuple)

        result = self.mi[loc]

        # null datetimelike values (e.g. NaT) hash consistently but do not
        # compare equal, so a pair of nulls is treated as a match
        if not all(l == r or (is_null_datetimelike(l)
                              and is_null_datetimelike(r))
                   for l, r in zip(result, label)):
            raise AssertionError(
                "hash collision\nloc:\n{}\n"
                "result:\n{}\nmi:\n{}".format(loc, result, label))

    def __contains__(self, object key):
        # EAFP: membership is just "get_item does not raise";
        # ValueError/TypeError cover unhashable or malformed keys
        try:
            self.get_item(key)
            return True
        except (KeyError, ValueError, TypeError):
            return False

    cpdef get_item(self, object key):
        """
        Return the integer location of `key` (a tuple of labels).

        Raises KeyError if the key's hash is not in the table; raises
        AssertionError on a verified hash collision.
        """
        cdef:
            khiter_t k
            uint64_t value
            int64_t[:] locs
            Py_ssize_t loc

        value = self.mi._hashed_indexing_key(key)
        k = kh_get_uint64(self.table, value)
        if k != self.table.n_buckets:
            # hash found; confirm the stored location really holds `key`
            loc = self.table.vals[k]
            self._check_for_collision(loc, key)
            return loc
        else:
            raise KeyError(key)

    cpdef set_item(self, object key, Py_ssize_t val):
        # single-item insertion is not supported; use map_locations
        raise NotImplementedError

    @cython.boundscheck(False)
    def map_locations(self, object mi):
        """
        Populate the table from MultiIndex `mi`: hash of entry -> position.

        NOTE(review): for duplicate hashes the later position wins —
        presumably callers only use this on unique indexes; confirm.
        """
        cdef:
            Py_ssize_t i, n
            ndarray[uint64_t] values
            uint64_t val
            int ret = 0
            khiter_t k

        self.mi = mi
        n = len(mi)
        values = mi._hashed_values

        with nogil:
            for i in range(n):
                val = values[i]
                k = kh_put_uint64(self.table, val, &ret)
                self.table.vals[k] = i

    @cython.boundscheck(False)
    def lookup(self, object mi):
        # look up with a target mi
        # Returns an int64 array of locations, -1 where a key is missing.
        cdef:
            Py_ssize_t i, n
            ndarray[uint64_t] values
            int ret = 0
            uint64_t val
            khiter_t k
            int64_t[:] locs

        n = len(mi)
        values = mi._hashed_values

        locs = np.empty(n, dtype=np.int64)

        with nogil:
            for i in range(n):
                val = values[i]
                k = kh_get_uint64(self.table, val)
                if k != self.table.n_buckets:
                    locs[i] = self.table.vals[k]
                else:
                    locs[i] = -1

        # verify hashes actually matched the requested labels
        self._check_for_collisions(locs, mi)
        return np.asarray(locs)

    def unique(self, object mi):
        # not meaningful for this table; only location lookup is supported
        raise NotImplementedError

    def get_labels(self, object mi, ObjectVector uniques,
                   Py_ssize_t count_prior, int64_t na_sentinel,
                   bint check_null=True):
        # factorization is not supported for MultiIndex hash tables
        raise NotImplementedError
168 changes: 118 additions & 50 deletions pandas/_libs/index.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,12 @@ from hashtable cimport HashTable
from pandas._libs import algos, hashtable as _hash
from pandas._libs.tslibs import period as periodlib
from pandas._libs.tslib import Timestamp, Timedelta
from pandas._libs.missing import checknull

cdef int64_t iNaT = util.get_nat()


cdef inline is_definitely_invalid_key(object val):
cdef inline bint is_definitely_invalid_key(object val):
if PyTuple_Check(val):
try:
hash(val)
Expand Down Expand Up @@ -585,70 +586,137 @@ cpdef convert_scalar(ndarray arr, object value):
return value


cdef class MultiIndexObjectEngine(ObjectEngine):
cdef class BaseMultiIndexCodesEngine:
"""
provide the same interface as the MultiIndexEngine
but use the IndexEngine for computation

This provides good performance with smaller MI's
Base class for MultiIndexUIntEngine and MultiIndexPyIntEngine, which
represent each label in a MultiIndex as an integer, by juxtaposing the bits
encoding each level, with appropriate offsets.

For instance: if 3 levels have respectively 3, 6 and 1 possible values,
then their labels can be represented using respectively 2, 3 and 1 bits,
as follows:
_ _ _ _____ _ __ __ __
|0|0|0| ... |0| 0|a1|a0| -> offset 0 (first level)
— — — ————— — —— —— ——
|0|0|0| ... |0|b2|b1|b0| -> offset 2 (bits required for first level)
— — — ————— — —— —— ——
|0|0|0| ... |0| 0| 0|c0| -> offset 5 (bits required for first two levels)
‾ ‾ ‾ ‾‾‾‾‾ ‾ ‾‾ ‾‾ ‾‾
and the resulting unsigned integer representation will be:
_ _ _ _____ _ __ __ __ __ __ __
|0|0|0| ... |0|c0|b2|b1|b0|a1|a0|
‾ ‾ ‾ ‾‾‾‾‾ ‾ ‾‾ ‾‾ ‾‾ ‾‾ ‾‾ ‾‾

Offsets are calculated at initialization, labels are transformed by method
_codes_to_ints.

Keys are located by first locating each component against the respective
level, then locating (the integer representation of) codes.
"""
def get_indexer(self, values):
# convert a MI to an ndarray
if hasattr(values, 'values'):
values = values.values
return super(MultiIndexObjectEngine, self).get_indexer(values)
def __init__(self, object levels, object labels,
ndarray[uint64_t, ndim=1] offsets):
"""
Parameters
----------
levels : list-like of numpy arrays
Levels of the MultiIndex
labels : list-like of numpy arrays of integer dtype
Labels of the MultiIndex
offsets : numpy array of uint64 dtype
Pre-calculated offsets, one for each level of the index
"""

cpdef get_loc(self, object val):
self.levels = levels
self.offsets = offsets

# convert a MI to an ndarray
if hasattr(val, 'values'):
val = val.values
return super(MultiIndexObjectEngine, self).get_loc(val)
# Transform labels in a single array, and add 1 so that we are working
# with positive integers (-1 for NaN becomes 0):
codes = (np.array(labels, dtype='int64').T + 1).astype('uint64',
copy=False)

# Map each codes combination in the index to an integer unambiguously
# (no collisions possible), based on the "offsets", which describe the
# number of bits to switch labels for each level:
lab_ints = self._codes_to_ints(codes)

cdef class MultiIndexHashEngine(ObjectEngine):
"""
Use a hashing based MultiIndex impl
but use the IndexEngine for computation
# Initialize underlying index (e.g. libindex.UInt64Engine) with
# integers representing labels: we will use its get_loc and get_indexer
self._base.__init__(self, lambda: lab_ints, len(lab_ints))

This provides good performance with larger MI's
"""
def _extract_level_codes(self, object target, object method=None):
"""
Map the requested list of (tuple) keys to their integer representations
for searching in the underlying integer index.

Parameters
----------
target : list-like of keys
Each key is a tuple, with a label for each level of the index.

Returns
-------
int_keys : 1-dimensional array of dtype uint64 or object
Integers representing one combination each
"""

def _call_monotonic(self, object mi):
# defer these back to the mi itself
return (mi.is_monotonic_increasing,
mi.is_monotonic_decreasing,
mi.is_unique)
level_codes = [lev.get_indexer(codes) + 1 for lev, codes
in zip(self.levels, zip(*target))]
return self._codes_to_ints(np.array(level_codes, dtype='uint64').T)

def get_indexer(self, object target, object method=None,
object limit=None):
lab_ints = self._extract_level_codes(target)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

doc-string would be nice here

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Already replied (not in this PR)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

and since its being re-written let's take this opportunity to fix these.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nope. They clearly must be inherited/set generally.


# All methods (exact, backfill, pad) directly map to the respective
# methods of the underlying (integers) index...
if method is not None:
# but underlying backfill and pad methods require index and keys
# to be sorted. The index already is (checked in
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

pls run/show asv results for reindinxing with methods

# Index._get_fill_indexer), sort (integer representations of) keys:
order = np.argsort(lab_ints)
lab_ints = lab_ints[order]
indexer = (getattr(self._base, 'get_{}_indexer'.format(method))
(self, lab_ints, limit=limit))
indexer = indexer[order]
else:
indexer = self._base.get_indexer(self, lab_ints)

def get_backfill_indexer(self, other, limit=None):
# we coerce to ndarray-of-tuples
values = np.array(self._get_index_values())
return algos.backfill_object(values, other, limit=limit)
return indexer

def get_pad_indexer(self, other, limit=None):
# we coerce to ndarray-of-tuples
values = np.array(self._get_index_values())
return algos.pad_object(values, other, limit=limit)
def get_loc(self, object key):
if is_definitely_invalid_key(key):
raise TypeError("'{key}' is an invalid key".format(key=key))
if not PyTuple_Check(key):
raise KeyError(key)
try:
indices = [0 if checknull(v) else lev.get_loc(v) + 1
for lev, v in zip(self.levels, key)]
except KeyError:
raise KeyError(key)

cpdef get_loc(self, object val):
if is_definitely_invalid_key(val):
raise TypeError("'{val}' is an invalid key".format(val=val))
# Transform indices into single integer:
lab_int = self._codes_to_ints(np.array(indices, dtype='uint64'))

self._ensure_mapping_populated()
if not self.unique:
return self._get_loc_duplicates(val)
return self._base.get_loc(self, lab_int)

try:
return self.mapping.get_item(val)
except TypeError:
raise KeyError(val)
def get_indexer_non_unique(self, object target):
# This needs to be overridden just because the default one works on
# target._values, and target can be itself a MultiIndex.

def get_indexer(self, values):
self._ensure_mapping_populated()
return self.mapping.lookup(values)
lab_ints = self._extract_level_codes(target)
indexer = self._base.get_indexer_non_unique(self, lab_ints)

return indexer

def __contains__(self, object val):
# Default __contains__ looks in the underlying mapping, which in this
# case only contains integer representations.
try:
self.get_loc(val)
return True
except (KeyError, TypeError, ValueError):
return False

cdef _make_hash_table(self, n):
return _hash.MultiIndexHashTable(n)

# Generated from template.
include "index_class_helper.pxi"
Loading