@@ -26,11 +26,12 @@ from hashtable cimport HashTable
2626from pandas._libs import algos, hashtable as _hash
2727from pandas._libs.tslibs import period as periodlib
2828from pandas._libs.tslib import Timestamp, Timedelta
29+ from pandas._libs.missing import checknull
2930
3031cdef int64_t iNaT = util.get_nat()
3132
3233
33- cdef inline is_definitely_invalid_key(object val):
34+ cdef inline bint is_definitely_invalid_key(object val):
3435 if PyTuple_Check(val):
3536 try :
3637 hash (val)
@@ -585,70 +586,137 @@ cpdef convert_scalar(ndarray arr, object value):
585586 return value
586587
587588
588- cdef class MultiIndexObjectEngine(ObjectEngine) :
589+ cdef class BaseMultiIndexCodesEngine :
589590 """
590- provide the same interface as the MultiIndexEngine
591- but use the IndexEngine for computation
592-
593- This provides good performance with samller MI's
591+ Base class for MultiIndexUIntEngine and MultiIndexPyIntEngine, which
592+ represent each label in a MultiIndex as an integer, by juxtaposing the bits
593+ encoding each level, with appropriate offsets.
594+
595+ For instance: if 3 levels have respectively 3, 6 and 1 possible values,
596+ then their labels can be represented using respectively 2, 3 and 1 bits,
597+ as follows:
598+ _ _ _ _____ _ __ __ __
599+ |0|0|0| ... |0| 0|a1|a0| -> offset 0 (first level)
600+ — — — ————— — —— —— ——
601+ |0|0|0| ... |0|b2|b1|b0| -> offset 2 (bits required for first level)
602+ — — — ————— — —— —— ——
603+ |0|0|0| ... |0| 0| 0|c0| -> offset 5 (bits required for first two levels)
604+ ‾ ‾ ‾ ‾‾‾‾‾ ‾ ‾‾ ‾‾ ‾‾
605+ and the resulting unsigned integer representation will be:
606+ _ _ _ _____ _ __ __ __ __ __ __
607+ |0|0|0| ... |0|c0|b2|b1|b0|a1|a0|
608+ ‾ ‾ ‾ ‾‾‾‾‾ ‾ ‾‾ ‾‾ ‾‾ ‾‾ ‾‾ ‾‾
609+
610+ Offsets are calculated at initialization, labels are transformed by method
611+ _codes_to_ints.
612+
613+ Keys are located by first locating each component against the respective
614+ level, then locating (the integer representation of) codes.
594615 """
595- def get_indexer (self , values ):
596- # convert a MI to an ndarray
597- if hasattr (values, ' values' ):
598- values = values.values
599- return super (MultiIndexObjectEngine, self ).get_indexer(values)
616+ def __init__ (self , object levels , object labels ,
617+ ndarray[uint64_t , ndim = 1 ] offsets):
618+ """
619+ Parameters
620+ ----------
621+ levels : list-like of numpy arrays
622+ Levels of the MultiIndex
623+ labels : list-like of numpy arrays of integer dtype
624+ Labels of the MultiIndex
625+ offsets : numpy array of uint64 dtype
626+ Pre-calculated offsets, one for each level of the index
627+ """
600628
601- cpdef get_loc(self , object val):
629+ self .levels = levels
630+ self .offsets = offsets
602631
603- # convert a MI to an ndarray
604- if hasattr (val, ' values ' ):
605- val = val.values
606- return super (MultiIndexObjectEngine, self ).get_loc(val )
632+ # Transform labels into a single array, and add 1 so that we are
633+ # working with non-negative integers (-1 for NaN becomes 0):
634+ codes = (np.array(labels, dtype = ' int64 ' ).T + 1 ).astype( ' uint64 ' ,
635+ copy = False )
607636
637+ # Map each codes combination in the index to an integer unambiguously
638+ # (no collisions possible), based on the "offsets", which describe the
639+ # number of bits by which to shift the labels of each level:
640+ lab_ints = self ._codes_to_ints(codes)
608641
609- cdef class MultiIndexHashEngine(ObjectEngine):
610- """
611- Use a hashing based MultiIndex impl
612- but use the IndexEngine for computation
642+ # Initialize underlying index (e.g. libindex.UInt64Engine) with
643+ # integers representing labels: we will use its get_loc and get_indexer
644+ self ._base.__init__ (self , lambda : lab_ints, len (lab_ints))
613645
614- This provides good performance with larger MI's
615- """
646+ def _extract_level_codes (self , object target , object method = None ):
647+ """
648+ Map the requested list of (tuple) keys to their integer representations
649+ for searching in the underlying integer index.
650+
651+ Parameters
652+ ----------
653+ target : list-like of keys
654+ Each key is a tuple, with a label for each level of the index.
655+
656+ Returns
657+ -------
658+ int_keys : 1-dimensional array of dtype uint64 or object
659+ Integers representing one combination each
660+ """
616661
617- def _call_monotonic (self , object mi ):
618- # defer these back to the mi iteself
619- return (mi.is_monotonic_increasing,
620- mi.is_monotonic_decreasing,
621- mi.is_unique)
662+ level_codes = [lev.get_indexer(codes) + 1 for lev, codes
663+ in zip (self .levels, zip (* target))]
664+ return self ._codes_to_ints(np.array(level_codes, dtype = ' uint64' ).T)
665+
666+ def get_indexer (self , object target , object method = None ,
667+ object limit = None ):
668+ lab_ints = self ._extract_level_codes(target)
669+
670+ # All methods (exact, backfill, pad) directly map to the respective
671+ # methods of the underlying (integers) index...
672+ if method is not None :
673+ # but backfill and pad need sorted index and keys. The index is
674+ # (checked in Index._get_fill_indexer); sort the integer keys and
675+ # map results back with the inverse permutation afterwards:
676+ order = np.argsort(lab_ints)
677+ lab_ints = lab_ints[order]
678+ indexer = (getattr (self ._base, ' get_{}_indexer' .format(method))
679+ (self , lab_ints, limit= limit))
680+ indexer = indexer[np.argsort(order)]
681+ else :
682+ indexer = self ._base.get_indexer(self , lab_ints)
622683
623- def get_backfill_indexer (self , other , limit = None ):
624- # we coerce to ndarray-of-tuples
625- values = np.array(self ._get_index_values())
626- return algos.backfill_object(values, other, limit = limit)
684+ return indexer
627685
628- def get_pad_indexer (self , other , limit = None ):
629- # we coerce to ndarray-of-tuples
630- values = np.array(self ._get_index_values())
631- return algos.pad_object(values, other, limit = limit)
686+ def get_loc (self , object key ):
687+ if is_definitely_invalid_key(key):
688+ raise TypeError (" '{key}' is an invalid key" .format(key = key))
689+ if not PyTuple_Check(key):
690+ raise KeyError (key)
691+ try :
692+ indices = [0 if checknull(v) else lev.get_loc(v) + 1
693+ for lev, v in zip (self .levels, key)]
694+ except KeyError :
695+ raise KeyError (key)
632696
633- cpdef get_loc(self , object val):
634- if is_definitely_invalid_key(val):
635- raise TypeError (" '{val}' is an invalid key" .format(val = val))
697+ # Transform indices into a single integer:
698+ lab_int = self ._codes_to_ints(np.array(indices, dtype = ' uint64' ))
636699
637- self ._ensure_mapping_populated()
638- if not self .unique:
639- return self ._get_loc_duplicates(val)
700+ return self ._base.get_loc(self , lab_int)
640701
641- try :
642- return self .mapping.get_item(val)
643- except TypeError :
644- raise KeyError (val)
702+ def get_indexer_non_unique (self , object target ):
703+ # This needs to be overridden just because the default one works on
704+ # target._values, and target can be itself a MultiIndex.
645705
646- def get_indexer (self , values ):
647- self ._ensure_mapping_populated()
648- return self .mapping.lookup(values)
706+ lab_ints = self ._extract_level_codes(target)
707+ indexer = self ._base.get_indexer_non_unique(self , lab_ints)
708+
709+ return indexer
710+
711+ def __contains__ (self , object val ):
712+ # Default __contains__ looks in the underlying mapping, which in this
713+ # case only contains integer representations.
714+ try :
715+ self .get_loc(val)
716+ return True
717+ except (KeyError , TypeError , ValueError ):
718+ return False
649719
650- cdef _make_hash_table(self , n):
651- return _hash.MultiIndexHashTable(n)
652720
653721# Generated from template.
654722include " index_class_helper.pxi"
0 commit comments