Skip to content

POC: Use khash sets instead of maps for isin #53059

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 11 commits into from
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -493,6 +493,7 @@ Performance improvements
- Performance improvement in :meth:`Series.value_counts` and :meth:`Series.mode` for masked dtypes (:issue:`54984`, :issue:`55340`)
- Performance improvement in :meth:`.DataFrameGroupBy.nunique` and :meth:`.SeriesGroupBy.nunique` (:issue:`55972`)
- Performance improvement in :meth:`.SeriesGroupBy.idxmax`, :meth:`.SeriesGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.DataFrameGroupBy.idxmin` (:issue:`54234`)
- Performance improvement in :meth:`Series.isin` (:issue:`39799`)
- Performance improvement when indexing into a non-unique index (:issue:`55816`)
- Performance improvement when indexing with more than 4 keys (:issue:`54550`)
- Performance improvement when localizing time to UTC (:issue:`55241`)
Expand Down
97 changes: 92 additions & 5 deletions pandas/_libs/hashtable_func_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,28 @@ dtypes = [('Complex128', 'complex128', 'complex128',

}}

# Make khash's unsigned 32-bit integer typedef visible to this template; the
# per-dtype set declarations that follow use it for their counter fields and
# for the return values of the put/resize helpers.
cdef extern from "pandas/vendored/klib/khash.h":
ctypedef uint32_t khuint_t

{{for name, dtype, ttype, c_type, to_c_type in dtypes}}

{{if dtype != "object" }}
# Cython-side declarations for the khash *set* instantiated for {{dtype}} keys
# (the C side is generated by the KHASH_SET_INIT_* macros under the name
# "{{dtype}}_set").  Every helper is declared nogil so it can be called from
# the `with nogil:` sections of the functions generated below.
cdef extern from "pandas/vendored/klib/khash.h":
    ctypedef struct kh_{{dtype}}_set_t:
        khuint_t n_buckets, size, n_occupied, upper_bound
        uint32_t *flags
        {{c_type}} *keys
        # NOTE(review): the set macros pass `char` as the value type and 0 as
        # the map flag; presumably khash never allocates `vals` in that case,
        # so callers should not read or write it -- TODO confirm.
        char *vals

    kh_{{dtype}}_set_t* kh_init_{{dtype}}_set() nogil
    void kh_destroy_{{dtype}}_set(kh_{{dtype}}_set_t*) nogil
    void kh_clear_{{dtype}}_set(kh_{{dtype}}_set_t*) nogil
    # Lookup helper: the membership test calls kh_get_<dtype>_set and compares
    # the result with n_buckets, so it must be declared alongside the others.
    khuint_t kh_get_{{dtype}}_set(kh_{{dtype}}_set_t*, {{c_type}}) nogil
    void kh_resize_{{dtype}}_set(kh_{{dtype}}_set_t*, khuint_t) nogil
    khuint_t kh_put_{{dtype}}_set(kh_{{dtype}}_set_t*, {{c_type}}, int*) nogil
    void kh_del_{{dtype}}_set(kh_{{dtype}}_set_t*, khuint_t) nogil

    # NOTE(review): this is typed for the *map* table (kh_{{dtype}}_t) and
    # takes an iterator, not a key; it is kept unchanged here -- verify that
    # it is not being passed a set table or a raw value by callers.
    bint kh_exist_{{dtype}}(kh_{{dtype}}_t*, khiter_t) nogil
{{endif}}

@cython.wraparound(False)
@cython.boundscheck(False)
Expand Down Expand Up @@ -138,12 +158,17 @@ cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first', cons
{{endif}}
Py_ssize_t i, n = len(values), first_na = -1
khiter_t k
kh_{{ttype}}_t *table = kh_init_{{ttype}}()
ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool')
bint seen_na = False, uses_mask = mask is not None
bint seen_multiple_na = False

{{if dtype == "object"}}
cdef kh_{{ttype}}_t *table = kh_init_{{ttype}}()
kh_resize_{{ttype}}(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT))
{{else}}
cdef kh_{{ttype}}_set_t *table = kh_init_{{ttype}}_set()
kh_resize_{{ttype}}_set(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT))
{{endif}}

if keep not in ('last', 'first', False):
raise ValueError('keep must be either "first", "last" or False')
Expand All @@ -152,6 +177,21 @@ cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first', cons
{{cond}} keep == {{keep}}:
{{if dtype == 'object'}}
if True:
{{if keep == '"last"'}}
for i in range(n - 1, -1, -1):
{{else}}
for i in range(n):
{{endif}}
if uses_mask and mask[i]:
if seen_na:
out[i] = True
else:
out[i] = False
seen_na = True
else:
value = {{to_c_type}}(values[i])
kh_put_{{ttype}}(table, value, &ret)
out[i] = ret == 0
{{else}}
with nogil:
{{endif}}
Expand All @@ -168,16 +208,13 @@ cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first', cons
seen_na = True
else:
value = {{to_c_type}}(values[i])
kh_put_{{ttype}}(table, value, &ret)
kh_put_{{ttype}}_set(table, value, &ret)
out[i] = ret == 0
{{endfor}}

else:
{{if dtype == 'object'}}
if True:
{{else}}
with nogil:
{{endif}}
for i in range(n):
if uses_mask and mask[i]:
if not seen_na:
Expand All @@ -201,8 +238,38 @@ cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first', cons
k = kh_put_{{ttype}}(table, value, &ret)
table.vals[k] = i
out[i] = 0
{{else}}
with nogil:
for i in range(n):
if uses_mask and mask[i]:
if not seen_na:
first_na = i
seen_na = True
out[i] = 0
elif not seen_multiple_na:
out[i] = 1
out[first_na] = 1
seen_multiple_na = True
else:
out[i] = 1

else:
value = {{to_c_type}}(values[i])
k = kh_exist_{{ttype}}(table, value)
if k:
out[table.vals[k]] = 1
out[i] = 1
else:
k = kh_put_{{ttype}}_set(table, value, &ret)
table.vals[k] = i
out[i] = 0
{{endif}}

{{if dtype == "object"}}
kh_destroy_{{ttype}}(table)
{{else}}
kh_destroy_{{ttype}}_set(table)
{{endif}}
return out


Expand Down Expand Up @@ -243,11 +310,19 @@ cdef ismember_{{dtype}}(const {{dtype}}_t[:] arr, const {{dtype}}_t[:] values):
{{c_type}} val
{{endif}}

{{if dtype != "object"}}
kh_{{ttype}}_set_t *table = kh_init_{{ttype}}_set()
{{else}}
kh_{{ttype}}_t *table = kh_init_{{ttype}}()
{{endif}}

# construct the table
n = len(values)
{{if dtype != "object"}}
kh_resize_{{ttype}}_set(table, n)
{{else}}
kh_resize_{{ttype}}(table, n)
{{endif}}

{{if dtype == 'object'}}
if True:
Expand All @@ -256,7 +331,11 @@ cdef ismember_{{dtype}}(const {{dtype}}_t[:] arr, const {{dtype}}_t[:] values):
{{endif}}
for i in range(n):
val = {{to_c_type}}(values[i])
{{if dtype != "object"}}
kh_put_{{ttype}}_set(table, val, &ret)
{{else}}
kh_put_{{ttype}}(table, val, &ret)
{{endif}}

# test membership
n = len(arr)
Expand All @@ -269,10 +348,18 @@ cdef ismember_{{dtype}}(const {{dtype}}_t[:] arr, const {{dtype}}_t[:] values):
{{endif}}
for i in range(n):
val = {{to_c_type}}(arr[i])
{{if dtype != "object"}}
k = kh_get_{{ttype}}_set(table, val)
{{else}}
k = kh_get_{{ttype}}(table, val)
{{endif}}
result[i] = (k != table.n_buckets)

{{if dtype != "object"}}
kh_destroy_{{ttype}}_set(table)
{{else}}
kh_destroy_{{ttype}}(table)
{{endif}}
return result.view(np.bool_)

# ----------------------------------------------------------------------
Expand Down
23 changes: 23 additions & 0 deletions pandas/_libs/include/pandas/vendored/klib/khash.h
Original file line number Diff line number Diff line change
Expand Up @@ -653,6 +653,9 @@ static inline khuint_t __ac_Wang_hash(khuint_t key) {
#define KHASH_MAP_INIT_UINT(name, khval_t) \
KHASH_INIT(name, khuint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)

/*! @function
  @abstract     Instantiate a hash set containing unsigned 32-bit integer keys
  @param  name  Name of the hash table [symbol]
  @discussion   Same key type, hash and equality functions as
                KHASH_MAP_INIT_UINT, but the value type is fixed to char and
                the fourth KHASH_INIT argument is 0 instead of 1.
 */
#define KHASH_SET_INIT_UINT(name) \
KHASH_INIT(name, khuint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal)

/*! @function
@abstract Instantiate a hash map containing 64-bit integer keys
@param name Name of the hash table [symbol]
Expand Down Expand Up @@ -684,9 +687,15 @@ static inline khuint_t __ac_Wang_hash(khuint_t key) {
#define KHASH_MAP_INIT_INT16(name, khval_t) \
KHASH_INIT(name, khint16_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)

/*! @function
  @abstract     Instantiate a hash set containing 16-bit integer keys
  @param  name  Name of the hash table [symbol]
  @discussion   Set counterpart of KHASH_MAP_INIT_INT16: value type char,
                fourth KHASH_INIT argument 0.
 */
#define KHASH_SET_INIT_INT16(name) \
KHASH_INIT(name, khint16_t, char, 0, kh_int_hash_func, kh_int_hash_equal)

/* Map with unsigned 16-bit integer keys and caller-chosen value type. */
#define KHASH_MAP_INIT_UINT16(name, khval_t) \
KHASH_INIT(name, khuint16_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)

/*! @function
  @abstract     Instantiate a hash set containing unsigned 16-bit integer keys
  @param  name  Name of the hash table [symbol]
  @discussion   Set counterpart of KHASH_MAP_INIT_UINT16: value type char,
                fourth KHASH_INIT argument 0.
 */
#define KHASH_SET_INIT_UINT16(name) \
KHASH_INIT(name, khuint16_t, char, 0, kh_int_hash_func, kh_int_hash_equal)

/*! @function
@abstract Instantiate a hash map containing 8bit-integer keys
@param name Name of the hash table [symbol]
Expand All @@ -695,9 +704,15 @@ static inline khuint_t __ac_Wang_hash(khuint_t key) {
#define KHASH_MAP_INIT_INT8(name, khval_t) \
KHASH_INIT(name, khint8_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)

/*! @function
  @abstract     Instantiate a hash set containing 8-bit integer keys
  @param  name  Name of the hash table [symbol]
  @discussion   Set counterpart of KHASH_MAP_INIT_INT8: value type char,
                fourth KHASH_INIT argument 0.
 */
#define KHASH_SET_INIT_INT8(name) \
KHASH_INIT(name, khint8_t, char, 0, kh_int_hash_func, kh_int_hash_equal)

/* Map with unsigned 8-bit integer keys and caller-chosen value type. */
#define KHASH_MAP_INIT_UINT8(name, khval_t) \
KHASH_INIT(name, khuint8_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)

/*! @function
  @abstract     Instantiate a hash set containing unsigned 8-bit integer keys
  @param  name  Name of the hash table [symbol]
  @discussion   Set counterpart of KHASH_MAP_INIT_UINT8: value type char,
                fourth KHASH_INIT argument 0.
 */
#define KHASH_SET_INIT_UINT8(name) \
KHASH_INIT(name, khuint8_t, char, 0, kh_int_hash_func, kh_int_hash_equal)

typedef const char *kh_cstr_t;
/*! @function
@abstract Instantiate a hash map containing const char* keys
Expand Down Expand Up @@ -728,12 +743,20 @@ typedef const char *kh_cstr_t;

/* Concrete table instantiations.  Each integer key type gets a map with
 * size_t values and, directly after it, a set whose name carries a `_set`
 * suffix (e.g. int64 / int64_set).  The string table is instantiated only
 * in its map form here. */
KHASH_MAP_INIT_STR(str, size_t)
KHASH_MAP_INIT_INT(int32, size_t)
KHASH_SET_INIT_INT(int32_set)
KHASH_MAP_INIT_UINT(uint32, size_t)
KHASH_SET_INIT_UINT(uint32_set)
KHASH_MAP_INIT_INT64(int64, size_t)
KHASH_SET_INIT_INT64(int64_set)
KHASH_MAP_INIT_UINT64(uint64, size_t)
KHASH_SET_INIT_UINT64(uint64_set)
KHASH_MAP_INIT_INT16(int16, size_t)
KHASH_SET_INIT_INT16(int16_set)
KHASH_MAP_INIT_UINT16(uint16, size_t)
KHASH_SET_INIT_UINT16(uint16_set)
KHASH_MAP_INIT_INT8(int8, size_t)
KHASH_SET_INIT_INT8(int8_set)
KHASH_MAP_INIT_UINT8(uint8, size_t)
KHASH_SET_INIT_UINT8(uint8_set)

#endif /* __AC_KHASH_H */
20 changes: 20 additions & 0 deletions pandas/_libs/include/pandas/vendored/klib/khash_python.h
Original file line number Diff line number Diff line change
Expand Up @@ -130,13 +130,23 @@ static inline khuint32_t kh_float32_hash_func(float val) {
KHASH_INIT(name, khfloat64_t, khval_t, 1, kh_float64_hash_func, \
kh_floats_hash_equal)

/* Hash set keyed by khfloat64_t.  Uses kh_float64_hash_func for hashing and
 * kh_floats_hash_equal for key comparison; the value type is char and the
 * fourth KHASH_INIT argument is 0 (set form). */
#define KHASH_SET_INIT_FLOAT64(name) \
KHASH_INIT(name, khfloat64_t, char, 0, kh_float64_hash_func, \
kh_floats_hash_equal)

/* Instantiate both forms: the float64 map (size_t values) and float64_set. */
KHASH_MAP_INIT_FLOAT64(float64, size_t)
KHASH_SET_INIT_FLOAT64(float64_set)

/* Hash map keyed by khfloat32_t with a caller-chosen value type; hashes with
 * kh_float32_hash_func and compares keys with kh_floats_hash_equal. */
#define KHASH_MAP_INIT_FLOAT32(name, khval_t) \
KHASH_INIT(name, khfloat32_t, khval_t, 1, kh_float32_hash_func, \
kh_floats_hash_equal)

/* Set counterpart of KHASH_MAP_INIT_FLOAT32: same key type, hash and
 * equality functions, value type char, fourth KHASH_INIT argument 0. */
#define KHASH_SET_INIT_FLOAT32(name) \
KHASH_INIT(name, khfloat32_t, char, 0, kh_float32_hash_func, \
kh_floats_hash_equal)

/* Instantiate both forms: the float32 map (size_t values) and float32_set. */
KHASH_MAP_INIT_FLOAT32(float32, size_t)
KHASH_SET_INIT_FLOAT32(float32_set)

static inline khint32_t kh_complex128_hash_func(khcomplex128_t val) {
return kh_float64_hash_func(val.real) ^ kh_float64_hash_func(val.imag);
Expand All @@ -152,13 +162,23 @@ static inline khint32_t kh_complex64_hash_func(khcomplex64_t val) {
KHASH_INIT(name, khcomplex64_t, khval_t, 1, kh_complex64_hash_func, \
kh_complex_hash_equal)

/* Hash set keyed by khcomplex64_t.  Uses kh_complex64_hash_func for hashing
 * and kh_complex_hash_equal for key comparison; the value type is char and
 * the fourth KHASH_INIT argument is 0 (set form). */
#define KHASH_SET_INIT_COMPLEX64(name) \
KHASH_INIT(name, khcomplex64_t, char, 0, kh_complex64_hash_func, \
kh_complex_hash_equal)

/* Instantiate both forms: the complex64 map (size_t values) and
 * complex64_set. */
KHASH_MAP_INIT_COMPLEX64(complex64, size_t)
KHASH_SET_INIT_COMPLEX64(complex64_set)

/* Hash map keyed by khcomplex128_t with a caller-chosen value type; hashes
 * with kh_complex128_hash_func and compares keys with
 * kh_complex_hash_equal. */
#define KHASH_MAP_INIT_COMPLEX128(name, khval_t) \
KHASH_INIT(name, khcomplex128_t, khval_t, 1, kh_complex128_hash_func, \
kh_complex_hash_equal)

/* Set counterpart of KHASH_MAP_INIT_COMPLEX128: same key type, hash and
 * equality functions, value type char, fourth KHASH_INIT argument 0. */
#define KHASH_SET_INIT_COMPLEX128(name) \
KHASH_INIT(name, khcomplex128_t, char, 0, kh_complex128_hash_func, \
kh_complex_hash_equal)

/* Instantiate both forms: the complex128 map (size_t values) and
 * complex128_set. */
KHASH_MAP_INIT_COMPLEX128(complex128, size_t)
KHASH_SET_INIT_COMPLEX128(complex128_set)

/* Type-named aliases that forward unchanged to the generic kh_exist(h, k)
 * macro for the complex64 and complex128 tables. */
#define kh_exist_complex64(h, k) (kh_exist(h, k))
#define kh_exist_complex128(h, k) (kh_exist(h, k))
Expand Down