diff --git a/pandas/_libs/hashtable.pxd b/pandas/_libs/hashtable.pxd index 7b630c264753f..cc9341665b8db 100644 --- a/pandas/_libs/hashtable.pxd +++ b/pandas/_libs/hashtable.pxd @@ -1,12 +1,16 @@ from numpy cimport intp_t, ndarray from pandas._libs.khash cimport ( + complex64_t, + complex128_t, float32_t, float64_t, int8_t, int16_t, int32_t, int64_t, + kh_complex64_t, + kh_complex128_t, kh_float32_t, kh_float64_t, kh_int8_t, @@ -19,6 +23,8 @@ from pandas._libs.khash cimport ( kh_uint16_t, kh_uint32_t, kh_uint64_t, + khcomplex64_t, + khcomplex128_t, uint8_t, uint16_t, uint32_t, @@ -90,6 +96,18 @@ cdef class Float32HashTable(HashTable): cpdef get_item(self, float32_t val) cpdef set_item(self, float32_t key, Py_ssize_t val) +cdef class Complex64HashTable(HashTable): + cdef kh_complex64_t *table + + cpdef get_item(self, complex64_t val) + cpdef set_item(self, complex64_t key, Py_ssize_t val) + +cdef class Complex128HashTable(HashTable): + cdef kh_complex128_t *table + + cpdef get_item(self, complex128_t val) + cpdef set_item(self, complex128_t key, Py_ssize_t val) + cdef class PyObjectHashTable(HashTable): cdef kh_pymap_t *table diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index 963fddd4d5af9..2c7780e0d95fd 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -13,7 +13,17 @@ cnp.import_array() from pandas._libs cimport util -from pandas._libs.khash cimport KHASH_TRACE_DOMAIN, kh_str_t, khiter_t +from pandas._libs.khash cimport ( + KHASH_TRACE_DOMAIN, + are_equivalent_float32_t, + are_equivalent_float64_t, + are_equivalent_khcomplex64_t, + are_equivalent_khcomplex128_t, + kh_str_t, + khcomplex64_t, + khcomplex128_t, + khiter_t, +) from pandas._libs.missing cimport checknull diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index b582ed1533a8e..276f162545399 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -8,7 +8,73 
@@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in {{py: # name -cimported_types = ['float32', +complex_types = ['complex64', + 'complex128'] +}} + +{{for name in complex_types}} +cdef kh{{name}}_t to_kh{{name}}_t({{name}}_t val) nogil: + cdef kh{{name}}_t res + res.real = val.real + res.imag = val.imag + return res + + +cdef {{name}}_t to_{{name}}(kh{{name}}_t val) nogil: + cdef {{name}}_t res + res.real = val.real + res.imag = val.imag + return res + +{{endfor}} + + +{{py: + + +# name +c_types = ['khcomplex128_t', + 'khcomplex64_t', + 'float64_t', + 'float32_t', + 'int64_t', + 'int32_t', + 'int16_t', + 'int8_t', + 'uint64_t', + 'uint32_t', + 'uint16_t', + 'uint8_t'] +}} + +{{for c_type in c_types}} + +cdef bint is_nan_{{c_type}}({{c_type}} val) nogil: + {{if c_type in {'khcomplex128_t', 'khcomplex64_t'} }} + return val.real != val.real or val.imag != val.imag + {{elif c_type in {'float64_t', 'float32_t'} }} + return val != val + {{else}} + return False + {{endif}} + + +{{if c_type in {'khcomplex128_t', 'khcomplex64_t', 'float64_t', 'float32_t'} }} +# are_equivalent_{{c_type}} is cimported via khash.pxd +{{else}} +cdef bint are_equivalent_{{c_type}}({{c_type}} val1, {{c_type}} val2) nogil: + return val1 == val2 +{{endif}} + +{{endfor}} + + +{{py: + +# name +cimported_types = ['complex64', + 'complex128', + 'float32', 'float64', 'int8', 'int16', @@ -32,6 +98,7 @@ from pandas._libs.khash cimport ( kh_put_{{name}}, kh_resize_{{name}}, ) + {{endfor}} # ---------------------------------------------------------------------- @@ -48,7 +115,9 @@ from pandas._libs.missing cimport C_NA # but is included for completeness (rather ObjectVector is used # for uniques in hashtables) -dtypes = [('Float64', 'float64', 'float64_t'), +dtypes = [('Complex128', 'complex128', 'khcomplex128_t'), + ('Complex64', 'complex64', 'khcomplex64_t'), + ('Float64', 'float64', 'float64_t'), ('Float32', 'float32', 'float32_t'), ('Int64', 'int64', 'int64_t'), ('Int32', 'int32', 
'int32_t'), @@ -94,6 +163,8 @@ ctypedef fused vector_data: UInt8VectorData Float64VectorData Float32VectorData + Complex128VectorData + Complex64VectorData StringVectorData cdef inline bint needs_resize(vector_data *data) nogil: @@ -106,7 +177,9 @@ cdef inline bint needs_resize(vector_data *data) nogil: {{py: # name, dtype, c_type -dtypes = [('Float64', 'float64', 'float64_t'), +dtypes = [('Complex128', 'complex128', 'khcomplex128_t'), + ('Complex64', 'complex64', 'khcomplex64_t'), + ('Float64', 'float64', 'float64_t'), ('UInt64', 'uint64', 'uint64_t'), ('Int64', 'int64', 'int64_t'), ('Float32', 'float32', 'float32_t'), @@ -303,22 +376,24 @@ cdef class HashTable: {{py: -# name, dtype, float_group -dtypes = [('Float64', 'float64', True), - ('UInt64', 'uint64', False), - ('Int64', 'int64', False), - ('Float32', 'float32', True), - ('UInt32', 'uint32', False), - ('Int32', 'int32', False), - ('UInt16', 'uint16', False), - ('Int16', 'int16', False), - ('UInt8', 'uint8', False), - ('Int8', 'int8', False)] +# name, dtype, c_type, to_c_type +dtypes = [('Complex128', 'complex128', 'khcomplex128_t', 'to_khcomplex128_t'), + ('Float64', 'float64', 'float64_t', ''), + ('UInt64', 'uint64', 'uint64_t', ''), + ('Int64', 'int64', 'int64_t', ''), + ('Complex64', 'complex64', 'khcomplex64_t', 'to_khcomplex64_t'), + ('Float32', 'float32', 'float32_t', ''), + ('UInt32', 'uint32', 'uint32_t', ''), + ('Int32', 'int32', 'int32_t', ''), + ('UInt16', 'uint16', 'uint16_t', ''), + ('Int16', 'int16', 'int16_t', ''), + ('UInt8', 'uint8', 'uint8_t', ''), + ('Int8', 'int8', 'int8_t', '')] }} -{{for name, dtype, float_group in dtypes}} +{{for name, dtype, c_type, to_c_type in dtypes}} cdef class {{name}}HashTable(HashTable): @@ -339,7 +414,9 @@ cdef class {{name}}HashTable(HashTable): def __contains__(self, object key): cdef: khiter_t k - k = kh_get_{{dtype}}(self.table, key) + {{c_type}} ckey + ckey = {{to_c_type}}(key) + k = kh_get_{{dtype}}(self.table, ckey) return k != self.table.n_buckets def 
sizeof(self, deep=False):
@@ -353,7 +430,9 @@ cdef class {{name}}HashTable(HashTable):
     cpdef get_item(self, {{dtype}}_t val):
         cdef:
             khiter_t k
-        k = kh_get_{{dtype}}(self.table, val)
+            {{c_type}} cval
+        cval = {{to_c_type}}(val)
+        k = kh_get_{{dtype}}(self.table, cval)
         if k != self.table.n_buckets:
             return self.table.vals[k]
         else:
@@ -363,9 +442,9 @@ cdef class {{name}}HashTable(HashTable):
         cdef:
             khiter_t k
             int ret = 0
-
-        k = kh_put_{{dtype}}(self.table, key, &ret)
-        self.table.keys[k] = key
+            {{c_type}} ckey
+        ckey = {{to_c_type}}(key)
+        k = kh_put_{{dtype}}(self.table, ckey, &ret)
         if kh_exist_{{dtype}}(self.table, k):
             self.table.vals[k] = val
         else:
@@ -376,12 +455,12 @@ cdef class {{name}}HashTable(HashTable):
         cdef:
             Py_ssize_t i, n = len(values)
             int ret = 0
-            {{dtype}}_t key
+            {{c_type}} key
             khiter_t k

         with nogil:
             for i in range(n):
-                key = keys[i]
+                key = {{to_c_type}}(keys[i])
                 k = kh_put_{{dtype}}(self.table, key, &ret)
                 self.table.vals[k] = values[i]
@@ -390,12 +469,12 @@ cdef class {{name}}HashTable(HashTable):
         cdef:
             Py_ssize_t i, n = len(values)
             int ret = 0
-            {{dtype}}_t val
+            {{c_type}} val
             khiter_t k

         with nogil:
             for i in range(n):
-                val = {{to_c_type}}(values[i])
+                val = {{to_c_type}}(values[i])
                 k = kh_put_{{dtype}}(self.table, val, &ret)
                 self.table.vals[k] = i
@@ -404,13 +483,13 @@ cdef class {{name}}HashTable(HashTable):
         cdef:
             Py_ssize_t i, n = len(values)
             int ret = 0
-            {{dtype}}_t val
+            {{c_type}} val
             khiter_t k
             intp_t[:] locs = np.empty(n, dtype=np.intp)

         with nogil:
             for i in range(n):
-                val = values[i]
+                val = {{to_c_type}}(values[i])
                 k = kh_get_{{dtype}}(self.table, val)
                 if k != self.table.n_buckets:
                     locs[i] = self.table.vals[k]
@@ -466,7 +545,7 @@ cdef class {{name}}HashTable(HashTable):
             Py_ssize_t i, idx, count = count_prior, n = len(values)
             int64_t[:] labels
             int ret = 0
-            {{dtype}}_t val, na_value2
+            {{c_type}} val, na_value2
             khiter_t k
             {{name}}VectorData *ud
             bint use_na_value, use_mask
@@ -487,23 +566,21 @@ cdef class {{name}}HashTable(HashTable):
            # We use None,
to make it optional, which requires `object` type # for the parameter. To please the compiler, we use na_value2, # which is only used if it's *specified*. - na_value2 = <{{dtype}}_t>na_value + na_value2 = {{to_c_type}}(na_value) else: - na_value2 = 0 + na_value2 = {{to_c_type}}(0) with nogil: for i in range(n): - val = values[i] + val = {{to_c_type}}(values[i]) if ignore_na and use_mask: if mask_values[i]: labels[i] = na_sentinel continue elif ignore_na and ( - {{if not name.lower().startswith(("uint", "int"))}} - val != val or - {{endif}} - (use_na_value and val == na_value2) + is_nan_{{c_type}}(val) or + (use_na_value and are_equivalent_{{c_type}}(val, na_value2)) ): # if missing values do not count as unique values (i.e. if # ignore_na is True), skip the hashtable entry for them, @@ -606,6 +683,7 @@ cdef class {{name}}HashTable(HashTable): ignore_na=True, return_inverse=True) return labels + {{if dtype == 'int64'}} @cython.boundscheck(False) def get_labels_groupby(self, const {{dtype}}_t[:] values): cdef: @@ -613,7 +691,7 @@ cdef class {{name}}HashTable(HashTable): intp_t[:] labels Py_ssize_t idx, count = 0 int ret = 0 - {{dtype}}_t val + {{c_type}} val khiter_t k {{name}}Vector uniques = {{name}}Vector() {{name}}VectorData *ud @@ -623,14 +701,12 @@ cdef class {{name}}HashTable(HashTable): with nogil: for i in range(n): - val = values[i] + val = {{to_c_type}}(values[i]) # specific for groupby - {{if dtype != 'uint64'}} if val < 0: labels[i] = -1 continue - {{endif}} k = kh_get_{{dtype}}(self.table, val) if k != self.table.n_buckets: @@ -650,6 +726,7 @@ cdef class {{name}}HashTable(HashTable): arr_uniques = uniques.to_array() return np.asarray(labels), arr_uniques + {{endif}} {{endfor}} @@ -698,7 +775,6 @@ cdef class StringHashTable(HashTable): v = get_c_string(key) k = kh_put_str(self.table, v, &ret) - self.table.keys[k] = v if kh_exist_str(self.table, k): self.table.vals[k] = val else: @@ -1022,7 +1098,6 @@ cdef class PyObjectHashTable(HashTable): hash(key) k = 
kh_put_pymap(self.table, key, &ret) - # self.table.keys[k] = key if kh_exist_pymap(self.table, k): self.table.vals[k] = val else: diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index 7c5afa4ff6b27..f8f541235dcb7 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -6,22 +6,26 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in {{py: -# dtype, ttype, c_type -dtypes = [('float64', 'float64', 'float64_t'), - ('float32', 'float32', 'float32_t'), - ('uint64', 'uint64', 'uint64_t'), - ('uint32', 'uint32', 'uint32_t'), - ('uint16', 'uint16', 'uint16_t'), - ('uint8', 'uint8', 'uint8_t'), - ('object', 'pymap', 'object'), - ('int64', 'int64', 'int64_t'), - ('int32', 'int32', 'int32_t'), - ('int16', 'int16', 'int16_t'), - ('int8', 'int8', 'int8_t')] +# dtype, ttype, c_type, to_c_type, to_dtype +dtypes = [('complex128', 'complex128', 'khcomplex128_t', + 'to_khcomplex128_t', 'to_complex128'), + ('complex64', 'complex64', 'khcomplex64_t', + 'to_khcomplex64_t', 'to_complex64'), + ('float64', 'float64', 'float64_t', '', ''), + ('float32', 'float32', 'float32_t', '', ''), + ('uint64', 'uint64', 'uint64_t', '', ''), + ('uint32', 'uint32', 'uint32_t', '', ''), + ('uint16', 'uint16', 'uint16_t', '', ''), + ('uint8', 'uint8', 'uint8_t', '', ''), + ('object', 'pymap', 'object', '', ''), + ('int64', 'int64', 'int64_t', '', ''), + ('int32', 'int32', 'int32_t', '', ''), + ('int16', 'int16', 'int16_t', '', ''), + ('int8', 'int8', 'int8_t', '', '')] }} -{{for dtype, ttype, c_type in dtypes}} +{{for dtype, ttype, c_type, to_c_type, to_dtype in dtypes}} @cython.wraparound(False) @@ -30,7 +34,7 @@ dtypes = [('float64', 'float64', 'float64_t'), cdef build_count_table_{{dtype}}(ndarray[{{dtype}}] values, kh_{{ttype}}_t *table, bint dropna): {{else}} -cdef build_count_table_{{dtype}}({{dtype}}_t[:] values, +cdef build_count_table_{{dtype}}(const {{dtype}}_t[:] values, 
kh_{{ttype}}_t *table, bint dropna): {{endif}} cdef: @@ -46,7 +50,6 @@ cdef build_count_table_{{dtype}}({{dtype}}_t[:] values, for i in range(n): val = values[i] - if not checknull(val) or not dropna: k = kh_get_{{ttype}}(table, val) if k != table.n_buckets: @@ -59,13 +62,9 @@ cdef build_count_table_{{dtype}}({{dtype}}_t[:] values, kh_resize_{{ttype}}(table, n) for i in range(n): - val = values[i] + val = {{to_c_type}}(values[i]) - {{if dtype == 'float64' or dtype == 'float32'}} - if val == val or not dropna: - {{else}} - if True: - {{endif}} + if not is_nan_{{c_type}}(val) or not dropna: k = kh_get_{{ttype}}(table, val) if k != table.n_buckets: table.vals[k] += 1 @@ -80,7 +79,7 @@ cdef build_count_table_{{dtype}}({{dtype}}_t[:] values, {{if dtype == 'object'}} cpdef value_count_{{dtype}}(ndarray[{{dtype}}] values, bint dropna): {{else}} -cpdef value_count_{{dtype}}({{c_type}}[:] values, bint dropna): +cpdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna): {{endif}} cdef: Py_ssize_t i = 0 @@ -114,7 +113,7 @@ cpdef value_count_{{dtype}}({{c_type}}[:] values, bint dropna): with nogil: for k in range(table.n_buckets): if kh_exist_{{ttype}}(table, k): - result_keys[i] = table.keys[k] + result_keys[i] = {{to_dtype}}(table.keys[k]) result_counts[i] = table.vals[k] i += 1 {{endif}} @@ -133,12 +132,12 @@ cpdef value_count_{{dtype}}({{c_type}}[:] values, bint dropna): {{if dtype == 'object'}} def duplicated_{{dtype}}(ndarray[{{dtype}}] values, object keep='first'): {{else}} -def duplicated_{{dtype}}(const {{c_type}}[:] values, object keep='first'): +def duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first'): {{endif}} cdef: int ret = 0 {{if dtype != 'object'}} - {{dtype}}_t value + {{c_type}} value {{endif}} Py_ssize_t i, n = len(values) khiter_t k @@ -160,7 +159,8 @@ def duplicated_{{dtype}}(const {{c_type}}[:] values, object keep='first'): with nogil: for i in range(n - 1, -1, -1): # equivalent: range(n)[::-1], which cython doesn't like in 
nogil - kh_put_{{ttype}}(table, values[i], &ret) + value = {{to_c_type}}(values[i]) + kh_put_{{ttype}}(table, value, &ret) out[i] = ret == 0 {{endif}} elif keep == 'first': @@ -171,7 +171,8 @@ def duplicated_{{dtype}}(const {{c_type}}[:] values, object keep='first'): {{else}} with nogil: for i in range(n): - kh_put_{{ttype}}(table, values[i], &ret) + value = {{to_c_type}}(values[i]) + kh_put_{{ttype}}(table, value, &ret) out[i] = ret == 0 {{endif}} else: @@ -184,20 +185,18 @@ def duplicated_{{dtype}}(const {{c_type}}[:] values, object keep='first'): out[i] = 1 else: k = kh_put_{{ttype}}(table, value, &ret) - table.keys[k] = value table.vals[k] = i out[i] = 0 {{else}} with nogil: for i in range(n): - value = values[i] + value = {{to_c_type}}(values[i]) k = kh_get_{{ttype}}(table, value) if k != table.n_buckets: out[table.vals[k]] = 1 out[i] = 1 else: k = kh_put_{{ttype}}(table, value, &ret) - table.keys[k] = value table.vals[k] = i out[i] = 0 {{endif}} @@ -215,7 +214,7 @@ def duplicated_{{dtype}}(const {{c_type}}[:] values, object keep='first'): {{if dtype == 'object'}} def ismember_{{dtype}}(ndarray[{{c_type}}] arr, ndarray[{{c_type}}] values): {{else}} -def ismember_{{dtype}}(const {{c_type}}[:] arr, const {{c_type}}[:] values): +def ismember_{{dtype}}(const {{dtype}}_t[:] arr, const {{dtype}}_t[:] values): {{endif}} """ Return boolean of values in arr on an @@ -248,7 +247,8 @@ def ismember_{{dtype}}(const {{c_type}}[:] arr, const {{c_type}}[:] values): {{else}} with nogil: for i in range(n): - kh_put_{{ttype}}(table, values[i], &ret) + val = {{to_c_type}}(values[i]) + kh_put_{{ttype}}(table, val, &ret) {{endif}} # test membership @@ -263,7 +263,7 @@ def ismember_{{dtype}}(const {{c_type}}[:] arr, const {{c_type}}[:] values): {{else}} with nogil: for i in range(n): - val = arr[i] + val = {{to_c_type}}(arr[i]) k = kh_get_{{ttype}}(table, val) result[i] = (k != table.n_buckets) {{endif}} @@ -281,7 +281,9 @@ def ismember_{{dtype}}(const {{c_type}}[:] arr, const 
{{c_type}}[:] values): {{py: # dtype, ctype, table_type, npy_dtype -dtypes = [('float64', 'float64_t', 'float64', 'float64'), +dtypes = [('complex128', 'khcomplex128_t', 'complex128', 'complex128'), + ('complex64', 'khcomplex64_t', 'complex64', 'complex64'), + ('float64', 'float64_t', 'float64', 'float64'), ('float32', 'float32_t', 'float32', 'float32'), ('int64', 'int64_t', 'int64', 'int64'), ('int32', 'int32_t', 'int32', 'int32'), @@ -307,7 +309,7 @@ def mode_{{dtype}}(ndarray[{{ctype}}] values, bint dropna): {{else}} -def mode_{{dtype}}({{ctype}}[:] values, bint dropna): +def mode_{{dtype}}(const {{dtype}}_t[:] values, bint dropna): {{endif}} cdef: int count, max_count = 1 diff --git a/pandas/_libs/khash.pxd b/pandas/_libs/khash.pxd index 0d0c5ae058b21..53b94c5a73b83 100644 --- a/pandas/_libs/khash.pxd +++ b/pandas/_libs/khash.pxd @@ -1,5 +1,7 @@ from cpython.object cimport PyObject from numpy cimport ( + complex64_t, + complex128_t, float32_t, float64_t, int8_t, @@ -19,6 +21,26 @@ cdef extern from "khash_python.h": ctypedef uint32_t khint_t ctypedef khint_t khiter_t + ctypedef struct khcomplex128_t: + double real + double imag + + bint are_equivalent_khcomplex128_t \ + "kh_complex_hash_equal" (khcomplex128_t a, khcomplex128_t b) nogil + + ctypedef struct khcomplex64_t: + float real + float imag + + bint are_equivalent_khcomplex64_t \ + "kh_complex_hash_equal" (khcomplex64_t a, khcomplex64_t b) nogil + + bint are_equivalent_float64_t \ + "kh_floats_hash_equal" (float64_t a, float64_t b) nogil + + bint are_equivalent_float32_t \ + "kh_floats_hash_equal" (float32_t a, float32_t b) nogil + ctypedef struct kh_pymap_t: khint_t n_buckets, size, n_occupied, upper_bound uint32_t *flags diff --git a/pandas/_libs/khash_for_primitive_helper.pxi.in b/pandas/_libs/khash_for_primitive_helper.pxi.in index db8d3e0b19417..dc7b11adb957b 100644 --- a/pandas/_libs/khash_for_primitive_helper.pxi.in +++ b/pandas/_libs/khash_for_primitive_helper.pxi.in @@ -17,6 +17,8 @@ 
primitive_types = [('int64', 'int64_t'),
                    ('uint16', 'uint16_t'),
                    ('int8', 'int8_t'),
                    ('uint8', 'uint8_t'),
+                   ('complex64', 'khcomplex64_t'),
+                   ('complex128', 'khcomplex128_t'),
                    ]
 }}

diff --git a/pandas/_libs/src/klib/khash_python.h b/pandas/_libs/src/klib/khash_python.h
index 8e4e61b4f3077..d5eb45ec231b8 100644
--- a/pandas/_libs/src/klib/khash_python.h
+++ b/pandas/_libs/src/klib/khash_python.h
@@ -1,6 +1,14 @@
 #include <string.h>
 #include <Python.h>
+
+// use numpy's definitions for complex
+#include <numpy/npy_common.h>
+typedef npy_complex64 khcomplex64_t;
+typedef npy_complex128 khcomplex128_t;
+
+
+
 // khash should report usage to tracemalloc
 #if PY_VERSION_HEX >= 0x03060000
 #include <pymem.h>
@@ -128,6 +136,32 @@ KHASH_MAP_INIT_FLOAT64(float64, size_t)

 KHASH_MAP_INIT_FLOAT32(float32, size_t)

+khint32_t PANDAS_INLINE kh_complex128_hash_func(khcomplex128_t val){
+    return kh_float64_hash_func(val.real)^kh_float64_hash_func(val.imag);
+}
+khint32_t PANDAS_INLINE kh_complex64_hash_func(khcomplex64_t val){
+    return kh_float32_hash_func(val.real)^kh_float32_hash_func(val.imag);
+}
+
+#define kh_complex_hash_equal(a, b) \
+  (kh_floats_hash_equal(a.real, b.real) && kh_floats_hash_equal(a.imag, b.imag))
+
+
+#define KHASH_MAP_INIT_COMPLEX64(name, khval_t) \
+  KHASH_INIT(name, khcomplex64_t, khval_t, 1, kh_complex64_hash_func, kh_complex_hash_equal)
+
+KHASH_MAP_INIT_COMPLEX64(complex64, size_t)
+
+
+#define KHASH_MAP_INIT_COMPLEX128(name, khval_t) \
+  KHASH_INIT(name, khcomplex128_t, khval_t, 1, kh_complex128_hash_func, kh_complex_hash_equal)
+
+KHASH_MAP_INIT_COMPLEX128(complex128, size_t)
+
+
+#define kh_exist_complex64(h, k) (kh_exist(h, k))
+#define kh_exist_complex128(h, k) (kh_exist(h, k))
+
 int PANDAS_INLINE pyobject_cmp(PyObject* a, PyObject* b) {
     int result = PyObject_RichCompareBool(a, b, Py_EQ);
diff --git a/pandas/tests/libs/test_hashtable.py b/pandas/tests/libs/test_hashtable.py
index a6fd421911d3e..894b126cc4269 100644
--- a/pandas/tests/libs/test_hashtable.py
+++ b/pandas/tests/libs/test_hashtable.py
@@
-30,9 +30,11 @@ def get_allocated_khash_memory(): "table_type, dtype", [ (ht.PyObjectHashTable, np.object_), + (ht.Complex128HashTable, np.complex128), (ht.Int64HashTable, np.int64), (ht.UInt64HashTable, np.uint64), (ht.Float64HashTable, np.float64), + (ht.Complex64HashTable, np.complex64), (ht.Int32HashTable, np.int32), (ht.UInt32HashTable, np.uint32), (ht.Float32HashTable, np.float32), @@ -73,29 +75,33 @@ def test_get_set_contains_len(self, table_type, dtype): table.get_item(index + 2) assert str(index + 2) in str(excinfo.value) - def test_map(self, table_type, dtype): + def test_map(self, table_type, dtype, writable): # PyObjectHashTable has no map-method if table_type != ht.PyObjectHashTable: N = 77 table = table_type() keys = np.arange(N).astype(dtype) vals = np.arange(N).astype(np.int64) + N + keys.flags.writeable = writable + vals.flags.writeable = writable table.map(keys, vals) for i in range(N): assert table.get_item(keys[i]) == i + N - def test_map_locations(self, table_type, dtype): + def test_map_locations(self, table_type, dtype, writable): N = 8 table = table_type() keys = (np.arange(N) + N).astype(dtype) + keys.flags.writeable = writable table.map_locations(keys) for i in range(N): assert table.get_item(keys[i]) == i - def test_lookup(self, table_type, dtype): + def test_lookup(self, table_type, dtype, writable): N = 3 table = table_type() keys = (np.arange(N) + N).astype(dtype) + keys.flags.writeable = writable table.map_locations(keys) result = table.lookup(keys) expected = np.arange(N) @@ -113,7 +119,7 @@ def test_lookup_wrong(self, table_type, dtype): result = table.lookup(wrong_keys) assert np.all(result == -1) - def test_unique(self, table_type, dtype): + def test_unique(self, table_type, dtype, writable): if dtype in (np.int8, np.uint8): N = 88 else: @@ -121,6 +127,7 @@ def test_unique(self, table_type, dtype): table = table_type() expected = (np.arange(N) + N).astype(dtype) keys = np.repeat(expected, 5) + keys.flags.writeable = writable 
unique = table.unique(keys) tm.assert_numpy_array_equal(unique, expected) @@ -149,6 +156,17 @@ def test_tracemalloc_for_empty(self, table_type, dtype): assert get_allocated_khash_memory() == 0 +def test_get_labels_groupby_for_Int64(writable): + table = ht.Int64HashTable() + vals = np.array([1, 2, -1, 2, 1, -1], dtype=np.int64) + vals.flags.writeable = writable + arr, unique = table.get_labels_groupby(vals) + expected_arr = np.array([0, 1, -1, 1, 0, -1], dtype=np.int64) + expected_unique = np.array([1, 2], dtype=np.int64) + tm.assert_numpy_array_equal(arr.astype(np.int64), expected_arr) + tm.assert_numpy_array_equal(unique, expected_unique) + + def test_tracemalloc_works_for_StringHashTable(): N = 1000 keys = np.arange(N).astype(np.compat.unicode).astype(np.object_) @@ -177,6 +195,8 @@ def test_tracemalloc_for_empty_StringHashTable(): [ (ht.Float64HashTable, np.float64), (ht.Float32HashTable, np.float32), + (ht.Complex128HashTable, np.complex128), + (ht.Complex64HashTable, np.complex64), ], ) class TestHashTableWithNans: @@ -228,9 +248,11 @@ def get_ht_function(fun_name, type_suffix): "dtype, type_suffix", [ (np.object_, "object"), + (np.complex128, "complex128"), (np.int64, "int64"), (np.uint64, "uint64"), (np.float64, "float64"), + (np.complex64, "complex64"), (np.int32, "int32"), (np.uint32, "uint32"), (np.float32, "float32"), @@ -241,29 +263,33 @@ def get_ht_function(fun_name, type_suffix): ], ) class TestHelpFunctions: - def test_value_count(self, dtype, type_suffix): + def test_value_count(self, dtype, type_suffix, writable): N = 43 value_count = get_ht_function("value_count", type_suffix) expected = (np.arange(N) + N).astype(dtype) values = np.repeat(expected, 5) + values.flags.writeable = writable keys, counts = value_count(values, False) tm.assert_numpy_array_equal(np.sort(keys), expected) assert np.all(counts == 5) - def test_duplicated_first(self, dtype, type_suffix): + def test_duplicated_first(self, dtype, type_suffix, writable): N = 100 duplicated = 
get_ht_function("duplicated", type_suffix) values = np.repeat(np.arange(N).astype(dtype), 5) + values.flags.writeable = writable result = duplicated(values) expected = np.ones_like(values, dtype=np.bool_) expected[::5] = False tm.assert_numpy_array_equal(result, expected) - def test_ismember_yes(self, dtype, type_suffix): + def test_ismember_yes(self, dtype, type_suffix, writable): N = 127 ismember = get_ht_function("ismember", type_suffix) arr = np.arange(N).astype(dtype) values = np.arange(N).astype(dtype) + arr.flags.writeable = writable + values.flags.writeable = writable result = ismember(arr, values) expected = np.ones_like(values, dtype=np.bool_) tm.assert_numpy_array_equal(result, expected) @@ -277,7 +303,7 @@ def test_ismember_no(self, dtype, type_suffix): expected = np.zeros_like(values, dtype=np.bool_) tm.assert_numpy_array_equal(result, expected) - def test_mode(self, dtype, type_suffix): + def test_mode(self, dtype, type_suffix, writable): if dtype in (np.int8, np.uint8): N = 53 else: @@ -285,6 +311,7 @@ def test_mode(self, dtype, type_suffix): mode = get_ht_function("mode", type_suffix) values = np.repeat(np.arange(N).astype(dtype), 5) values[0] = 42 + values.flags.writeable = writable result = mode(values, False) assert result == 42 @@ -294,6 +321,8 @@ def test_mode(self, dtype, type_suffix): [ (np.float64, "float64"), (np.float32, "float32"), + (np.complex128, "complex128"), + (np.complex64, "complex64"), ], ) class TestHelpFunctionsWithNans: