From c84b7e9e3986cc0dd0c3050a60e5722cf066a386 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 13 Apr 2020 13:42:42 -0700 Subject: [PATCH] CLN: remove tuple kludge in CategoricalDtype.__hash__ --- pandas/core/dtypes/dtypes.py | 40 +++++++++++++++--------------------- pandas/core/util/hashing.py | 2 -- 2 files changed, 16 insertions(+), 26 deletions(-) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 8fe2b3c60d6d0..0224895774c8e 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -482,31 +482,23 @@ def _hash_categories(categories, ordered: Ordered = True) -> int: from pandas.core.util.hashing import ( hash_array, _combine_hash_arrays, - hash_tuples, ) - from pandas.core.dtypes.common import is_datetime64tz_dtype, DT64NS_DTYPE - - if len(categories) and isinstance(categories[0], tuple): - # assumes if any individual category is a tuple, then all our. ATM - # I don't really want to support just some of the categories being - # tuples. - categories = list(categories) # breaks if a np.array of categories - cat_array = hash_tuples(categories) - else: - if categories.dtype == "O": - if len({type(x) for x in categories}) != 1: - # TODO: hash_array doesn't handle mixed types. It casts - # everything to a str first, which means we treat - # {'1', '2'} the same as {'1', 2} - # find a better solution - hashed = hash((tuple(categories), ordered)) - return hashed - - if is_datetime64tz_dtype(categories.dtype): - # Avoid future warning. - categories = categories.astype(DT64NS_DTYPE) - - cat_array = hash_array(np.asarray(categories), categorize=False) + + if categories.dtype == "O": + if len({type(x) for x in categories}) != 1: + # TODO: hash_array doesn't handle mixed types. It casts + # everything to a str first, which means we treat + # {'1', '2'} the same as {'1', 2} + # find a better solution + hashed = hash((tuple(categories), ordered)) + return hashed + + if isinstance(categories.dtype, DatetimeTZDtype): + # Avoid future warning. + categories = categories.astype("datetime64[ns]") + + cat_array = hash_array(np.asarray(categories), categorize=False) + if ordered: cat_array = np.vstack( [cat_array, np.arange(len(cat_array), dtype=cat_array.dtype)] diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py index d9c8611c94cdb..e9d2bc2aa22b0 100644 --- a/pandas/core/util/hashing.py +++ b/pandas/core/util/hashing.py @@ -289,8 +289,6 @@ def hash_array( # First, turn whatever array this is into unsigned 64-bit ints, if we can # manage it. - elif isinstance(dtype, np.bool): - vals = vals.astype("u8") elif issubclass(dtype.type, (np.datetime64, np.timedelta64)): vals = vals.view("i8").astype("u8", copy=False) elif issubclass(dtype.type, np.number) and dtype.itemsize <= 8: