Skip to content

REF: remove single-tuple special case for Categorical.__hash__ #33678

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 8 commits into from
Closed
40 changes: 16 additions & 24 deletions pandas/core/dtypes/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -482,31 +482,23 @@ def _hash_categories(categories, ordered: Ordered = True) -> int:
from pandas.core.util.hashing import (
hash_array,
_combine_hash_arrays,
hash_tuples,
)
from pandas.core.dtypes.common import is_datetime64tz_dtype, DT64NS_DTYPE

if len(categories) and isinstance(categories[0], tuple):
# assumes if any individual category is a tuple, then all are. ATM
# I don't really want to support just some of the categories being
# tuples.
categories = list(categories) # breaks if a np.array of categories
cat_array = hash_tuples(categories)
else:
if categories.dtype == "O":
if len({type(x) for x in categories}) != 1:
# TODO: hash_array doesn't handle mixed types. It casts
# everything to a str first, which means we treat
# {'1', '2'} the same as {'1', 2}
# find a better solution
hashed = hash((tuple(categories), ordered))
return hashed

if is_datetime64tz_dtype(categories.dtype):
# Avoid future warning.
categories = categories.astype(DT64NS_DTYPE)

cat_array = hash_array(np.asarray(categories), categorize=False)

if categories.dtype == "O":
if len({type(x) for x in categories}) != 1:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we'd like to avoid calling type on every element of categories. I'm guessing that will be prohibitively slow for large categories.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point. Can you comment on why something like hash(str(np.asarray(categories))) wasn't used?

The imports from core.util.hashing are a hassle dependency-structure-wise, so if there is a simplification available it'd be helpful, but not urgent.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure offhand.

# TODO: hash_array doesn't handle mixed types. It casts
# everything to a str first, which means we treat
# {'1', '2'} the same as {'1', 2}
# find a better solution
hashed = hash((tuple(categories), ordered))
return hashed

if isinstance(categories.dtype, DatetimeTZDtype):
# Avoid future warning.
categories = categories.astype("datetime64[ns]")

cat_array = hash_array(np.asarray(categories), categorize=False)

if ordered:
cat_array = np.vstack(
[cat_array, np.arange(len(cat_array), dtype=cat_array.dtype)]
Expand Down
2 changes: 0 additions & 2 deletions pandas/core/util/hashing.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,8 +264,6 @@ def hash_array(

# First, turn whatever array this is into unsigned 64-bit ints, if we can
# manage it.
elif isinstance(dtype, np.bool):
vals = vals.astype("u8")
elif issubclass(dtype.type, (np.datetime64, np.timedelta64)):
vals = vals.view("i8").astype("u8", copy=False)
elif issubclass(dtype.type, np.number) and dtype.itemsize <= 8:
Expand Down