@@ -114,7 +114,7 @@ def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None,
114
114
return h
115
115
116
116
117
- def hash_tuples (vals , encoding = 'utf8' , hash_key = None , categorize = True ):
117
+ def hash_tuples (vals , encoding = 'utf8' , hash_key = None ):
118
118
"""
119
119
Hash a MultiIndex / list-of-tuples efficiently
120
120
@@ -125,9 +125,6 @@ def hash_tuples(vals, encoding='utf8', hash_key=None, categorize=True):
125
125
vals : MultiIndex, list-of-tuples, or single tuple
126
126
encoding : string, default 'utf8'
127
127
hash_key : string key to encode, default to _default_hash_key
128
- categorize : bool, default True
129
- Whether to first categorize object arrays before hashing. This is more
130
- efficient when the array contains duplicate values.
131
128
132
129
Returns
133
130
-------
@@ -144,16 +141,18 @@ def hash_tuples(vals, encoding='utf8', hash_key=None, categorize=True):
144
141
if not isinstance (vals , MultiIndex ):
145
142
vals = MultiIndex .from_tuples (vals )
146
143
147
- # create a list-of-ndarrays
148
- vals = [vals ._get_level_values (level )
144
+ # create a list-of-Categoricals
145
+ vals = [Categorical (vals .labels [level ],
146
+ vals .levels [level ],
147
+ ordered = False ,
148
+ fastpath = True )
149
149
for level in range (vals .nlevels )]
150
150
151
151
# hash each level's values
152
- hashes = (hash_array (l ,
153
- encoding = encoding ,
154
- hash_key = hash_key ,
155
- categorize = categorize )
156
- for l in vals )
152
+ hashes = (_hash_categorical (cat ,
153
+ encoding = encoding ,
154
+ hash_key = hash_key )
155
+ for cat in vals )
157
156
h = _combine_hash_arrays (hashes , len (vals ))
158
157
if is_tuple :
159
158
h = h [0 ]
0 commit comments