Skip to content

Commit dc37685

Browse files
committed
PERF: a MI is already categorized when we are constructing the hash_tuples
1 parent b20a474 commit dc37685

File tree

2 files changed

+19
-11
lines changed

2 files changed

+19
-11
lines changed

asv_bench/benchmarks/indexing.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,12 @@ def setup(self):
193193
[np.arange(1000),
194194
np.arange(1000)], names=['one', 'two'])
195195

196+
import string
197+
self.mistring = MultiIndex.from_product(
198+
[np.arange(1000),
199+
np.arange(20), list(string.ascii_letters)],
200+
names=['one', 'two', 'three'])
201+
196202
def time_series_xs_mi_ix(self):
197203
self.s.ix[999]
198204

@@ -213,6 +219,9 @@ def time_multiindex_get_indexer(self):
213219
(0, 16), (0, 17), (0, 18),
214220
(0, 19)], dtype=object))
215221

222+
def time_multiindex_string_get_loc(self):
223+
self.mistring.get_loc((999, 19, 'Z'))
224+
216225
def time_is_monotonic(self):
217226
self.miint.is_monotonic
218227

pandas/tools/hashing.py

Lines changed: 10 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,7 @@ def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None,
114114
return h
115115

116116

117-
def hash_tuples(vals, encoding='utf8', hash_key=None, categorize=True):
117+
def hash_tuples(vals, encoding='utf8', hash_key=None):
118118
"""
119119
Hash a MultiIndex / list-of-tuples efficiently
120120
@@ -125,9 +125,6 @@ def hash_tuples(vals, encoding='utf8', hash_key=None, categorize=True):
125125
vals : MultiIndex, list-of-tuples, or single tuple
126126
encoding : string, default 'utf8'
127127
hash_key : string key to encode, defaults to _default_hash_key
128-
categorize : bool, default True
129-
Whether to first categorize object arrays before hashing. This is more
130-
efficient when the array contains duplicate values.
131128
132129
Returns
133130
-------
@@ -144,16 +141,18 @@ def hash_tuples(vals, encoding='utf8', hash_key=None, categorize=True):
144141
if not isinstance(vals, MultiIndex):
145142
vals = MultiIndex.from_tuples(vals)
146143

147-
# create a list-of-ndarrays
148-
vals = [vals._get_level_values(level)
144+
# create a list-of-Categoricals
145+
vals = [Categorical(vals.labels[level],
146+
vals.levels[level],
147+
ordered=False,
148+
fastpath=True)
149149
for level in range(vals.nlevels)]
150150

151151
# hash the list-of-Categoricals
152-
hashes = (hash_array(l,
153-
encoding=encoding,
154-
hash_key=hash_key,
155-
categorize=categorize)
156-
for l in vals)
152+
hashes = (_hash_categorical(cat,
153+
encoding=encoding,
154+
hash_key=hash_key)
155+
for cat in vals)
157156
h = _combine_hash_arrays(hashes, len(vals))
158157
if is_tuple:
159158
h = h[0]

0 commit comments

Comments
 (0)