PERF: a MultiIndex is already categorized when hash_tuples builds its hashes

jreback · jreback · commit dc37685437f2 · 2017-02-10T10:54:18.000-05:00
diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py
@@ -193,6 +193,12 @@ def setup(self):
             [np.arange(1000),
              np.arange(1000)], names=['one', 'two'])
 
+        import string
+        self.mistring = MultiIndex.from_product(
+            [np.arange(1000),
+             np.arange(20), list(string.ascii_letters)],
+            names=['one', 'two', 'three'])
+
     def time_series_xs_mi_ix(self):
         self.s.ix[999]
 
@@ -213,6 +219,9 @@ def time_multiindex_get_indexer(self):
                       (0, 16), (0, 17), (0, 18),
                       (0, 19)], dtype=object))
 
+    def time_multiindex_string_get_loc(self):
+        self.mistring.get_loc((999, 19, 'Z'))
+
     def time_is_monotonic(self):
         self.miint.is_monotonic
 
diff --git a/pandas/tools/hashing.py b/pandas/tools/hashing.py
@@ -114,7 +114,7 @@ def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None,
     return h
 
 
-def hash_tuples(vals, encoding='utf8', hash_key=None, categorize=True):
+def hash_tuples(vals, encoding='utf8', hash_key=None):
     """
     Hash an MultiIndex / list-of-tuples efficiently
 
@@ -125,9 +125,6 @@ def hash_tuples(vals, encoding='utf8', hash_key=None, categorize=True):
     vals : MultiIndex, list-of-tuples, or single tuple
     encoding : string, default 'utf8'
     hash_key : string key to encode, default to _default_hash_key
-    categorize : bool, default True
-        Whether to first categorize object arrays before hashing. This is more
-        efficient when the array contains duplicate values.
 
     Returns
     -------
@@ -144,16 +141,18 @@ def hash_tuples(vals, encoding='utf8', hash_key=None, categorize=True):
     if not isinstance(vals, MultiIndex):
         vals = MultiIndex.from_tuples(vals)
 
-    # create a list-of-ndarrays
-    vals = [vals._get_level_values(level)
+    # create a list-of-Categoricals
+    vals = [Categorical(vals.labels[level],
+                        vals.levels[level],
+                        ordered=False,
+                        fastpath=True)
             for level in range(vals.nlevels)]
 
     # hash the list-of-ndarrays
-    hashes = (hash_array(l,
-                         encoding=encoding,
-                         hash_key=hash_key,
-                         categorize=categorize)
-              for l in vals)
+    hashes = (_hash_categorical(cat,
+                                encoding=encoding,
+                                hash_key=hash_key)
+              for cat in vals)
     h = _combine_hash_arrays(hashes, len(vals))
     if is_tuple:
         h = h[0]