Merge pull request #3563 from jreback/GH3561

jreback · jreback · commit 9ae47f949985 · 2013-05-14T14:44:57.000-07:00
BUG: (GH3561) non-unique indexers with a list-like now return in the same order as the passed values
diff --git a/RELEASE.rst b/RELEASE.rst
@@ -91,6 +91,7 @@ pandas 0.11.1
       (removed warning) (GH2786_), and fix (GH3230_)
     - Fix to_csv to handle non-unique columns (GH3495_)
     - Duplicate indexes with getitem will return items in the correct order (GH3455_, GH3457_)
+      and handle missing elements like unique indices (GH3561_)
     - Duplicate indexes with and empty DataFrame.from_records will return a correct frame (GH3562_)
   - Fixed bug in groupby with empty series referencing a variable before assignment. (GH3510_)
   - Fixed bug in mixed-frame assignment with aligned series (GH3492_)
@@ -148,6 +149,7 @@ pandas 0.11.1
 .. _GH3552: https://github.com/pydata/pandas/issues/3552
 .. _GH3562: https://github.com/pydata/pandas/issues/3562
 .. _GH3586: https://github.com/pydata/pandas/issues/3586
+.. _GH3561: https://github.com/pydata/pandas/issues/3561
 .. _GH3493: https://github.com/pydata/pandas/issues/3493
 .. _GH3579: https://github.com/pydata/pandas/issues/3579
 .. _GH3593: https://github.com/pydata/pandas/issues/3593
diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst
@@ -1368,6 +1368,9 @@ incompatible the new object internals are with the ``Index`` functions):
   - ``slice_locs``: returns the "range" to slice between two labels
   - ``get_indexer``: Computes the indexing vector for reindexing / data
     alignment purposes. See the source / docstrings for more on this
+  - ``get_indexer_non_unique``: Computes the indexing vector for reindexing / data
+    alignment purposes when the index is non-unique. See the source / docstrings 
+    for more on this
   - ``reindex``: Does any pre-conversion of the input index then calls
     ``get_indexer``
   - ``union``, ``intersection``: computes the union or intersection of two
diff --git a/pandas/core/index.py b/pandas/core/index.py
@@ -859,6 +859,25 @@ def get_indexer(self, target, method=None, limit=None):
 
         return com._ensure_platform_int(indexer)
 
+    def get_indexer_non_unique(self, target, **kwargs):
+        """ return an indexer suitable for taking from a non unique index
+            return the labels in the same order as the target, and
+            return a missing indexer into the target (missing are marked as -1
+            in the indexer); target must be an iterable """
+        target = _ensure_index(target)
+        pself, ptarget = self._possibly_promote(target)
+        if pself is not self or ptarget is not target:
+            return pself.get_indexer_non_unique(ptarget)
+
+        if self.is_all_dates:
+            self = Index(self.asi8)
+            tgt_values = target.asi8
+        else:
+            tgt_values = target.values
+
+        indexer, missing = self._engine.get_indexer_non_unique(tgt_values)
+        return Index(indexer), missing
+
     def _possibly_promote(self, other):
         # A hack, but it works
         from pandas.tseries.index import DatetimeIndex
diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
@@ -458,8 +458,23 @@ def _reindex(keys, level=None):
             if labels.is_unique:
                 return _reindex(keyarr, level=level)
             else:
-                mask = labels.isin(keyarr)
-                return self.obj.take(mask.nonzero()[0], axis=axis, convert=False)
+                indexer, missing = labels.get_indexer_non_unique(keyarr)
+                check = indexer != -1
+                result = self.obj.take(indexer[check], axis=axis, convert=False)
+
+                # need to merge the result labels and the missing labels
+                if len(missing):
+                    l = np.arange(len(indexer))
+
+                    missing_labels = keyarr.take(missing)
+                    missing_labels_indexer = l[~check]
+                    cur_labels = result._get_axis(axis).values
+                    cur_labels_indexer = l[check]
+                    new_labels = lib.combine_from_indexers(cur_labels, cur_labels_indexer,
+                                                           missing_labels, missing_labels_indexer)
+                    result = result.reindex_axis(new_labels,axis=axis)
+
+                return result
 
     def _convert_to_indexer(self, obj, axis=0):
         """
@@ -569,20 +584,8 @@ def _convert_to_indexer(self, obj, axis=0):
 
                     # non-unique (dups)
                     else:
-                        indexer = []
-                        check   = np.arange(len(labels))
-                        lvalues = labels.values
-                        for x in objarr:
-                            # ugh
-                            to_or = lib.map_infer(lvalues, x.__eq__)
-                            if not to_or.any():
-                                raise KeyError('%s not in index' % str(x))
-
-                            # add the indicies (as we want to take)
-                            indexer.extend(check[to_or])
-
-                        indexer = Index(indexer)
-
+                        indexer, missing = labels.get_indexer_non_unique(objarr)
+                        check = indexer
 
                 mask = check == -1
                 if mask.any():
diff --git a/pandas/index.pyx b/pandas/index.pyx
@@ -267,8 +267,46 @@ cdef class IndexEngine:
         self._ensure_mapping_populated()
         return self.mapping.lookup(values)
 
+    def get_indexer_non_unique(self, targets):
+        """ return an indexer suitable for taking from a non unique index
+            return the labels in the same order as the target
+            and a missing indexer into the targets (which correspond
+            to the -1 indices in the results) """
 
+        cdef:
+            ndarray values
+            ndarray[int64_t] result, missing
+            object v, val
+            int count = 0, count_missing = 0
+            Py_ssize_t i, j, n, found
+
+        self._ensure_mapping_populated()
+        values = self._get_index_values()
+        n = len(values)
+        n_t = len(targets)
+        result  = np.empty(n+n_t, dtype=np.int64)
+        missing = np.empty(n_t, dtype=np.int64)
+
+        for i in range(n_t):
+            val = util.get_value_at(targets, i)
+            found = 0
+
+            for j in range(n):
+                v = util.get_value_at(values, j)
+
+                if v == val:
+                   result[count] = j
+                   count += 1
+                   found = 1
+
+            # value not found
+            if found == 0:
+                result[count] = -1
+                count += 1
+                missing[count_missing] = i
+                count_missing += 1
 
+        return result[0:count], missing[0:count_missing]
 
 cdef class Int64Engine(IndexEngine):
 
diff --git a/pandas/lib.pyx b/pandas/lib.pyx
@@ -416,6 +416,25 @@ def dicts_to_array(list dicts, list columns):
 
     return result
 
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def combine_from_indexers(ndarray a, ndarray[int64_t] a_indexer,
+                          ndarray b, ndarray[int64_t] b_indexer):
+    cdef:
+        Py_ssize_t i, n_a, n_b
+        ndarray result
+
+    n_a = len(a)
+    n_b = len(b)
+    result = np.empty(n_a+n_b,dtype=object)
+
+    for i in range(n_a):
+        result[a_indexer[i]] = a[i]
+    for i in range(n_b):
+        result[b_indexer[i]] = b[i]
+
+    return result
+
 
 def fast_zip(list ndarrays):
     '''
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
@@ -4668,8 +4668,29 @@ def _check_df(df,cols=None):
             with ensure_clean() as path:
                 df.to_csv(path,cols = cols,chunksize=chunksize)
                 rs_c = pd.read_csv(path,index_col=0)
-                rs_c.columns = df.columns
-                assert_frame_equal(df,rs_c,check_names=False)
+
+                # we wrote them in a different order
+                # so compare them in that order
+                if cols is not None:
+
+                    if df.columns.is_unique:
+                        rs_c.columns = cols
+                    else:
+                        indexer, missing = df.columns.get_indexer_non_unique(cols)
+                        rs_c.columns = df.columns.take(indexer)
+
+                    for c in cols:
+                       obj_df = df[c]
+                       obj_rs = rs_c[c]
+                       if isinstance(obj_df,Series):
+                           assert_series_equal(obj_df,obj_rs)
+                       else:
+                           assert_frame_equal(obj_df,obj_rs,check_names=False) 
+
+                # wrote in the same order
+                else:
+                    rs_c.columns = df.columns
+                    assert_frame_equal(df,rs_c,check_names=False)
 
         chunksize=5
         N = int(chunksize*2.5)
diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py
@@ -784,6 +784,28 @@ def test_dups_fancy_indexing(self):
 
         assert_frame_equal(df,result)
 
+        # GH 3561, dups not in selected order
+        ind = ['A', 'A', 'B', 'C']
+        df = DataFrame({'test':range(len(ind))}, index=ind)
+        rows = ['C', 'B']
+        res = df.ix[rows]
+        self.assert_(rows == list(res.index))
+
+        res = df.ix[Index(rows)]
+        self.assert_(Index(rows).equals(res.index))
+
+        rows = ['C','B','E']
+        res = df.ix[rows]
+        self.assert_(rows == list(res.index))
+
+        # inconsistent returns for unique/duplicate indices when values are missing
+        df = DataFrame(randn(4,3),index=list('ABCD'))
+        expected = df.ix[['E']]
+
+        dfnu = DataFrame(randn(5,3),index=list('AABCD'))
+        result = dfnu.ix[['E']]
+        assert_frame_equal(result, expected)
+
     def test_indexing_mixed_frame_bug(self):
 
         # GH3492