Skip to content

Commit 9ae47f9

Browse files
committed
Merge pull request #3563 from jreback/GH3561
BUG: (GH3561) non-unique indexers with a list-like now return in the same order as the passed values
2 parents 6d2c57f + b84d649 commit 9ae47f9

File tree

8 files changed

+145
-18
lines changed

8 files changed

+145
-18
lines changed

RELEASE.rst

+2
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,7 @@ pandas 0.11.1
9191
(removed warning) (GH2786_), and fix (GH3230_)
9292
- Fix to_csv to handle non-unique columns (GH3495_)
9393
- Duplicate indexes with getitem will return items in the correct order (GH3455_, GH3457_)
94+
and handle missing elements like unique indices (GH3561_)
9495
- Duplicate indexes with an empty DataFrame.from_records will return a correct frame (GH3562_)
9596
- Fixed bug in groupby with empty series referencing a variable before assignment. (GH3510_)
9697
- Fixed bug in mixed-frame assignment with aligned series (GH3492_)
@@ -148,6 +149,7 @@ pandas 0.11.1
148149
.. _GH3552: https://github.com/pydata/pandas/issues/3552
149150
.. _GH3562: https://github.com/pydata/pandas/issues/3562
150151
.. _GH3586: https://github.com/pydata/pandas/issues/3586
152+
.. _GH3561: https://github.com/pydata/pandas/issues/3561
151153
.. _GH3493: https://github.com/pydata/pandas/issues/3493
152154
.. _GH3579: https://github.com/pydata/pandas/issues/3579
153155
.. _GH3593: https://github.com/pydata/pandas/issues/3593

doc/source/indexing.rst

+3
Original file line numberDiff line numberDiff line change
@@ -1368,6 +1368,9 @@ incompatible the new object internals are with the ``Index`` functions):
13681368
- ``slice_locs``: returns the "range" to slice between two labels
13691369
- ``get_indexer``: Computes the indexing vector for reindexing / data
13701370
alignment purposes. See the source / docstrings for more on this
1371+
- ``get_indexer_non_unique``: Computes the indexing vector for reindexing / data
1372+
alignment purposes when the index is non-unique. See the source / docstrings
1373+
for more on this
13711374
- ``reindex``: Does any pre-conversion of the input index then calls
13721375
``get_indexer``
13731376
- ``union``, ``intersection``: computes the union or intersection of two

pandas/core/index.py

+19
Original file line numberDiff line numberDiff line change
@@ -859,6 +859,25 @@ def get_indexer(self, target, method=None, limit=None):
859859

860860
return com._ensure_platform_int(indexer)
861861

862+
def get_indexer_non_unique(self, target, **kwargs):
863+
""" return an indexer suitable for taking from a non unique index
864+
return the labels in the same order as the target, and
865+
return a missing indexer into the target (missing are marked as -1
866+
in the indexer); target must be an iterable """
867+
target = _ensure_index(target)
868+
pself, ptarget = self._possibly_promote(target)
869+
if pself is not self or ptarget is not target:
870+
return pself.get_indexer_non_unique(ptarget)
871+
872+
if self.is_all_dates:
873+
self = Index(self.asi8)
874+
tgt_values = target.asi8
875+
else:
876+
tgt_values = target.values
877+
878+
indexer, missing = self._engine.get_indexer_non_unique(tgt_values)
879+
return Index(indexer), missing
880+
862881
def _possibly_promote(self, other):
863882
# A hack, but it works
864883
from pandas.tseries.index import DatetimeIndex

pandas/core/indexing.py

+19-16
Original file line numberDiff line numberDiff line change
@@ -458,8 +458,23 @@ def _reindex(keys, level=None):
458458
if labels.is_unique:
459459
return _reindex(keyarr, level=level)
460460
else:
461-
mask = labels.isin(keyarr)
462-
return self.obj.take(mask.nonzero()[0], axis=axis, convert=False)
461+
indexer, missing = labels.get_indexer_non_unique(keyarr)
462+
check = indexer != -1
463+
result = self.obj.take(indexer[check], axis=axis, convert=False)
464+
465+
# need to merge the result labels and the missing labels
466+
if len(missing):
467+
l = np.arange(len(indexer))
468+
469+
missing_labels = keyarr.take(missing)
470+
missing_labels_indexer = l[~check]
471+
cur_labels = result._get_axis(axis).values
472+
cur_labels_indexer = l[check]
473+
new_labels = lib.combine_from_indexers(cur_labels, cur_labels_indexer,
474+
missing_labels, missing_labels_indexer)
475+
result = result.reindex_axis(new_labels,axis=axis)
476+
477+
return result
463478

464479
def _convert_to_indexer(self, obj, axis=0):
465480
"""
@@ -569,20 +584,8 @@ def _convert_to_indexer(self, obj, axis=0):
569584

570585
# non-unique (dups)
571586
else:
572-
indexer = []
573-
check = np.arange(len(labels))
574-
lvalues = labels.values
575-
for x in objarr:
576-
# ugh
577-
to_or = lib.map_infer(lvalues, x.__eq__)
578-
if not to_or.any():
579-
raise KeyError('%s not in index' % str(x))
580-
581-
# add the indicies (as we want to take)
582-
indexer.extend(check[to_or])
583-
584-
indexer = Index(indexer)
585-
587+
indexer, missing = labels.get_indexer_non_unique(objarr)
588+
check = indexer
586589

587590
mask = check == -1
588591
if mask.any():

pandas/index.pyx

+38
Original file line numberDiff line numberDiff line change
@@ -267,8 +267,46 @@ cdef class IndexEngine:
267267
self._ensure_mapping_populated()
268268
return self.mapping.lookup(values)
269269

270+
def get_indexer_non_unique(self, targets):
271+
""" return an indexer suitable for taking from a non unique index
272+
return the labels in the same order as the target
273+
and a missing indexer into the targets (which correspond
274+
to the -1 indices in the results) """
270275

276+
cdef:
277+
ndarray values
278+
ndarray[int64_t] result, missing
279+
object v, val
280+
int count = 0, count_missing = 0
281+
Py_ssize_t i, j, n, found
282+
283+
self._ensure_mapping_populated()
284+
values = self._get_index_values()
285+
n = len(values)
286+
n_t = len(targets)
287+
result = np.empty(n+n_t, dtype=np.int64)
288+
missing = np.empty(n_t, dtype=np.int64)
289+
290+
for i in range(n_t):
291+
val = util.get_value_at(targets, i)
292+
found = 0
293+
294+
for j in range(n):
295+
v = util.get_value_at(values, j)
296+
297+
if v == val:
298+
result[count] = j
299+
count += 1
300+
found = 1
301+
302+
# value not found
303+
if found == 0:
304+
result[count] = -1
305+
count += 1
306+
missing[count_missing] = i
307+
count_missing += 1
271308

309+
return result[0:count], missing[0:count_missing]
272310

273311
cdef class Int64Engine(IndexEngine):
274312

pandas/lib.pyx

+19
Original file line numberDiff line numberDiff line change
@@ -416,6 +416,25 @@ def dicts_to_array(list dicts, list columns):
416416

417417
return result
418418

419+
@cython.wraparound(False)
420+
@cython.boundscheck(False)
421+
def combine_from_indexers(ndarray a, ndarray[int64_t] a_indexer,
422+
ndarray b, ndarray[int64_t] b_indexer):
423+
cdef:
424+
Py_ssize_t i, n_a, n_b
425+
ndarray result
426+
427+
n_a = len(a)
428+
n_b = len(b)
429+
result = np.empty(n_a+n_b,dtype=object)
430+
431+
for i in range(n_a):
432+
result[a_indexer[i]] = a[i]
433+
for i in range(n_b):
434+
result[b_indexer[i]] = b[i]
435+
436+
return result
437+
419438

420439
def fast_zip(list ndarrays):
421440
'''

pandas/tests/test_frame.py

+23-2
Original file line numberDiff line numberDiff line change
@@ -4668,8 +4668,29 @@ def _check_df(df,cols=None):
46684668
with ensure_clean() as path:
46694669
df.to_csv(path,cols = cols,chunksize=chunksize)
46704670
rs_c = pd.read_csv(path,index_col=0)
4671-
rs_c.columns = df.columns
4672-
assert_frame_equal(df,rs_c,check_names=False)
4671+
4672+
# we wrote them in a different order
4673+
# so compare them in that order
4674+
if cols is not None:
4675+
4676+
if df.columns.is_unique:
4677+
rs_c.columns = cols
4678+
else:
4679+
indexer, missing = df.columns.get_indexer_non_unique(cols)
4680+
rs_c.columns = df.columns.take(indexer)
4681+
4682+
for c in cols:
4683+
obj_df = df[c]
4684+
obj_rs = rs_c[c]
4685+
if isinstance(obj_df,Series):
4686+
assert_series_equal(obj_df,obj_rs)
4687+
else:
4688+
assert_frame_equal(obj_df,obj_rs,check_names=False)
4689+
4690+
# wrote in the same order
4691+
else:
4692+
rs_c.columns = df.columns
4693+
assert_frame_equal(df,rs_c,check_names=False)
46734694

46744695
chunksize=5
46754696
N = int(chunksize*2.5)

pandas/tests/test_indexing.py

+22
Original file line numberDiff line numberDiff line change
@@ -784,6 +784,28 @@ def test_dups_fancy_indexing(self):
784784

785785
assert_frame_equal(df,result)
786786

787+
# GH 3561, dups not in selected order
788+
ind = ['A', 'A', 'B', 'C']
789+
df = DataFrame({'test':range(len(ind))}, index=ind)
790+
rows = ['C', 'B']
791+
res = df.ix[rows]
792+
self.assert_(rows == list(res.index))
793+
794+
res = df.ix[Index(rows)]
795+
self.assert_(Index(rows).equals(res.index))
796+
797+
rows = ['C','B','E']
798+
res = df.ix[rows]
799+
self.assert_(rows == list(res.index))
800+
801+
# inconsistent returns for unique/duplicate indices when values are missing
802+
df = DataFrame(randn(4,3),index=list('ABCD'))
803+
expected = df.ix[['E']]
804+
805+
dfnu = DataFrame(randn(5,3),index=list('AABCD'))
806+
result = dfnu.ix[['E']]
807+
assert_frame_equal(result, expected)
808+
787809
def test_indexing_mixed_frame_bug(self):
788810

789811
# GH3492

0 commit comments

Comments
 (0)