From 84efa459d13e53be799836d5010ae3aad57e16fe Mon Sep 17 00:00:00 2001 From: Pietro Battiston Date: Tue, 14 Mar 2017 12:57:14 +0100 Subject: [PATCH 1/3] BUG: Fix iloc with duplicate labels closes #15686 --- pandas/core/dtypes/cast.py | 21 ++++++ pandas/core/indexing.py | 138 ++++++++++++++++++++++++++++--------- 2 files changed, 126 insertions(+), 33 deletions(-) mode change 100755 => 100644 pandas/core/indexing.py diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 723e4f70da4e9..537c929451d69 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1085,3 +1085,24 @@ def cast_scalar_to_array(shape, value, dtype=None): values.fill(fill_value) return values + + +def _maybe_convert_indexer(indexer, until): + """ + Convert slice, tuple, list or scalar "indexer" to 1-d array of indices, + using "until" as maximum for upwards open slices. + """ + + if is_scalar(indexer): + return np.array([indexer], dtype=int) + + if isinstance(indexer, np.ndarray): + if indexer.dtype == bool: + return np.where(indexer)[0] + return indexer + + if isinstance(indexer, slice): + stop = until if indexer.stop is None else indexer.stop + return np.arange(stop, dtype=int)[indexer] + + return np.array(indexer, dtype=int) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py old mode 100755 new mode 100644 index 109183827de4e..75888fe25fd74 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -16,6 +16,7 @@ _is_unorderable_exception, _ensure_platform_int) from pandas.core.dtypes.missing import isna, _infer_fill_value +from pandas.core.dtypes.cast import _maybe_convert_indexer from pandas.core.index import Index, MultiIndex @@ -81,6 +82,24 @@ def __getitem__(self, arg): IndexSlice = _IndexSlice() +class InfoCleaner: + """ + A context manager which temporarily removes labels on the "info" axis, + replacing them with a RangeIndex, and then puts them back in place. + Used to unambiguously index by position. + """ + def __init__(self, obj): + self._obj = obj + self._info_axis = self._obj._AXIS_NAMES[self._obj._info_axis_number] + + def __enter__(self): + self._old_col = getattr(self._obj, self._info_axis) + setattr(self._obj, self._info_axis, range(len(self._old_col))) + + def __exit__(self, *args): + setattr(self._obj, self._info_axis, self._old_col) + + class IndexingError(Exception): pass @@ -492,29 +511,10 @@ def _setitem_with_indexer(self, indexer, value): else: lplane_indexer = 0 - def setter(item, v): - s = self.obj[item] - pi = plane_indexer[0] if lplane_indexer == 1 else plane_indexer - - # perform the equivalent of a setitem on the info axis - # as we have a null slice or a slice with full bounds - # which means essentially reassign to the columns of a - # multi-dim object - # GH6149 (null slice), GH10408 (full bounds) - if (isinstance(pi, tuple) and - all(is_null_slice(idx) or - is_full_slice(idx, len(self.obj)) - for idx in pi)): - s = v - else: - # set the item, possibly having a dtype change - s._consolidate_inplace() - s = s.copy() - s._data = s._data.setitem(indexer=pi, value=v) - s._maybe_update_cacher(clear=True) - - # reset the sliced object if unique - self.obj[item] = s + setter_kwargs = {'items': labels, + 'indexer': indexer, + 'pi': plane_indexer[0] if lplane_indexer == 1 + else plane_indexer} def can_do_equal_len(): """ return True if we have an equal len settable """ @@ -542,7 +542,7 @@ def can_do_equal_len(): sub_indexer = list(indexer) multiindex_indexer = isinstance(labels, MultiIndex) - for item in labels: + for idx, item in enumerate(labels): if item in value: sub_indexer[info_axis] = item v = self._align_series( @@ -551,7 +551,7 @@ def can_do_equal_len(): else: v = np.nan - setter(item, v) + self._setter(idx, v, force_loc=True, **setter_kwargs) # we have an equal len ndarray/convertible to our labels elif np.array(value).ndim == 2: @@ -563,14 +563,15 @@ def can_do_equal_len(): raise ValueError('Must have equal len keys and value ' 'when setting with an ndarray') - for i, item in enumerate(labels): + for i in range(len(labels)): # setting with a list, recoerces - setter(item, value[:, i].tolist()) + self._setter(i, value[:, i].tolist(), force_loc=True, + **setter_kwargs) # we have an equal len list/ndarray elif can_do_equal_len(): - setter(labels[0], value) + self._setter(0, value, **setter_kwargs) # per label values else: @@ -579,13 +580,12 @@ def can_do_equal_len(): raise ValueError('Must have equal len keys and value ' 'when setting with an iterable') - for item, v in zip(labels, value): - setter(item, v) + for i, v in zip(range(len(labels)), value): + self._setter(i, v, **setter_kwargs) else: - # scalar - for item in labels: - setter(item, value) + for idx in range(len(labels)): + self._setter(idx, value, **setter_kwargs) else: if isinstance(indexer, tuple): @@ -619,6 +619,47 @@ def can_do_equal_len(): value=value) self.obj._maybe_update_cacher(clear=True) + def _setter(self, idx, v, items, pi, **kwargs): + """ + Set a single value on the underlying object. Label-based. + + Parameters + ---------- + idx : int + The index of the desired element inside "items" + + v : any + The value to assign to the specified location + + items: list + A list of labels + + pi: tuple or list-like + Components of original indexer preceding the info axis + """ + item = items[idx] + s = self.obj[item] + + # perform the equivalent of a setitem on the info axis + # as we have a null slice or a slice with full bounds + # which means essentially reassign to the columns of a + # multi-dim object + # GH6149 (null slice), GH10408 (full bounds) + if (isinstance(pi, tuple) and + all(is_null_slice(ix) or + is_full_slice(ix, len(self.obj)) + for ix in pi)): + s = v + else: + # set the item, possibly having a dtype change + s._consolidate_inplace() + s = s.copy() + s._data = s._data.setitem(indexer=pi, value=v) + s._maybe_update_cacher(clear=True) + + # reset the sliced object if unique + self.obj[item] = s + def _align_series(self, indexer, ser, multiindex_indexer=False): """ Parameters @@ -1772,6 +1813,37 @@ def _convert_to_indexer(self, obj, axis=0, is_setter=False): raise ValueError("Can only index by location with a [%s]" % self._valid_types) + def _setter(self, idx, v, indexer, force_loc=False, **kwargs): + """ + Set a single value on the underlying object. Position-based by default. + + Parameters + ---------- + idx : int + The index of the desired element + + v : any + The value to assign to the specified location + + indexer: list + The original indexer + + force_loc: bool + If True, use location-based indexing. + + Other keyword arguments are forwarded to _NDFrameIndexer._setter() + """ + + if force_loc: + super(_iLocIndexer, self)._setter(idx, v, **kwargs) + else: + info_axis = self.obj._info_axis_number + max_idx = len(self.obj._get_axis(info_axis)) + kwargs['items'] = _maybe_convert_indexer(indexer[info_axis], + max_idx) + with InfoCleaner(self.obj): + super(_iLocIndexer, self)._setter(idx, v, **kwargs) + class _ScalarAccessIndexer(_NDFrameIndexer): """ access scalars quickly """ From 4a5d18d7a5f1a0cd62972ea6cadea466212e5a25 Mon Sep 17 00:00:00 2001 From: Pietro Battiston Date: Tue, 14 Mar 2017 16:48:55 +0100 Subject: [PATCH 2/3] TST: fixed one tests and added two more for #15686 --- pandas/tests/indexing/test_iloc.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index 31fee303a41e2..64c12a472f71e 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -301,15 +301,33 @@ def test_iloc_setitem_dups(self): # iloc with a mask aligning from another iloc df1 = DataFrame([{'A': None, 'B': 1}, {'A': 2, 'B': 2}]) df2 = DataFrame([{'A': 3, 'B': 3}, {'A': 4, 'B': 4}]) - df = concat([df1, df2], axis=1) + df_orig = concat([df1, df2], axis=1) + df = df_orig.copy() + # GH 15686 + # iloc with mask, duplicated index and multiple blocks expected = df.fillna(3) - expected['A'] = expected['A'].astype('float64') + expected.iloc[:, 0] = expected.iloc[:, 0].astype('float64') inds = np.isnan(df.iloc[:, 0]) mask = inds[inds].index df.iloc[mask, 0] = df.iloc[mask, 2] tm.assert_frame_equal(df, expected) + # GH 15686 + # iloc with scalar, duplicated index and multiple blocks + df = df_orig.copy() + expected = df.fillna(15) + df.iloc[0, 0] = 15 + tm.assert_frame_equal(df, expected) + + # GH 15686 + # iloc with repeated value, duplicated index and multiple blocks + df = df_orig.copy() + expected = concat([DataFrame([{'A': 15, 'B': 1}, {'A': 15, 'B': 2}]), + df2], axis=1) + df.iloc[:, 0] = 15 + tm.assert_frame_equal(df, expected) + # del a dup column across blocks expected = DataFrame({0: [1, 2], 1: [3, 4]}) expected.columns = ['B', 'B'] From b8ee051d5d77496688eb19f6673c900ec4cfedc4 Mon Sep 17 00:00:00 2001 From: Pietro Battiston Date: Tue, 14 Mar 2017 17:12:08 +0100 Subject: [PATCH 3/3] TST: added xfailing test on iloc with dups --- pandas/tests/indexing/test_iloc.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index 64c12a472f71e..e7810b9035407 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -345,6 +345,20 @@ def test_iloc_setitem_dups(self): drop=True) tm.assert_frame_equal(df, expected) + @pytest.mark.xfail(reason="BlockManager.setitem() broken") + def test_iloc_setitem_dups_slice(self): + # GH 12991 + df = DataFrame(index=['a', 'b', 'c'], + columns=['d', 'e', 'd']).fillna(0) + expected = df.iloc[:, 2].copy(deep=True) + + first_col = df.iloc[:, 0] + first_col['a'] = 3 + + result = df.iloc[:, 2] + + tm.assert_series_equal(result, expected) + def test_iloc_getitem_frame(self): df = DataFrame(np.random.randn(10, 4), index=lrange(0, 20, 2), columns=lrange(0, 8, 2))