From 511ee38047e92e55281afb35569a4f947526ec39 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 11 Jun 2018 15:07:34 +0200 Subject: [PATCH 1/5] API: re-allow duplicate index level names --- pandas/core/indexes/multi.py | 12 --------- pandas/tests/frame/test_alter_axes.py | 15 +++++++---- pandas/tests/groupby/test_categorical.py | 10 ++++---- pandas/tests/indexes/test_multi.py | 32 ++++++++++++------------ pandas/tests/reshape/test_pivot.py | 10 ++++++-- 5 files changed, 39 insertions(+), 40 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 75b6be96feb78..d7ff83ca18f44 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -672,30 +672,18 @@ def _set_names(self, names, level=None, validate=True): if level is None: level = range(self.nlevels) - used = {} else: level = [self._get_level_number(l) for l in level] - used = {self.levels[l].name: l - for l in set(range(self.nlevels)) - set(level)} # set the name for l, name in zip(level, names): if name is not None: - # GH 20527 # All items in 'names' need to be hashable: if not is_hashable(name): raise TypeError('{}.name must be a hashable type' .format(self.__class__.__name__)) - - if name in used: - raise ValueError( - 'Duplicated level name: "{}", assigned to ' - 'level {}, is already used for level ' - '{}.'.format(name, l, used[name])) - self.levels[l].rename(name, inplace=True) - used[name] = l names = property(fset=_set_names, fget=_get_names, doc="Names of levels in MultiIndex") diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index 164d6746edec0..5a78a42b02ac7 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -130,19 +130,24 @@ def test_set_index2(self): result = df.set_index(df.C) assert result.index.name == 'C' - @pytest.mark.parametrize('level', ['a', pd.Series(range(3), name='a')]) + @pytest.mark.parametrize( + 'level', ['a', pd.Series(range(0, 8, 2), name='a')]) def test_set_index_duplicate_names(self, level): - # GH18872 + # GH18872 - GH19029 df = pd.DataFrame(np.arange(8).reshape(4, 2), columns=['a', 'b']) # Pass an existing level name: df.index.name = 'a' - pytest.raises(ValueError, df.set_index, level, append=True) - pytest.raises(ValueError, df.set_index, [level], append=True) + expected = pd.MultiIndex.from_tuples([(0, 0), (1, 2), (2, 4), (3, 6)], + names=['a', 'a']) + result = df.set_index(level, append=True) + tm.assert_index_equal(result.index, expected) + result = df.set_index([level], append=True) + tm.assert_index_equal(result.index, expected) # Pass twice the same level name: df.index.name = 'c' - pytest.raises(ValueError, df.set_index, [level, level]) + # pytest.raises(ValueError, df.set_index, [level, level]) def test_set_index_nonuniq(self): df = DataFrame({'A': ['foo', 'foo', 'foo', 'bar', 'bar'], diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index e0793b8e1bd64..8f022739b9dd1 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -553,14 +553,14 @@ def test_as_index(): columns=['cat', 'A', 'B']) tm.assert_frame_equal(result, expected) - # another not in-axis grouper - s = Series(['a', 'b', 'b'], name='cat2') + # another not in-axis grouper (conflicting names in index) + s = Series(['a', 'b', 'b'], name='cat') result = df.groupby(['cat', s], as_index=False, observed=True).sum() tm.assert_frame_equal(result, expected) - # GH18872: conflicting names in desired index - with pytest.raises(ValueError): - df.groupby(['cat', s.rename('cat')], observed=True).sum() + # # GH18872: conflicting names in desired index + # with pytest.raises(ValueError): + # df.groupby(['cat', s.rename('cat')], observed=True).sum() # is original index dropped? group_columns = ['cat', 'A'] diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index 0ab3447909d9b..e0401a2bad10d 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -656,22 +656,22 @@ def test_constructor_nonhashable_names(self): # With .set_names() tm.assert_raises_regex(TypeError, message, mi.set_names, names=renamed) - @pytest.mark.parametrize('names', [['a', 'b', 'a'], ['1', '1', '2'], - ['1', 'a', '1']]) - def test_duplicate_level_names(self, names): - # GH18872 - pytest.raises(ValueError, pd.MultiIndex.from_product, - [[0, 1]] * 3, names=names) - - # With .rename() - mi = pd.MultiIndex.from_product([[0, 1]] * 3) - tm.assert_raises_regex(ValueError, "Duplicated level name:", - mi.rename, names) - - # With .rename(., level=) - mi.rename(names[0], level=1, inplace=True) - tm.assert_raises_regex(ValueError, "Duplicated level name:", - mi.rename, names[:2], level=[0, 2]) + # @pytest.mark.parametrize('names', [['a', 'b', 'a'], ['1', '1', '2'], + # ['1', 'a', '1']]) + # def test_duplicate_level_names(self, names): + # # GH18872 + # pytest.raises(ValueError, pd.MultiIndex.from_product, + # [[0, 1]] * 3, names=names) + + # # With .rename() + # mi = pd.MultiIndex.from_product([[0, 1]] * 3) + # tm.assert_raises_regex(ValueError, "Duplicated level name:", + # mi.rename, names) + + # # With .rename(., level=) + # mi.rename(names[0], level=1, inplace=True) + # tm.assert_raises_regex(ValueError, "Duplicated level name:", + # mi.rename, names[:2], level=[0, 2]) def assert_multiindex_copied(self, copy, original): # Levels should be (at least, shallow copied) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 3ec60d50f2792..b71954163f9e1 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -1729,9 +1729,15 @@ def test_crosstab_with_numpy_size(self): tm.assert_frame_equal(result, expected) def test_crosstab_dup_index_names(self): - # GH 13279, GH 18872 + # GH 13279 s = pd.Series(range(3), name='foo') - pytest.raises(ValueError, pd.crosstab, s, s) + + result = pd.crosstab(s, s) + expected_index = pd.Index(range(3), name='foo') + expected = pd.DataFrame(np.eye(3, dtype=np.int64), + index=expected_index, + columns=expected_index) + tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("names", [['a', ('b', 'c')], [('a', 'b'), 'c']]) From 4872a7c6f6ec3f7b84884ad9714365ebcbc74fa8 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 27 Jun 2018 19:11:42 +0200 Subject: [PATCH 2/5] re-enable tests --- pandas/tests/frame/test_alter_axes.py | 22 +++++++++++-- pandas/tests/frame/test_reshape.py | 10 ++++++ pandas/tests/groupby/test_categorical.py | 4 --- pandas/tests/indexes/test_multi.py | 39 ++++++++++++++---------- pandas/tests/io/test_pytables.py | 6 ++++ 5 files changed, 58 insertions(+), 23 deletions(-) diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index 5a78a42b02ac7..21961906c39bb 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -145,9 +145,12 @@ def test_set_index_duplicate_names(self, level): result = df.set_index([level], append=True) tm.assert_index_equal(result.index, expected) - # Pass twice the same level name: - df.index.name = 'c' - # pytest.raises(ValueError, df.set_index, [level, level]) + # Pass twice the same level name (only works with passing actual data) + if isinstance(level, pd.Series): + result = df.set_index([level, level]) + expected = pd.MultiIndex.from_tuples( + [(0, 0), (2, 2), (4, 4), (6, 6)], names=['a', 'a']) + tm.assert_index_equal(result.index, expected) def test_set_index_nonuniq(self): df = DataFrame({'A': ['foo', 'foo', 'foo', 'bar', 'bar'], @@ -622,6 +625,19 @@ def test_reorder_levels(self): index=e_idx) assert_frame_equal(result, expected) + result = df.reorder_levels([0, 0, 0]) + e_idx = MultiIndex(levels=[['bar'], ['bar'], ['bar']], + labels=[[0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0]], + names=['L0', 'L0', 'L0']) + expected = DataFrame({'A': np.arange(6), 'B': np.arange(6)}, + index=e_idx) + assert_frame_equal(result, expected) + + result = df.reorder_levels(['L0', 'L0', 'L0']) + assert_frame_equal(result, expected) + def test_reset_index(self): stacked = self.frame.stack()[::2] stacked = DataFrame({'foo': stacked, 'bar': stacked}) diff --git a/pandas/tests/frame/test_reshape.py b/pandas/tests/frame/test_reshape.py index d05321abefca6..ebf6c5e37b916 100644 --- a/pandas/tests/frame/test_reshape.py +++ b/pandas/tests/frame/test_reshape.py @@ -560,6 +560,16 @@ def test_unstack_dtypes(self): assert left.shape == (3, 2) tm.assert_frame_equal(left, right) + def test_unstack_non_unique_index_names(self): + idx = MultiIndex.from_tuples([('a', 'b'), ('c', 'd')], + names=['c1', 'c1']) + df = DataFrame([1, 2], index=idx) + with pytest.raises(ValueError): + df.unstack('c1') + + with pytest.raises(ValueError): + df.T.stack('c1') + def test_unstack_unused_levels(self): # GH 17845: unused labels in index make unstack() cast int to float idx = pd.MultiIndex.from_product([['a'], ['A', 'B', 'C', 'D']])[:-1] diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 75ecbe31aa601..cb76195eacf40 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -560,10 +560,6 @@ def test_as_index(): result = df.groupby(['cat', s], as_index=False, observed=True).sum() tm.assert_frame_equal(result, expected) - # # GH18872: conflicting names in desired index - # with pytest.raises(ValueError): - # df.groupby(['cat', s.rename('cat')], observed=True).sum() - # is original index dropped? group_columns = ['cat', 'A'] expected = DataFrame( diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index e77004247aed6..0e295e8b88b8e 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -6,6 +6,8 @@ from datetime import timedelta from itertools import product +import six + import pytest import numpy as np @@ -656,22 +658,27 @@ def test_constructor_nonhashable_names(self): # With .set_names() tm.assert_raises_regex(TypeError, message, mi.set_names, names=renamed) - # @pytest.mark.parametrize('names', [['a', 'b', 'a'], ['1', '1', '2'], - # ['1', 'a', '1']]) - # def test_duplicate_level_names(self, names): - # # GH18872 - # pytest.raises(ValueError, pd.MultiIndex.from_product, - # [[0, 1]] * 3, names=names) - - # # With .rename() - # mi = pd.MultiIndex.from_product([[0, 1]] * 3) - # tm.assert_raises_regex(ValueError, "Duplicated level name:", - # mi.rename, names) - - # # With .rename(., level=) - # mi.rename(names[0], level=1, inplace=True) - # tm.assert_raises_regex(ValueError, "Duplicated level name:", - # mi.rename, names[:2], level=[0, 2]) + @pytest.mark.parametrize('names', [['a', 'b', 'a'], [1, 1, 2], + [1, 'a', 1]]) + def test_duplicate_level_names(self, names): + # GH18872, GH19029 + mi = pd.MultiIndex.from_product([[0, 1]] * 3, names=names) + assert mi.names == names + + # With .rename() + mi = pd.MultiIndex.from_product([[0, 1]] * 3) + mi = mi.rename(names) + assert mi.names == names + + # With .rename(., level=) + mi.rename(names[1], level=1, inplace=True) + mi = mi.rename([names[0], names[2]], level=[0, 2]) + assert mi.names == names + + def test_duplicate_level_names_access_raises(self): + self.index.names = ['foo', 'foo'] + tm.assert_raises_regex(KeyError, 'Level foo not found', + self.index._get_level_number, 'foo') def assert_multiindex_copied(self, copy, original): # Levels should be (at least, shallow copied) diff --git a/pandas/tests/io/test_pytables.py b/pandas/tests/io/test_pytables.py index 29063b64221c1..865cab7a1596e 100644 --- a/pandas/tests/io/test_pytables.py +++ b/pandas/tests/io/test_pytables.py @@ -1893,6 +1893,12 @@ def make_index(names=None): 'a', 'b'], index=make_index(['date', 'a', 't'])) pytest.raises(ValueError, store.append, 'df', df) + # dup within level + _maybe_remove(store, 'df') + df = DataFrame(np.zeros((12, 2)), columns=['a', 'b'], + index=make_index(['date', 'date', 'date'])) + pytest.raises(ValueError, store.append, 'df', df) + # fully names _maybe_remove(store, 'df') df = DataFrame(np.zeros((12, 2)), columns=[ From 0fa93996a6acd9026475185a913b894243b65190 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 27 Jun 2018 19:29:12 +0200 Subject: [PATCH 3/5] TEMP add back checking for duplicate column names in stack/unstack --- pandas/core/indexes/multi.py | 7 +++++++ pandas/core/reshape/reshape.py | 12 ++++++++++++ 2 files changed, 19 insertions(+) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 6514695197552..a2322348e1caa 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2881,6 +2881,13 @@ def isin(self, values, level=None): else: return np.lib.arraysetops.in1d(labs, sought_labels) + def _reference_duplicate_name(self, name): + """ + Returns True if the name refered to in self.names is duplicated. + """ + # count the times name equals an element in self.names. + return sum(name == n for n in self.names) > 1 + MultiIndex._add_numeric_methods_disabled() MultiIndex._add_numeric_methods_add_sub_disabled() diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 2757e0797a410..3d9e84954a63b 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -115,6 +115,12 @@ def __init__(self, values, index, level=-1, value_columns=None, self.index = index.remove_unused_levels() + if isinstance(self.index, MultiIndex): + if index._reference_duplicate_name(level): + msg = ("Ambiguous reference to {level}. The index " + "names are not unique.".format(level=level)) + raise ValueError(msg) + self.level = self.index._get_level_number(level) # when index includes `nan`, need to lift levels/strides by 1 @@ -528,6 +534,12 @@ def factorize(index): N, K = frame.shape + if isinstance(frame.columns, MultiIndex): + if frame.columns._reference_duplicate_name(level): + msg = ("Ambiguous reference to {level}. The column " + "names are not unique.".format(level=level)) + raise ValueError(msg) + # Will also convert negative level numbers and check if out of bounds. level_num = frame.columns._get_level_number(level) From 158bbdbf307b53160fcabd7f269f0b56feddd072 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 27 Jun 2018 19:31:13 +0200 Subject: [PATCH 4/5] add whatsnew --- doc/source/whatsnew/v0.23.2.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt index 9c4b408a1d24b..2df10592ab1af 100644 --- a/doc/source/whatsnew/v0.23.2.txt +++ b/doc/source/whatsnew/v0.23.2.txt @@ -53,6 +53,7 @@ Fixed Regressions ~~~~~~~~~~~~~~~~~ - Fixed regression in :meth:`to_csv` when handling file-like object incorrectly (:issue:`21471`) +- Re-allowed duplicate level names of a ``MultiIndex``. Accessing a level that has a duplicate name by name still raises an error (:issue:`19029`). - Bug in both :meth:`DataFrame.first_valid_index` and :meth:`Series.first_valid_index` raised for a row index having duplicate values (:issue:`21441`) - From fde82319732fcb00e7a855ea2aca07876ba8e85b Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 27 Jun 2018 20:39:04 +0200 Subject: [PATCH 5/5] flake8 --- pandas/tests/indexes/test_multi.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index 0e295e8b88b8e..1dc44677ab3ad 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -6,8 +6,6 @@ from datetime import timedelta from itertools import product -import six - import pytest import numpy as np