Skip to content

Commit dbf1c94

Browse files
committed
support for removing unused levels (internally)
xref pandas-dev#2770
1 parent bc5c172 commit dbf1c94

File tree

3 files changed

+108
-16
lines changed

3 files changed

+108
-16
lines changed

Diff for: pandas/indexes/multi.py

+48-5
Original file line numberDiff line numberDiff line change
@@ -1175,7 +1175,7 @@ def from_product(cls, iterables, sortorder=None, names=None):
11751175
labels = cartesian_product(labels)
11761176
return MultiIndex(levels, labels, sortorder=sortorder, names=names)
11771177

1178-
def _reconstruct(self, sort=False):
1178+
def _reconstruct(self, sort=False, remove_unused=False):
11791179
"""
11801180
reconstruct the MultiIndex
11811181
@@ -1186,21 +1186,33 @@ def _reconstruct(self, sort=False):
11861186
----------
11871187
sort: boolean, default False
11881188
monotonically sort the levels
1189+
remove_unused: boolean, default False
1190+
remove unused levels
11891191
11901192
Returns
11911193
-------
11921194
MultiIndex
11931195
11941196
"""
1197+
1198+
if sort and remove_unused:
1199+
raise ValueError("only support one of sort / remove_unused")
1200+
1201+
if not (sort or remove_unused):
1202+
raise ValueError("must supply one of sort / remove_unsued")
1203+
1204+
levels = self.levels
1205+
labels = self.labels
1206+
11951207
new_levels = []
11961208
new_labels = []
11971209

11981210
if sort:
11991211

1200-
if self.is_monotonic:
1212+
if self.is_lexsorted() and self.is_monotonic:
12011213
return self
12021214

1203-
for lev, lab in zip(self.levels, self.labels):
1215+
for lev, lab in zip(levels, labels):
12041216

12051217
if lev.is_monotonic:
12061218
new_levels.append(lev)
@@ -1218,8 +1230,39 @@ def _reconstruct(self, sort=False):
12181230
new_levels.append(lev)
12191231
new_labels.append(lab)
12201232

1221-
else:
1222-
return self
1233+
elif remove_unused:
1234+
1235+
changed = np.zeros(self.nlevels, dtype=bool)
1236+
for i, (lev, lab) in enumerate(zip(levels, labels)):
1237+
1238+
uniques = np.sort(algos.unique(lab))
1239+
1240+
# nothing unused
1241+
if len(uniques) == len(lev):
1242+
new_levels.append(lev)
1243+
new_labels.append(lab)
1244+
changed[i] = True
1245+
continue
1246+
1247+
unused = list(reversed(sorted(set(
1248+
np.arange(len(lev))) - set(uniques))))
1249+
1250+
# new levels are simple
1251+
lev = lev.take(uniques)
1252+
1253+
# new labels, we remove the unused
1254+
# by decrementing the labels for that value
1255+
# prob a better way
1256+
for u in unused:
1257+
1258+
lab = np.where(lab > u, lab - 1, lab)
1259+
1260+
new_levels.append(lev)
1261+
new_labels.append(lab)
1262+
1263+
# nothing changed
1264+
if not changed.any():
1265+
return self
12231266

12241267
return MultiIndex(new_levels, new_labels,
12251268
names=self.names, sortorder=self.sortorder,

Diff for: pandas/tests/indexes/test_multi.py

+41-8
Original file line numberDiff line numberDiff line change
@@ -2411,6 +2411,18 @@ def test_is_monotonic(self):
24112411

24122412
self.assertFalse(i.is_monotonic)
24132413

2414+
def test_reconstruct_api(self):
2415+
2416+
mi = MultiIndex.from_arrays([
2417+
['A', 'A', 'B', 'B', 'B'], [1, 2, 1, 2, 3]
2418+
])
2419+
2420+
with pytest.raises(ValueError):
2421+
mi._reconstruct()
2422+
2423+
with pytest.raises(ValueError):
2424+
mi._reconstruct(sort=True, remove_unused=True)
2425+
24142426
def test_reconstruct_sort(self):
24152427

24162428
# starts off lexsorted & monotonic
@@ -2428,14 +2440,6 @@ def test_reconstruct_sort(self):
24282440
assert mi.equals(recons)
24292441
assert Index(mi.values).equals(Index(recons.values))
24302442

2431-
recons = mi._reconstruct(sort=False)
2432-
assert recons.is_lexsorted()
2433-
assert recons.is_monotonic
2434-
assert mi is recons
2435-
2436-
assert mi.equals(recons)
2437-
assert Index(mi.values).equals(Index(recons.values))
2438-
24392443
# cannot convert to lexsorted
24402444
mi = pd.MultiIndex.from_tuples([('z', 'a'), ('x', 'a'), ('y', 'b'),
24412445
('x', 'b'), ('y', 'a'), ('z', 'b')],
@@ -2464,6 +2468,35 @@ def test_reconstruct_sort(self):
24642468
assert mi.equals(recons)
24652469
assert Index(mi.values).equals(Index(recons.values))
24662470

2471+
def test_reconstruct_remove_unused(self):
2472+
# xref to GH 2770
2473+
df = DataFrame([['deleteMe', 1, 9],
2474+
['keepMe', 2, 9],
2475+
['keepMeToo', 3, 9]],
2476+
columns=['first', 'second', 'third'])
2477+
df2 = df.set_index(['first', 'second'], drop=False)
2478+
df2 = df2[df2['first'] != 'deleteMe']
2479+
2480+
# removed levels are there
2481+
expected = MultiIndex(levels=[['deleteMe', 'keepMe', 'keepMeToo'],
2482+
[1, 2, 3]],
2483+
labels=[[1, 2], [1, 2]],
2484+
names=['first', 'second'])
2485+
result = df2.index
2486+
tm.assert_index_equal(result, expected)
2487+
2488+
expected = MultiIndex(levels=[['keepMe', 'keepMeToo'],
2489+
[2, 3]],
2490+
labels=[[0, 1], [0, 1]],
2491+
names=['first', 'second'])
2492+
result = df2.index._reconstruct(remove_unused=True)
2493+
tm.assert_index_equal(result, expected)
2494+
2495+
# idempotent
2496+
result2 = result._reconstruct(remove_unused=True)
2497+
tm.assert_index_equal(result2, expected)
2498+
assert result2 is result
2499+
24672500
def test_isin(self):
24682501
values = [('foo', 2), ('bar', 3), ('quux', 4)]
24692502

Diff for: pandas/tests/test_multilevel.py

+19-3
Original file line numberDiff line numberDiff line change
@@ -2559,16 +2559,32 @@ def test_sort_index_and_reconstruction(self):
25592559
assert result.columns.is_lexsorted()
25602560
assert result.columns.is_monotonic
25612561

2562+
def test_sort_index_and_reconstruction_doc_example(self):
25622563
# doc example
25632564
df = DataFrame({'value': [1, 2, 3, 4]},
25642565
index=MultiIndex(
25652566
levels=[['a', 'b'], ['bb', 'aa']],
2566-
labels=[[0, 0, 1, 1], [1, 0, 1, 0]]))
2567-
result = df.sort_index()
2567+
labels=[[0, 0, 1, 1], [0, 1, 0, 1]]))
2568+
assert df.index.is_lexsorted()
2569+
assert not df.index.is_monotonic
2570+
2571+
# sort it
25682572
expected = DataFrame({'value': [2, 1, 4, 3]},
25692573
index=MultiIndex(
25702574
levels=[['a', 'b'], ['aa', 'bb']],
2571-
labels=[[0, 0, 1, 1], [1, 0, 1, 0]]))
2575+
labels=[[0, 0, 1, 1], [0, 1, 0, 1]]))
2576+
result = df.sort_index()
2577+
assert not result.index.is_lexsorted()
2578+
assert result.index.is_monotonic
2579+
2580+
tm.assert_frame_equal(result, expected)
2581+
2582+
# reconstruct
2583+
result = df.sort_index().copy()
2584+
result.index = result.index._reconstruct(sort=True)
2585+
assert result.index.is_lexsorted()
2586+
assert result.index.is_monotonic
2587+
25722588
tm.assert_frame_equal(result, expected)
25732589

25742590
def test_sort_index_reorder_on_ops(self):

0 commit comments

Comments
 (0)