Skip to content

Commit

Permalink
REF: misplaced sort_index tests (#33774)
Browse files Browse the repository at this point in the history
  • Loading branch information
jbrockmendel authored Apr 24, 2020
1 parent 3df51fc commit 70175be
Show file tree
Hide file tree
Showing 3 changed files with 362 additions and 328 deletions.
340 changes: 339 additions & 1 deletion pandas/tests/frame/methods/test_sort_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,156 @@
import pytest

import pandas as pd
from pandas import CategoricalDtype, DataFrame, IntervalIndex, MultiIndex, Series
from pandas import CategoricalDtype, DataFrame, Index, IntervalIndex, MultiIndex, Series
import pandas._testing as tm


class TestDataFrameSortIndex:
def test_sort_index_and_reconstruction_doc_example(self):
# doc example
df = DataFrame(
{"value": [1, 2, 3, 4]},
index=MultiIndex(
levels=[["a", "b"], ["bb", "aa"]], codes=[[0, 0, 1, 1], [0, 1, 0, 1]]
),
)
assert df.index.is_lexsorted()
assert not df.index.is_monotonic

# sort it
expected = DataFrame(
{"value": [2, 1, 4, 3]},
index=MultiIndex(
levels=[["a", "b"], ["aa", "bb"]], codes=[[0, 0, 1, 1], [0, 1, 0, 1]]
),
)
result = df.sort_index()
assert result.index.is_lexsorted()
assert result.index.is_monotonic

tm.assert_frame_equal(result, expected)

# reconstruct
result = df.sort_index().copy()
result.index = result.index._sort_levels_monotonic()
assert result.index.is_lexsorted()
assert result.index.is_monotonic

tm.assert_frame_equal(result, expected)

def test_sort_index_non_existent_label_multiindex(self):
# GH#12261
df = DataFrame(0, columns=[], index=MultiIndex.from_product([[], []]))
df.loc["b", "2"] = 1
df.loc["a", "3"] = 1
result = df.sort_index().index.is_monotonic
assert result is True

def test_sort_index_reorder_on_ops(self):
# GH#15687
df = DataFrame(
np.random.randn(8, 2),
index=MultiIndex.from_product(
[["a", "b"], ["big", "small"], ["red", "blu"]],
names=["letter", "size", "color"],
),
columns=["near", "far"],
)
df = df.sort_index()

def my_func(group):
group.index = ["newz", "newa"]
return group

result = df.groupby(level=["letter", "size"]).apply(my_func).sort_index()
expected = MultiIndex.from_product(
[["a", "b"], ["big", "small"], ["newa", "newz"]],
names=["letter", "size", None],
)

tm.assert_index_equal(result.index, expected)

def test_sort_index_nan_multiindex(self):
# GH#14784
# incorrect sorting w.r.t. nans
tuples = [[12, 13], [np.nan, np.nan], [np.nan, 3], [1, 2]]
mi = MultiIndex.from_tuples(tuples)

df = DataFrame(np.arange(16).reshape(4, 4), index=mi, columns=list("ABCD"))
s = Series(np.arange(4), index=mi)

df2 = DataFrame(
{
"date": pd.DatetimeIndex(
[
"20121002",
"20121007",
"20130130",
"20130202",
"20130305",
"20121002",
"20121207",
"20130130",
"20130202",
"20130305",
"20130202",
"20130305",
]
),
"user_id": [1, 1, 1, 1, 1, 3, 3, 3, 5, 5, 5, 5],
"whole_cost": [
1790,
np.nan,
280,
259,
np.nan,
623,
90,
312,
np.nan,
301,
359,
801,
],
"cost": [12, 15, 10, 24, 39, 1, 0, np.nan, 45, 34, 1, 12],
}
).set_index(["date", "user_id"])

# sorting frame, default nan position is last
result = df.sort_index()
expected = df.iloc[[3, 0, 2, 1], :]
tm.assert_frame_equal(result, expected)

# sorting frame, nan position last
result = df.sort_index(na_position="last")
expected = df.iloc[[3, 0, 2, 1], :]
tm.assert_frame_equal(result, expected)

# sorting frame, nan position first
result = df.sort_index(na_position="first")
expected = df.iloc[[1, 2, 3, 0], :]
tm.assert_frame_equal(result, expected)

# sorting frame with removed rows
result = df2.dropna().sort_index()
expected = df2.sort_index().dropna()
tm.assert_frame_equal(result, expected)

# sorting series, default nan position is last
result = s.sort_index()
expected = s.iloc[[3, 0, 2, 1]]
tm.assert_series_equal(result, expected)

# sorting series, nan position last
result = s.sort_index(na_position="last")
expected = s.iloc[[3, 0, 2, 1]]
tm.assert_series_equal(result, expected)

# sorting series, nan position first
result = s.sort_index(na_position="first")
expected = s.iloc[[1, 2, 3, 0]]
tm.assert_series_equal(result, expected)

def test_sort_index_nan(self):
# GH#3917

Expand Down Expand Up @@ -318,3 +463,196 @@ def test_sort_index_ignore_index_multi_index(

tm.assert_frame_equal(result_df, expected_df)
tm.assert_frame_equal(df, DataFrame(original_dict, index=mi))

def test_sort_index_categorical_multiindex(self):
# GH#15058
df = DataFrame(
{
"a": range(6),
"l1": pd.Categorical(
["a", "a", "b", "b", "c", "c"],
categories=["c", "a", "b"],
ordered=True,
),
"l2": [0, 1, 0, 1, 0, 1],
}
)
result = df.set_index(["l1", "l2"]).sort_index()
expected = DataFrame(
[4, 5, 0, 1, 2, 3],
columns=["a"],
index=MultiIndex(
levels=[
pd.CategoricalIndex(
["c", "a", "b"],
categories=["c", "a", "b"],
ordered=True,
name="l1",
dtype="category",
),
[0, 1],
],
codes=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]],
names=["l1", "l2"],
),
)
tm.assert_frame_equal(result, expected)

def test_sort_index_and_reconstruction(self):

# GH#15622
# lexsortedness should be identical
# across MultiIndex construction methods

df = DataFrame([[1, 1], [2, 2]], index=list("ab"))
expected = DataFrame(
[[1, 1], [2, 2], [1, 1], [2, 2]],
index=MultiIndex.from_tuples(
[(0.5, "a"), (0.5, "b"), (0.8, "a"), (0.8, "b")]
),
)
assert expected.index.is_lexsorted()

result = DataFrame(
[[1, 1], [2, 2], [1, 1], [2, 2]],
index=MultiIndex.from_product([[0.5, 0.8], list("ab")]),
)
result = result.sort_index()
assert result.index.is_lexsorted()
assert result.index.is_monotonic

tm.assert_frame_equal(result, expected)

result = DataFrame(
[[1, 1], [2, 2], [1, 1], [2, 2]],
index=MultiIndex(
levels=[[0.5, 0.8], ["a", "b"]], codes=[[0, 0, 1, 1], [0, 1, 0, 1]]
),
)
result = result.sort_index()
assert result.index.is_lexsorted()

tm.assert_frame_equal(result, expected)

concatted = pd.concat([df, df], keys=[0.8, 0.5])
result = concatted.sort_index()

assert result.index.is_lexsorted()
assert result.index.is_monotonic

tm.assert_frame_equal(result, expected)

# GH#14015
df = DataFrame(
[[1, 2], [6, 7]],
columns=MultiIndex.from_tuples(
[(0, "20160811 12:00:00"), (0, "20160809 12:00:00")],
names=["l1", "Date"],
),
)

df.columns.set_levels(
pd.to_datetime(df.columns.levels[1]), level=1, inplace=True
)
assert not df.columns.is_lexsorted()
assert not df.columns.is_monotonic
result = df.sort_index(axis=1)
assert result.columns.is_lexsorted()
assert result.columns.is_monotonic
result = df.sort_index(axis=1, level=1)
assert result.columns.is_lexsorted()
assert result.columns.is_monotonic

# TODO: better name, de-duplicate with test_sort_index_level above
def test_sort_index_level2(self):
mi = MultiIndex(
levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]],
codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
names=["first", "second"],
)
frame = DataFrame(
np.random.randn(10, 3),
index=mi,
columns=Index(["A", "B", "C"], name="exp"),
)

df = frame.copy()
df.index = np.arange(len(df))

# axis=1

# series
a_sorted = frame["A"].sort_index(level=0)

# preserve names
assert a_sorted.index.names == frame.index.names

# inplace
rs = frame.copy()
rs.sort_index(level=0, inplace=True)
tm.assert_frame_equal(rs, frame.sort_index(level=0))

def test_sort_index_level_large_cardinality(self):

# GH#2684 (int64)
index = MultiIndex.from_arrays([np.arange(4000)] * 3)
df = DataFrame(np.random.randn(4000), index=index, dtype=np.int64)

# it works!
result = df.sort_index(level=0)
assert result.index.lexsort_depth == 3

# GH#2684 (int32)
index = MultiIndex.from_arrays([np.arange(4000)] * 3)
df = DataFrame(np.random.randn(4000), index=index, dtype=np.int32)

# it works!
result = df.sort_index(level=0)
assert (result.dtypes.values == df.dtypes.values).all()
assert result.index.lexsort_depth == 3

def test_sort_index_level_by_name(self):
mi = MultiIndex(
levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]],
codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
names=["first", "second"],
)
frame = DataFrame(
np.random.randn(10, 3),
index=mi,
columns=Index(["A", "B", "C"], name="exp"),
)

frame.index.names = ["first", "second"]
result = frame.sort_index(level="second")
expected = frame.sort_index(level=1)
tm.assert_frame_equal(result, expected)

def test_sort_index_level_mixed(self):
mi = MultiIndex(
levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]],
codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
names=["first", "second"],
)
frame = DataFrame(
np.random.randn(10, 3),
index=mi,
columns=Index(["A", "B", "C"], name="exp"),
)

sorted_before = frame.sort_index(level=1)

df = frame.copy()
df["foo"] = "bar"
sorted_after = df.sort_index(level=1)
tm.assert_frame_equal(sorted_before, sorted_after.drop(["foo"], axis=1))

dft = frame.T
sorted_before = dft.sort_index(level=1, axis=1)
dft["foo", "three"] = "bar"

sorted_after = dft.sort_index(level=1, axis=1)
tm.assert_frame_equal(
sorted_before.drop([("foo", "three")], axis=1),
sorted_after.drop([("foo", "three")], axis=1),
)
23 changes: 23 additions & 0 deletions pandas/tests/series/methods/test_sort_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,3 +170,26 @@ def test_sort_index_ignore_index(

tm.assert_series_equal(result_ser, expected)
tm.assert_series_equal(ser, Series(original_list))

def test_sort_index_ascending_list(self):
# GH#16934

# Set up a Series with a three level MultiIndex
arrays = [
["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
["one", "two", "one", "two", "one", "two", "one", "two"],
[4, 3, 2, 1, 4, 3, 2, 1],
]
tuples = zip(*arrays)
mi = MultiIndex.from_tuples(tuples, names=["first", "second", "third"])
ser = Series(range(8), index=mi)

# Sort with boolean ascending
result = ser.sort_index(level=["third", "first"], ascending=False)
expected = ser.iloc[[4, 0, 5, 1, 6, 2, 7, 3]]
tm.assert_series_equal(result, expected)

# Sort with list of boolean ascending
result = ser.sort_index(level=["third", "first"], ascending=[False, True])
expected = ser.iloc[[0, 4, 1, 5, 2, 6, 3, 7]]
tm.assert_series_equal(result, expected)
Loading

0 comments on commit 70175be

Please sign in to comment.