Skip to content

Commit

Permalink
fix: Fix unit null rank (#18252)
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Aug 18, 2024
1 parent 0e9914d commit 1dc2533
Show file tree
Hide file tree
Showing 5 changed files with 106 additions and 89 deletions.
9 changes: 9 additions & 0 deletions crates/polars-ops/src/series/ops/rank.rs
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,15 @@ unsafe fn rank_impl<F: FnMut(&mut [IdxSize])>(idxs: &IdxCa, neq: &BooleanArray,
fn rank(s: &Series, method: RankMethod, descending: bool, seed: Option<u64>) -> Series {
let len = s.len();
let null_count = s.null_count();

if null_count == len {
let dt = match method {
Average => DataType::Float64,
_ => IDX_DTYPE,
};
return Series::full_null(s.name(), s.len(), &dt);
}

match len {
1 => {
return match method {
Expand Down
49 changes: 0 additions & 49 deletions py-polars/tests/unit/expr/test_exprs.py
Original file line number Diff line number Diff line change
Expand Up @@ -333,55 +333,6 @@ def test_arr_contains() -> None:
}


def test_rank() -> None:
    """Rank an integer column with ties and check values and dtype per method."""
    frame = pl.DataFrame({"a": [1, 1, 2, 2, 3]})
    col = pl.col("a")

    # "average": tied values are expected to share a fractional rank.
    averaged = frame.select(col.rank(method="average").alias("b")).to_series()
    assert averaged.to_list() == [1.5, 1.5, 3.5, 3.5, 5.0]
    assert averaged.dtype == pl.Float64

    # "max": tied values are expected to take the highest rank of the tie.
    maxed = frame.select(col.rank(method="max").alias("b")).to_series()
    assert maxed.to_list() == [2, 2, 4, 4, 5]
    assert maxed.dtype == pl.get_index_type()


def test_rank_so_4109() -> None:
    """Rank inside groups, three of which contain a null.

    This also exercises how ranking behaves for null entries.
    """
    raw = {
        "id": [1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4],
        "rank": [None, 3, 2, 4, 1, 4, 3, 2, 1, None, 3, 4, 4, 1, None, 3],
    }
    df = pl.from_dict(raw).sort(by=["id", "rank"])

    rank_col = pl.col("rank")
    aggregated = df.group_by("id").agg(
        [
            rank_col.alias("original"),
            rank_col.rank(method="dense").alias("dense"),
            rank_col.rank(method="average").alias("average"),
        ]
    )
    result = aggregated.to_dict(as_series=False)

    expected = {
        "id": [1, 2, 3, 4],
        "original": [[None, 2, 3, 4], [1, 2, 3, 4], [None, 1, 3, 4], [None, 1, 3, 4]],
        "dense": [[None, 1, 2, 3], [1, 2, 3, 4], [None, 1, 2, 3], [None, 1, 2, 3]],
        "average": [
            [None, 1.0, 2.0, 3.0],
            [1.0, 2.0, 3.0, 4.0],
            [None, 1.0, 2.0, 3.0],
            [None, 1.0, 2.0, 3.0],
        ],
    }
    assert result == expected


def test_rank_string_null_11252() -> None:
    """Rank a string Series that mixes nulls, an empty string and letters."""
    values = pl.Series([None, "", "z", None, "a"])
    ranked = values.rank().to_list()
    # Null positions are expected to stay null in the output.
    expected = [None, 1.0, 3.0, None, 2.0]
    assert ranked == expected


def test_logical_boolean() -> None:
# note, cannot use expressions in logical
# boolean context (eg: and/or/not operators)
Expand Down
21 changes: 0 additions & 21 deletions py-polars/tests/unit/operations/test_random.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,27 +116,6 @@ def test_sample_series() -> None:
assert len(s.sample(n=10, with_replacement=True, seed=0)) == 10


def test_rank_random_expr() -> None:
    """A seeded random rank over a window gives equal frames when repeated."""
    columns = {"a": [1] * 5, "b": [1, 2, 3, 4, 5], "c": [200, 100, 100, 50, 100]}
    df = pl.from_dict(columns)

    # Build the expression from scratch on each run, with the same seed.
    first, second = (
        df.with_columns(
            pl.col("c").rank(method="random", seed=1).over("a").alias("rank")
        )
        for _ in range(2)
    )
    assert_frame_equal(first, second)


def test_rank_random_series() -> None:
    """A seeded random rank of a Series matches a fixed expected result."""
    s = pl.Series("a", [1, 2, 3, 2, 2, 3, 0])
    ranked = s.rank("random", seed=1)
    expected = pl.Series("a", [2, 4, 7, 3, 5, 6, 1], dtype=pl.UInt32)
    assert_series_equal(ranked, expected)


def test_shuffle_expr() -> None:
# pl.set_random_seed should lead to reproducible results.
s = pl.Series("a", range(20))
Expand Down
97 changes: 97 additions & 0 deletions py-polars/tests/unit/operations/test_rank.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
import polars as pl
from polars.testing import assert_frame_equal, assert_series_equal


def test_rank_nulls() -> None:
    """Ranking an empty or all-null Series returns the same list of nulls."""
    # For each input the expected ranks equal the input itself.
    for values in ([], [None], [None, None]):
        assert pl.Series(values).rank().to_list() == values


def test_rank_random_expr() -> None:
    """A seeded random rank over a window gives equal frames when repeated."""
    columns = {"a": [1] * 5, "b": [1, 2, 3, 4, 5], "c": [200, 100, 100, 50, 100]}
    df = pl.from_dict(columns)

    # Build the expression from scratch on each run, with the same seed.
    first, second = (
        df.with_columns(
            pl.col("c").rank(method="random", seed=1).over("a").alias("rank")
        )
        for _ in range(2)
    )
    assert_frame_equal(first, second)


def test_rank_random_series() -> None:
    """A seeded random rank of a Series matches a fixed expected result."""
    s = pl.Series("a", [1, 2, 3, 2, 2, 3, 0])
    ranked = s.rank("random", seed=1)
    expected = pl.Series("a", [2, 4, 7, 3, 5, 6, 1], dtype=pl.UInt32)
    assert_series_equal(ranked, expected)


def test_rank_df() -> None:
    """Rank an integer column with ties and check values and dtype per method."""
    frame = pl.DataFrame({"a": [1, 1, 2, 2, 3]})
    col = pl.col("a")

    # "average": tied values are expected to share a fractional rank.
    averaged = frame.select(col.rank(method="average").alias("b")).to_series()
    assert averaged.to_list() == [1.5, 1.5, 3.5, 3.5, 5.0]
    assert averaged.dtype == pl.Float64

    # "max": tied values are expected to take the highest rank of the tie.
    maxed = frame.select(col.rank(method="max").alias("b")).to_series()
    assert maxed.to_list() == [2, 2, 4, 4, 5]
    assert maxed.dtype == pl.get_index_type()


def test_rank_so_4109() -> None:
    """Rank inside groups, three of which contain a null.

    This also exercises how ranking behaves for null entries.
    """
    raw = {
        "id": [1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4],
        "rank": [None, 3, 2, 4, 1, 4, 3, 2, 1, None, 3, 4, 4, 1, None, 3],
    }
    df = pl.from_dict(raw).sort(by=["id", "rank"])

    rank_col = pl.col("rank")
    aggregated = df.group_by("id").agg(
        [
            rank_col.alias("original"),
            rank_col.rank(method="dense").alias("dense"),
            rank_col.rank(method="average").alias("average"),
        ]
    )
    result = aggregated.to_dict(as_series=False)

    expected = {
        "id": [1, 2, 3, 4],
        "original": [[None, 2, 3, 4], [1, 2, 3, 4], [None, 1, 3, 4], [None, 1, 3, 4]],
        "dense": [[None, 1, 2, 3], [1, 2, 3, 4], [None, 1, 2, 3], [None, 1, 2, 3]],
        "average": [
            [None, 1.0, 2.0, 3.0],
            [1.0, 2.0, 3.0, 4.0],
            [None, 1.0, 2.0, 3.0],
            [None, 1.0, 2.0, 3.0],
        ],
    }
    assert result == expected


def test_rank_string_null_11252() -> None:
    """Rank a string Series that mixes nulls, an empty string and letters."""
    values = pl.Series([None, "", "z", None, "a"])
    ranked = values.rank().to_list()
    # Null positions are expected to stay null in the output.
    expected = [None, 1.0, 3.0, None, 2.0]
    assert ranked == expected


def test_rank_series() -> None:
    """Dense-rank a Series directly and via a frame, then check result dtypes."""
    s = pl.Series("a", [1, 2, 3, 2, 2, 3, 0])

    # Ascending dense rank on the Series itself.
    dense = s.rank("dense")
    assert_series_equal(
        dense, pl.Series("a", [2, 3, 4, 3, 3, 4, 1], dtype=pl.UInt32)
    )

    # The same rank requested through an expression on a one-column frame.
    frame = pl.DataFrame([s])
    via_expr = frame.select(pl.col("a").rank("dense"))["a"]
    assert via_expr.to_list() == [2, 3, 4, 3, 3, 4, 1]

    # Descending dense rank.
    dense_desc = s.rank("dense", descending=True)
    assert_series_equal(
        dense_desc, pl.Series("a", [3, 2, 1, 2, 2, 1, 4], dtype=pl.UInt32)
    )

    # Result dtype depends on the ranking method.
    average_dtype = s.rank(method="average").dtype
    assert average_dtype == pl.Float64
    max_dtype = s.rank(method="max").dtype
    assert max_dtype == pl.get_index_type()
19 changes: 0 additions & 19 deletions py-polars/tests/unit/series/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -994,25 +994,6 @@ def test_mode() -> None:
assert pl.int_range(0, 3, eager=True).mode().to_list() == [2, 1, 0]


def test_rank() -> None:
    """Dense-rank a Series directly and via a frame, then check result dtypes."""
    s = pl.Series("a", [1, 2, 3, 2, 2, 3, 0])

    # Ascending dense rank on the Series itself.
    dense = s.rank("dense")
    assert_series_equal(dense, pl.Series("a", [2, 3, 4, 3, 3, 4, 1], dtype=UInt32))

    # The same rank requested through an expression on a one-column frame.
    frame = pl.DataFrame([s])
    via_expr = frame.select(pl.col("a").rank("dense"))["a"]
    assert via_expr.to_list() == [2, 3, 4, 3, 3, 4, 1]

    # Descending dense rank.
    dense_desc = s.rank("dense", descending=True)
    assert_series_equal(
        dense_desc, pl.Series("a", [3, 2, 1, 2, 2, 1, 4], dtype=UInt32)
    )

    # Result dtype depends on the ranking method.
    average_dtype = s.rank(method="average").dtype
    assert average_dtype == pl.Float64
    max_dtype = s.rank(method="max").dtype
    assert max_dtype == pl.get_index_type()


def test_diff() -> None:
s = pl.Series("a", [1, 2, 3, 2, 2, 3, 0])
expected = pl.Series("a", [1, 1, -1, 0, 1, -3])
Expand Down

0 comments on commit 1dc2533

Please sign in to comment.