Skip to content

CLN: rank_1d #40546

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 18 commits into from
Mar 23, 2021
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
149 changes: 83 additions & 66 deletions pandas/_libs/algos.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -946,22 +946,27 @@ def rank_1d(
cdef:
TiebreakEnumType tiebreak
Py_ssize_t i, j, N, grp_start=0, dups=0, sum_ranks=0
Py_ssize_t grp_vals_seen=1, grp_na_count=0, grp_tie_count=0
Py_ssize_t grp_vals_seen=1, grp_na_count=0
ndarray[int64_t, ndim=1] lexsort_indexer
ndarray[float64_t, ndim=1] grp_sizes, out
ndarray[rank_t, ndim=1] masked_vals
ndarray[uint8_t, ndim=1] mask
bint keep_na, at_end, next_val_diff, check_labels
bint keep_na, at_end, next_val_diff, check_labels, group_changed
rank_t nan_fill_val

tiebreak = tiebreakers[ties_method]
if tiebreak == TIEBREAK_FIRST:
if not ascending:
tiebreak = TIEBREAK_FIRST_DESCENDING

keep_na = na_option == 'keep'

N = len(values)
# TODO Cython 3.0: cast won't be necessary (#2992)
assert <Py_ssize_t>len(labels) == N
out = np.empty(N)
grp_sizes = np.ones(N)

# If all 0 labels, can short-circuit later label
# comparisons
check_labels = np.any(labels)
Expand All @@ -983,6 +988,12 @@ def rank_1d(
else:
mask = np.zeros(shape=len(masked_vals), dtype=np.uint8)

# If `na_option == 'top'`, we want to assign the lowest rank
# to NaN regardless of ascending/descending. So if ascending,
# fill with lowest value of type to end up with lowest rank.
# If descending, fill with highest value since descending
# will flip the ordering to still end up with lowest rank.
# Symmetric logic applies to `na_option == 'bottom'`
if ascending ^ (na_option == 'top'):
if rank_t is object:
nan_fill_val = Infinity()
Expand Down Expand Up @@ -1025,36 +1036,36 @@ def rank_1d(
if rank_t is object:
for i in range(N):
at_end = i == N - 1

# dups and sum_ranks will be incremented each loop where
# the value / group remains the same, and should be reset
# when either of those change
# Used to calculate tiebreakers
# when either of those change. Used to calculate tiebreakers
dups += 1
sum_ranks += i - grp_start + 1

next_val_diff = at_end or are_diff(masked_vals[lexsort_indexer[i]],
masked_vals[lexsort_indexer[i+1]])

# We'll need this check later anyway to determine group size, so just
# compute it here since shortcircuiting won't help
group_changed = at_end or (check_labels and
(labels[lexsort_indexer[i]]
!= labels[lexsort_indexer[i+1]]))

# Update out only when there is a transition of values or labels.
# When a new value or group is encountered, go back #dups steps(
# the number of occurrence of current value) and assign the ranks
# based on the starting index of the current group (grp_start)
# and the current index
if not at_end:
next_val_diff = are_diff(masked_vals[lexsort_indexer[i]],
masked_vals[lexsort_indexer[i+1]])
else:
next_val_diff = True

if (next_val_diff
or (mask[lexsort_indexer[i]] ^ mask[lexsort_indexer[i+1]])
or (check_labels
and (labels[lexsort_indexer[i]]
!= labels[lexsort_indexer[i+1]]))
):
# if keep_na, check for missing values and assign back
if (next_val_diff or group_changed
or (mask[lexsort_indexer[i]] ^ mask[lexsort_indexer[i+1]])):

# If keep_na, check for missing values and assign back
# to the result where appropriate
if keep_na and mask[lexsort_indexer[i]]:
grp_na_count = dups
for j in range(i - dups + 1, i + 1):
out[lexsort_indexer[j]] = NaN
grp_na_count = dups
elif tiebreak == TIEBREAK_AVERAGE:
for j in range(i - dups + 1, i + 1):
out[lexsort_indexer[j]] = sum_ranks / <float64_t>dups
Expand All @@ -1064,84 +1075,87 @@ def rank_1d(
elif tiebreak == TIEBREAK_MAX:
for j in range(i - dups + 1, i + 1):
out[lexsort_indexer[j]] = i - grp_start + 1

# With n as the previous rank in the group and m as the number
# of duplicates in this stretch, if TIEBREAK_FIRST and ascending,
# then rankings should be n + 1, n + 2 ... n + m
elif tiebreak == TIEBREAK_FIRST:
for j in range(i - dups + 1, i + 1):
if ascending:
out[lexsort_indexer[j]] = j + 1 - grp_start
else:
out[lexsort_indexer[j]] = 2 * i - j - dups + 2 - grp_start
out[lexsort_indexer[j]] = j + 1 - grp_start

# If TIEBREAK_FIRST and descending, the ranking should be
# n + m, n + (m - 1) ... n + 1. This is equivalent to
# (i - dups + 1) + (i - j + 1) - grp_start
elif tiebreak == TIEBREAK_FIRST_DESCENDING:
for j in range(i - dups + 1, i + 1):
out[lexsort_indexer[j]] = 2 * i - j - dups + 2 - grp_start
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should it be obvious where `2 * i - j - dups + 2 - grp_start` comes from? It looks equivalent to `(i - dups + 1) + (i - j + 1) - grp_start`.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed, this is a good opportunity to clarify; I have added some commentary.

elif tiebreak == TIEBREAK_DENSE:
for j in range(i - dups + 1, i + 1):
out[lexsort_indexer[j]] = grp_vals_seen

# look forward to the next value (using the sorting in _as)
# Look forward to the next value (using the sorting in lexsort_indexer)
# if the value does not equal the current value then we need to
# reset the dups and sum_ranks, knowing that a new value is
# coming up. the conditional also needs to handle nan equality
# coming up. The conditional also needs to handle nan equality
# and the end of iteration
if next_val_diff or (mask[lexsort_indexer[i]]
^ mask[lexsort_indexer[i+1]]):
dups = sum_ranks = 0
grp_vals_seen += 1
grp_tie_count += 1

# Similar to the previous conditional, check now if we are
# moving to a new group. If so, keep track of the index where
# the new group occurs, so the tiebreaker calculations can
# decrement that from their position. fill in the size of each
# group encountered (used by pct calculations later). also be
# decrement that from their position. Fill in the size of each
# group encountered (used by pct calculations later). Also be
# sure to reset any of the items helping to calculate dups
if (at_end or
(check_labels
and (labels[lexsort_indexer[i]]
!= labels[lexsort_indexer[i+1]]))):
if group_changed:
if tiebreak != TIEBREAK_DENSE:
for j in range(grp_start, i + 1):
grp_sizes[lexsort_indexer[j]] = \
(i - grp_start + 1 - grp_na_count)
else:
for j in range(grp_start, i + 1):
grp_sizes[lexsort_indexer[j]] = \
(grp_tie_count - (grp_na_count > 0))
(grp_vals_seen - 1 - (grp_na_count > 0))
dups = sum_ranks = 0
grp_na_count = 0
grp_tie_count = 0
grp_start = i + 1
grp_vals_seen = 1
else:
with nogil:
for i in range(N):
at_end = i == N - 1

# dups and sum_ranks will be incremented each loop where
# the value / group remains the same, and should be reset
# when either of those change
# Used to calculate tiebreakers
# when either of those change. Used to calculate tiebreakers
dups += 1
sum_ranks += i - grp_start + 1

next_val_diff = at_end or (masked_vals[lexsort_indexer[i]]
!= masked_vals[lexsort_indexer[i+1]])

# We'll need this check later anyway to determine group size, so just
# compute it here since shortcircuiting won't help
group_changed = at_end or (check_labels and
(labels[lexsort_indexer[i]]
!= labels[lexsort_indexer[i+1]]))

# Update out only when there is a transition of values or labels.
# When a new value or group is encountered, go back #dups steps(
# the number of occurrence of current value) and assign the ranks
# based on the starting index of the current group (grp_start)
# and the current index
if not at_end:
next_val_diff = (masked_vals[lexsort_indexer[i]]
!= masked_vals[lexsort_indexer[i+1]])
else:
next_val_diff = True

if (next_val_diff
or (mask[lexsort_indexer[i]] ^ mask[lexsort_indexer[i+1]])
or (check_labels
and (labels[lexsort_indexer[i]]
!= labels[lexsort_indexer[i+1]]))
):
# if keep_na, check for missing values and assign back
if (next_val_diff or group_changed
or (mask[lexsort_indexer[i]] ^ mask[lexsort_indexer[i+1]])):

# If keep_na, check for missing values and assign back
# to the result where appropriate
if keep_na and mask[lexsort_indexer[i]]:
grp_na_count = dups
for j in range(i - dups + 1, i + 1):
out[lexsort_indexer[j]] = NaN
grp_na_count = dups
elif tiebreak == TIEBREAK_AVERAGE:
for j in range(i - dups + 1, i + 1):
out[lexsort_indexer[j]] = sum_ranks / <float64_t>dups
Expand All @@ -1151,48 +1165,51 @@ def rank_1d(
elif tiebreak == TIEBREAK_MAX:
for j in range(i - dups + 1, i + 1):
out[lexsort_indexer[j]] = i - grp_start + 1

# With n as the previous rank in the group and m as the number
# of duplicates in this stretch, if TIEBREAK_FIRST and ascending,
# then rankings should be n + 1, n + 2 ... n + m
elif tiebreak == TIEBREAK_FIRST:
for j in range(i - dups + 1, i + 1):
if ascending:
out[lexsort_indexer[j]] = j + 1 - grp_start
else:
out[lexsort_indexer[j]] = \
(2 * i - j - dups + 2 - grp_start)
out[lexsort_indexer[j]] = j + 1 - grp_start

# If TIEBREAK_FIRST and descending, the ranking should be
# n + m, n + (m - 1) ... n + 1. This is equivalent to
# (i - dups + 1) + (i - j + 1) - grp_start
elif tiebreak == TIEBREAK_FIRST_DESCENDING:
for j in range(i - dups + 1, i + 1):
out[lexsort_indexer[j]] = 2 * i - j - dups + 2 - grp_start
elif tiebreak == TIEBREAK_DENSE:
for j in range(i - dups + 1, i + 1):
out[lexsort_indexer[j]] = grp_vals_seen

# look forward to the next value (using the sorting in
# Look forward to the next value (using the sorting in
# lexsort_indexer) if the value does not equal the current
# value then we need to reset the dups and sum_ranks,
# knowing that a new value is coming up. the conditional
# also needs to handle nan equality and the end of iteration
# value then we need to reset the dups and sum_ranks, knowing
# that a new value is coming up. The conditional also needs
# to handle nan equality and the end of iteration
if next_val_diff or (mask[lexsort_indexer[i]]
^ mask[lexsort_indexer[i+1]]):
dups = sum_ranks = 0
grp_vals_seen += 1
grp_tie_count += 1

# Similar to the previous conditional, check now if we are
# moving to a new group. If so, keep track of the index where
# the new group occurs, so the tiebreaker calculations can
# decrement that from their position. fill in the size of each
# group encountered (used by pct calculations later). also be
# decrement that from their position. Fill in the size of each
# group encountered (used by pct calculations later). Also be
# sure to reset any of the items helping to calculate dups
if at_end or (check_labels and
(labels[lexsort_indexer[i]]
!= labels[lexsort_indexer[i+1]])):
if group_changed:
if tiebreak != TIEBREAK_DENSE:
for j in range(grp_start, i + 1):
grp_sizes[lexsort_indexer[j]] = \
(i - grp_start + 1 - grp_na_count)
else:
for j in range(grp_start, i + 1):
grp_sizes[lexsort_indexer[j]] = \
(grp_tie_count - (grp_na_count > 0))
(grp_vals_seen - 1 - (grp_na_count > 0))
dups = sum_ranks = 0
grp_na_count = 0
grp_tie_count = 0
grp_start = i + 1
grp_vals_seen = 1

Expand Down