Skip to content

REF: give rank1d/2d same nan filling #41916

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Jun 17, 2021
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
102 changes: 50 additions & 52 deletions pandas/_libs/algos.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -931,6 +931,32 @@ ctypedef fused rank_t:
int64_t


cdef rank_t get_rank_nan_fill_val(bint rank_nans_highest, rank_t[:] _=None):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why is there an _ here?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can rank_t be required

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The _ was to signify that the argument is unused. Not requiring rank_t here is purposeful - the thought here was that we need an argument with rank_t present for fused type specialization, but with it being optional we never have to worry about passing anything.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i c, might be worth comment ing in the doc-string as this is non-obvious (oh you did this nvm)

"""
Return the value we'll use to represent missing values when sorting depending
on if we'd like missing values to end up at the top/bottom. (The second parameter
is unused, but needed for fused type specialization)
"""
if rank_nans_highest:
if rank_t is object:
return Infinity()
elif rank_t is int64_t:
return util.INT64_MAX
elif rank_t is uint64_t:
return util.UINT64_MAX
else:
return np.inf
else:
if rank_t is object:
return NegInfinity()
elif rank_t is int64_t:
return NPY_NAT
elif rank_t is uint64_t:
return 0
else:
return -np.inf


@cython.wraparound(False)
@cython.boundscheck(False)
def rank_1d(
Expand Down Expand Up @@ -980,7 +1006,7 @@ def rank_1d(
ndarray[rank_t, ndim=1] masked_vals
rank_t[:] masked_vals_memview
uint8_t[:] mask
bint keep_na, check_labels, check_mask
bint keep_na, nans_rank_highest, check_labels, check_mask
rank_t nan_fill_val

tiebreak = tiebreakers[ties_method]
Expand Down Expand Up @@ -1026,27 +1052,12 @@ def rank_1d(
# If descending, fill with highest value since descending
# will flip the ordering to still end up with lowest rank.
# Symmetric logic applies to `na_option == 'bottom'`
if ascending ^ (na_option == 'top'):
if rank_t is object:
nan_fill_val = Infinity()
elif rank_t is int64_t:
nan_fill_val = util.INT64_MAX
elif rank_t is uint64_t:
nan_fill_val = util.UINT64_MAX
else:
nan_fill_val = np.inf
nans_rank_highest = ascending ^ (na_option == 'top')
nan_fill_val = get_rank_nan_fill_val[rank_t](nans_rank_highest)
if nans_rank_highest:
order = (masked_vals, mask, labels)
else:
if rank_t is object:
nan_fill_val = NegInfinity()
elif rank_t is int64_t:
nan_fill_val = NPY_NAT
elif rank_t is uint64_t:
nan_fill_val = 0
else:
nan_fill_val = -np.inf

order = (masked_vals, ~(np.array(mask, copy=False)), labels)
order = (masked_vals, ~(np.asarray(mask)), labels)

np.putmask(masked_vals, mask, nan_fill_val)
# putmask doesn't accept a memoryview, so we assign as a separate step
Expand All @@ -1073,14 +1084,11 @@ def rank_1d(
check_mask,
check_labels,
keep_na,
pct,
N,
)
if pct:
for i in range(N):
if grp_sizes[i] != 0:
out[i] = out[i] / grp_sizes[i]

return np.array(out)
return np.asarray(out)


@cython.wraparound(False)
Expand All @@ -1097,6 +1105,7 @@ cdef void rank_sorted_1d(
bint check_mask,
bint check_labels,
bint keep_na,
bint pct,
Py_ssize_t N,
) nogil:
"""
Expand All @@ -1108,7 +1117,7 @@ cdef void rank_sorted_1d(
out : float64_t[::1]
Array to store computed ranks
grp_sizes : int64_t[::1]
Array to store group counts.
Array to store group counts, only used if pct=True
labels : See rank_1d.__doc__
sort_indexer : intp_t[:]
Array of indices which sorts masked_vals
Expand All @@ -1118,12 +1127,14 @@ cdef void rank_sorted_1d(
Array where entries are True if the value is missing, False otherwise
tiebreak : TiebreakEnumType
See rank_1d.__doc__ for the different modes
check_mask : bint
check_mask : bool
If False, assumes the mask is all False to skip mask indexing
check_labels : bint
check_labels : bool
If False, assumes all labels are the same to skip group handling logic
keep_na : bint
keep_na : bool
Whether or not to keep nulls
pct : bool
Compute percentage rank of data within each group
N : Py_ssize_t
The number of elements to rank. Note: it is not always true that
N == len(out) or N == len(masked_vals) (see `nancorr_spearman` usage for why)
Expand Down Expand Up @@ -1342,6 +1353,11 @@ cdef void rank_sorted_1d(
grp_start = i + 1
grp_vals_seen = 1

if pct:
for i in range(N):
if grp_sizes[i] != 0:
out[i] = out[i] / grp_sizes[i]


def rank_2d(
ndarray[rank_t, ndim=2] in_arr,
Expand All @@ -1362,11 +1378,11 @@ def rank_2d(
ndarray[rank_t, ndim=2] values
ndarray[intp_t, ndim=2] argsort_indexer
ndarray[uint8_t, ndim=2] mask
rank_t val, nan_value
rank_t val, nan_fill_val
float64_t count, sum_ranks = 0.0
int tiebreak = 0
int64_t idx
bint check_mask, condition, keep_na
bint check_mask, condition, keep_na, nans_rank_highest

tiebreak = tiebreakers[ties_method]

Expand All @@ -1384,27 +1400,9 @@ def rank_2d(
if values.dtype != np.object_:
values = values.astype('O')

nans_rank_highest = ascending ^ (na_option == 'top')
if check_mask:
if ascending ^ (na_option == 'top'):
if rank_t is object:
nan_value = Infinity()
elif rank_t is float64_t:
nan_value = np.inf

# int64 and datetimelike
else:
nan_value = util.INT64_MAX

else:
if rank_t is object:
nan_value = NegInfinity()
elif rank_t is float64_t:
nan_value = -np.inf

# int64 and datetimelike
else:
nan_value = NPY_NAT

nan_fill_val = get_rank_nan_fill_val[rank_t](nans_rank_highest)
if rank_t is object:
mask = missing.isnaobj2d(values)
elif rank_t is float64_t:
Expand All @@ -1414,7 +1412,7 @@ def rank_2d(
else:
mask = values == NPY_NAT

np.putmask(values, mask, nan_value)
np.putmask(values, mask, nan_fill_val)
else:
mask = np.zeros_like(values, dtype=bool)

Expand Down