From 29dc59090e865ad69d7c5c1e647895916b315bdb Mon Sep 17 00:00:00 2001
From: Matthew Zeitlin <mzeitlin@caltech.edu>
Date: Wed, 9 Jun 2021 21:25:24 -0400
Subject: [PATCH 1/3] REF: give ranks same nan filling

---
 pandas/_libs/algos.pyx | 100 +++++++++++++++++++++--------------------
 1 file changed, 51 insertions(+), 49 deletions(-)

diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx
index 03f4ce273de6e..4fd515113316c 100644
--- a/pandas/_libs/algos.pyx
+++ b/pandas/_libs/algos.pyx
@@ -931,6 +931,32 @@ ctypedef fused rank_t:
     int64_t
 
 
+cdef rank_t get_rank_nan_fill_val(bint rank_nans_highest, ndarray[rank_t, ndim=1] _):
+    """
+    Return the value we'll use to represent missing values when sorting depending
+    on if we'd like missing values to end up at the top/bottom. (The second parameter
+    is unused; it only gives the signature a rank_t argument to specialize on.)
+    """
+    if rank_nans_highest:
+        if rank_t is object:
+            return Infinity()
+        elif rank_t is int64_t:
+            return np.iinfo(np.int64).max
+        elif rank_t is uint64_t:
+            return np.iinfo(np.uint64).max
+        else:
+            return np.inf
+    else:
+        if rank_t is object:
+            return NegInfinity()
+        elif rank_t is int64_t:
+            return NPY_NAT
+        elif rank_t is uint64_t:
+            return 0  # smallest possible uint64
+        else:
+            return -np.inf
+
+
 @cython.wraparound(False)
 @cython.boundscheck(False)
 def rank_1d(
@@ -980,7 +1006,7 @@ def rank_1d(
         ndarray[rank_t, ndim=1] masked_vals
         rank_t[:] masked_vals_memview
         uint8_t[:] mask
-        bint keep_na, check_labels, check_mask
+        bint keep_na, nans_rank_highest, check_labels, check_mask
         rank_t nan_fill_val
 
     tiebreak = tiebreakers[ties_method]
@@ -1026,26 +1052,11 @@ def rank_1d(
     # If descending, fill with highest value since descending
     # will flip the ordering to still end up with lowest rank.
     # Symmetric logic applies to `na_option == 'bottom'`
-    if ascending ^ (na_option == 'top'):
-        if rank_t is object:
-            nan_fill_val = Infinity()
-        elif rank_t is int64_t:
-            nan_fill_val = np.iinfo(np.int64).max
-        elif rank_t is uint64_t:
-            nan_fill_val = np.iinfo(np.uint64).max
-        else:
-            nan_fill_val = np.inf
+    nans_rank_highest = ascending ^ (na_option == 'top')
+    nan_fill_val = get_rank_nan_fill_val(nans_rank_highest, masked_vals)
+    if nans_rank_highest:
         order = (masked_vals, mask, labels)
     else:
-        if rank_t is object:
-            nan_fill_val = NegInfinity()
-        elif rank_t is int64_t:
-            nan_fill_val = NPY_NAT
-        elif rank_t is uint64_t:
-            nan_fill_val = 0
-        else:
-            nan_fill_val = -np.inf
-
         order = (masked_vals, ~(np.array(mask, copy=False)), labels)
 
     np.putmask(masked_vals, mask, nan_fill_val)
@@ -1073,12 +1084,9 @@ def rank_1d(
             check_mask,
             check_labels,
             keep_na,
+            pct,
             N,
         )
-        if pct:
-            for i in range(N):
-                if grp_sizes[i] != 0:
-                    out[i] = out[i] / grp_sizes[i]
 
     return np.array(out)
 
@@ -1097,6 +1105,7 @@ cdef void rank_sorted_1d(
     bint check_mask,
     bint check_labels,
     bint keep_na,
+    bint pct,
     Py_ssize_t N,
 ) nogil:
     """
@@ -1108,7 +1117,7 @@ cdef void rank_sorted_1d(
     out : float64_t[::1]
         Array to store computed ranks
     grp_sizes : int64_t[::1]
-        Array to store group counts.
+        Array to store group counts, only used if pct=True
     labels : See rank_1d.__doc__
     sort_indexer : intp_t[:]
         Array of indices which sorts masked_vals
@@ -1118,12 +1127,14 @@ cdef void rank_sorted_1d(
         Array where entries are True if the value is missing, False otherwise
     tiebreak : TiebreakEnumType
         See rank_1d.__doc__ for the different modes
-    check_mask : bint
+    check_mask : bool
         If False, assumes the mask is all False to skip mask indexing
-    check_labels : bint
+    check_labels : bool
         If False, assumes all labels are the same to skip group handling logic
-    keep_na : bint
+    keep_na : bool
         Whether or not to keep nulls
+    pct : bool
+        Compute percentage rank of data within each group
     N : Py_ssize_t
         The number of elements to rank. Note: it is not always true that
         N == len(out) or N == len(masked_vals) (see `nancorr_spearman` usage for why)
@@ -1342,6 +1353,11 @@ cdef void rank_sorted_1d(
                     grp_start = i + 1
                     grp_vals_seen = 1
 
+    if pct:
+        for i in range(N):
+            if grp_sizes[i] != 0:
+                out[i] = out[i] / grp_sizes[i]
+
 
 def rank_2d(
     ndarray[rank_t, ndim=2] in_arr,
@@ -1360,13 +1376,14 @@ def rank_2d(
         Py_ssize_t infs
         ndarray[float64_t, ndim=2] ranks
         ndarray[rank_t, ndim=2] values
+        ndarray[rank_t, ndim=1] unused
         ndarray[intp_t, ndim=2] argsort_indexer
         ndarray[uint8_t, ndim=2] mask
-        rank_t val, nan_value
+        rank_t val, nan_fill_val
         float64_t count, sum_ranks = 0.0
         int tiebreak = 0
         int64_t idx
-        bint check_mask, condition, keep_na
+        bint check_mask, condition, keep_na, nans_rank_highest
 
     tiebreak = tiebreakers[ties_method]
 
@@ -1384,26 +1401,11 @@ def rank_2d(
         if values.dtype != np.object_:
             values = values.astype('O')
 
+    nans_rank_highest = ascending ^ (na_option == 'top')
     if check_mask:
-        if ascending ^ (na_option == 'top'):
-            if rank_t is object:
-                nan_value = Infinity()
-            elif rank_t is float64_t:
-                nan_value = np.inf
-
-            # int64 and datetimelike
-            else:
-                nan_value = np.iinfo(np.int64).max
-
-        else:
-            if rank_t is object:
-                nan_value = NegInfinity()
-            elif rank_t is float64_t:
-                nan_value = -np.inf
-
-            # int64 and datetimelike
-            else:
-                nan_value = NPY_NAT
+        # For fused type specialization
+        unused = values[:, 0]
+        nan_fill_val = get_rank_nan_fill_val(nans_rank_highest, unused)
 
         if rank_t is object:
             mask = missing.isnaobj2d(values)
@@ -1414,7 +1416,7 @@ def rank_2d(
         else:
             mask = values == NPY_NAT
 
-        np.putmask(values, mask, nan_value)
+        np.putmask(values, mask, nan_fill_val)
     else:
         mask = np.zeros_like(values, dtype=bool)
 

From b840b74040de4390ccd86cdee7e536b297f6a590 Mon Sep 17 00:00:00 2001
From: Matthew Zeitlin <mzeitlin@caltech.edu>
Date: Wed, 9 Jun 2021 21:58:55 -0400
Subject: [PATCH 2/3] Handle empty case early

---
 pandas/_libs/algos.pyx | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx
index 4fd515113316c..c55d1e9898b79 100644
--- a/pandas/_libs/algos.pyx
+++ b/pandas/_libs/algos.pyx
@@ -1385,6 +1385,9 @@ def rank_2d(
         int64_t idx
         bint check_mask, condition, keep_na, nans_rank_highest
 
+    if in_arr.shape[0] == 0 or in_arr.shape[1] == 0:
+        return np.empty_like(in_arr, dtype="f8")
+
     tiebreak = tiebreakers[ties_method]
 
     keep_na = na_option == 'keep'

From 61540043a269e82841264e2a4524cc1b1b9c5579 Mon Sep 17 00:00:00 2001
From: Matthew Zeitlin <mzeitlin@caltech.edu>
Date: Thu, 10 Jun 2021 14:32:49 -0400
Subject: [PATCH 3/3] Cleaner fused type handling

---
 pandas/_libs/algos.pyx | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx
index c55d1e9898b79..7e6521deac052 100644
--- a/pandas/_libs/algos.pyx
+++ b/pandas/_libs/algos.pyx
@@ -931,7 +931,7 @@ ctypedef fused rank_t:
     int64_t
 
 
-cdef rank_t get_rank_nan_fill_val(bint rank_nans_highest, ndarray[rank_t, ndim=1] _):
+cdef rank_t get_rank_nan_fill_val(bint rank_nans_highest, rank_t[:] _=None):
     """
     Return the value we'll use to represent missing values when sorting depending
     on if we'd like missing values to end up at the top/bottom. (The second parameter
@@ -1053,7 +1053,7 @@ def rank_1d(
     # will flip the ordering to still end up with lowest rank.
     # Symmetric logic applies to `na_option == 'bottom'`
     nans_rank_highest = ascending ^ (na_option == 'top')
-    nan_fill_val = get_rank_nan_fill_val(nans_rank_highest, masked_vals)
+    nan_fill_val = get_rank_nan_fill_val[rank_t](nans_rank_highest)
     if nans_rank_highest:
         order = (masked_vals, mask, labels)
     else:
@@ -1376,7 +1376,6 @@ def rank_2d(
         Py_ssize_t infs
         ndarray[float64_t, ndim=2] ranks
         ndarray[rank_t, ndim=2] values
-        ndarray[rank_t, ndim=1] unused
         ndarray[intp_t, ndim=2] argsort_indexer
         ndarray[uint8_t, ndim=2] mask
         rank_t val, nan_fill_val
@@ -1385,9 +1384,6 @@ def rank_2d(
         int64_t idx
         bint check_mask, condition, keep_na, nans_rank_highest
 
-    if in_arr.shape[0] == 0 or in_arr.shape[1] == 0:
-        return np.empty_like(in_arr, dtype="f8")
-
     tiebreak = tiebreakers[ties_method]
 
     keep_na = na_option == 'keep'
@@ -1406,9 +1402,7 @@ def rank_2d(
 
     nans_rank_highest = ascending ^ (na_option == 'top')
     if check_mask:
-        # For fused type specialization
-        unused = values[:, 0]
-        nan_fill_val = get_rank_nan_fill_val(nans_rank_highest, unused)
+        nan_fill_val = get_rank_nan_fill_val[rank_t](nans_rank_highest)
 
         if rank_t is object:
             mask = missing.isnaobj2d(values)