TYP: libgroupby int64->intp #40635

Merged: 3 commits, Mar 26, 2021
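The diff below retypes the group-label arrays threaded through pandas/_libs/algos.pyx and pandas/_libs/groupby.pyx from int64 to the platform indexing integer intp. As a minimal illustration of the dtype involved, using only public NumPy/pandas APIs (the variable names are illustrative, not taken from this PR):

```python
import numpy as np
import pandas as pd

# np.intp is the platform's pointer-sized signed integer: 64 bits on 64-bit
# builds (where it coincides with int64) and 32 bits on 32-bit builds.
print(np.dtype(np.intp).itemsize, np.dtype(np.int64).itemsize)

# Group labels are just integer codes per row; after this change the Cython
# routines expect them as intp rather than int64.
keys = pd.Series(["a", "b", "a", "b", "a"])
codes, uniques = pd.factorize(keys)        # codes may come back as int64
labels = np.asarray(codes, dtype=np.intp)  # ...so cast to platform ints
ngroups = len(uniques)
print(labels, ngroups)                     # [0 1 0 1 0] 2
```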
9 changes: 5 additions & 4 deletions pandas/_libs/algos.pyx
@@ -490,7 +490,7 @@ def nancorr_kendall(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarray
int64_t total_discordant = 0
float64_t kendall_tau
int64_t n_obs
const int64_t[:] labels_n
const intp_t[:] labels_n

N, K = (<object>mat).shape

@@ -499,7 +499,7 @@ def nancorr_kendall(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarray

ranked_mat = np.empty((N, K), dtype=np.float64)
# For compatibility when calling rank_1d
labels_n = np.zeros(N, dtype=np.int64)
labels_n = np.zeros(N, dtype=np.intp)

for i in range(K):
ranked_mat[:, i] = rank_1d(mat[:, i], labels_n)
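nancorr_kendall builds a dummy all-zeros label array purely so it can reuse rank_1d, which after this change expects intp labels. A rough sketch of that call pattern, mirroring the hunk above; rank_1d is an internal Cython routine rather than a public API, and its signature here is assumed from this diff:

```python
import numpy as np
from pandas._libs.algos import rank_1d  # internal; not a stable interface

values = np.array([3.0, 1.0, 2.0])
# Outside of a groupby there is effectively one group, so the labels are
# all zeros; after this PR they must be platform ints (intp), not int64.
labels_n = np.zeros(len(values), dtype=np.intp)
print(rank_1d(values, labels_n))  # array([3., 1., 2.]) -- average ranks
```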
@@ -959,7 +959,7 @@ ctypedef fused rank_t:
@cython.boundscheck(False)
def rank_1d(
ndarray[rank_t, ndim=1] values,
const int64_t[:] labels,
const intp_t[:] labels,
ties_method="average",
bint ascending=True,
bint pct=False,
@@ -971,7 +971,7 @@ def rank_1d(
Parameters
----------
values : array of rank_t values to be ranked
labels : array containing unique label for each group, with its ordering
labels : np.ndarray[np.intp]
Array containing unique label for each group, with its ordering
matching up to the corresponding record in `values`. If not called
from a groupby operation, will be an array of 0's
ties_method : {'average', 'min', 'max', 'first', 'dense'}, default
88 changes: 46 additions & 42 deletions pandas/_libs/groupby.pyx
@@ -105,7 +105,7 @@ cdef inline float64_t median_linear(float64_t* a, int n) nogil:
def group_median_float64(ndarray[float64_t, ndim=2] out,
ndarray[int64_t] counts,
ndarray[float64_t, ndim=2] values,
ndarray[int64_t] labels,
ndarray[intp_t] labels,
Py_ssize_t min_count=-1):
"""
Only aggregates on axis=0
@@ -122,7 +122,7 @@ def group_median_float64(ndarray[float64_t, ndim=2] out,
ngroups = len(counts)
N, K = (<object>values).shape

indexer, _counts = groupsort_indexer(ensure_platform_int(labels), ngroups)
indexer, _counts = groupsort_indexer(labels, ngroups)
counts[:] = _counts[1:]

data = np.empty((K, N), dtype=np.float64)
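group_median_float64 no longer wraps labels in ensure_platform_int before handing them to groupsort_indexer, because the labels now arrive as intp already. For orientation, a simplified pure-Python sketch of what a groupsort-style indexer computes (a re-implementation for illustration, not the pandas function itself):

```python
import numpy as np

def groupsort_indexer_sketch(labels, ngroups):
    # Counting sort over labels: rows with label -1 (missing keys) come
    # first, then the rows of group 0, group 1, ... in stable order.
    counts = np.zeros(ngroups + 1, dtype=np.intp)
    for lab in labels:
        counts[lab + 1] += 1
    starts = np.zeros(ngroups + 1, dtype=np.intp)
    starts[1:] = np.cumsum(counts)[:-1]
    indexer = np.empty(len(labels), dtype=np.intp)
    for i, lab in enumerate(labels):
        indexer[starts[lab + 1]] = i
        starts[lab + 1] += 1
    return indexer, counts  # counts[0] is the count of unlabelled rows

labels = np.array([1, 0, 1, -1], dtype=np.intp)
print(groupsort_indexer_sketch(labels, ngroups=2))
# (array([3, 1, 0, 2]), array([1, 1, 2]))
```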
@@ -145,7 +145,7 @@ def group_median_float64(ndarray[float64_t, ndim=2] out,
@cython.wraparound(False)
def group_cumprod_float64(float64_t[:, ::1] out,
const float64_t[:, :] values,
const int64_t[:] labels,
const intp_t[:] labels,
int ngroups,
bint is_datetimelike,
bint skipna=True):
@@ -158,7 +158,7 @@ def group_cumprod_float64(float64_t[:, ::1] out,
Array to store cumprod in.
values : float64 array
Values to take cumprod of.
labels : int64 array
labels : np.ndarray[np.intp]
Labels to group by.
ngroups : int
Number of groups, larger than all entries of `labels`.
@@ -175,7 +175,7 @@ def group_cumprod_float64(float64_t[:, ::1] out,
Py_ssize_t i, j, N, K, size
float64_t val
float64_t[:, ::1] accum
int64_t lab
intp_t lab

N, K = (<object>values).shape
accum = np.ones((ngroups, K), dtype=np.float64)
@@ -202,7 +202,7 @@ def group_cumprod_float64(float64_t[:, ::1] out,
@cython.wraparound(False)
def group_cumsum(numeric[:, ::1] out,
ndarray[numeric, ndim=2] values,
const int64_t[:] labels,
const intp_t[:] labels,
int ngroups,
is_datetimelike,
bint skipna=True):
@@ -215,7 +215,7 @@ def group_cumsum(numeric[:, ::1] out,
Array to store cumsum in.
values : array
Values to take cumsum of.
labels : int64 array
labels : np.ndarray[np.intp]
Labels to group by.
ngroups : int
Number of groups, larger than all entries of `labels`.
@@ -232,7 +232,7 @@ def group_cumsum(numeric[:, ::1] out,
Py_ssize_t i, j, N, K, size
numeric val, y, t
numeric[:, ::1] accum, compensation
int64_t lab
intp_t lab

N, K = (<object>values).shape
accum = np.zeros((ngroups, K), dtype=np.asarray(values).dtype)
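group_cumsum keeps a per-group running total in accum (plus a compensation buffer for Kahan summation, the val/y/t temporaries above). Stripped of the Kahan correction and the skipna/NaN handling, the loop amounts to something like this sketch:

```python
import numpy as np

def grouped_cumsum_sketch(values, labels, ngroups):
    # Running sum per label; label -1 means the row belongs to no group,
    # which this simplified version marks with NaN.
    out = np.empty(len(values), dtype=np.float64)
    accum = np.zeros(ngroups, dtype=np.float64)
    for i, (lab, val) in enumerate(zip(labels, values)):
        if lab < 0:
            out[i] = np.nan
            continue
        accum[lab] += val
        out[i] = accum[lab]
    return out

labels = np.array([0, 1, 0, 1, 0], dtype=np.intp)
print(grouped_cumsum_sketch([1.0, 2.0, 3.0, 4.0, 5.0], labels, ngroups=2))
# [1. 2. 4. 6. 9.]
```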
@@ -269,12 +269,12 @@ def group_cumsum(numeric[:, ::1] out,

@cython.boundscheck(False)
@cython.wraparound(False)
def group_shift_indexer(int64_t[::1] out, const int64_t[:] labels,
def group_shift_indexer(int64_t[::1] out, const intp_t[:] labels,
int ngroups, int periods):
cdef:
Py_ssize_t N, i, j, ii
Py_ssize_t N, i, j, ii, lab
int offset = 0, sign
int64_t lab, idxer, idxer_slot
int64_t idxer, idxer_slot
int64_t[::1] label_seen = np.zeros(ngroups, dtype=np.int64)
int64_t[:, ::1] label_indexer
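In group_shift_indexer the loop variable lab moves from the int64_t declaration into the Py_ssize_t one. Py_ssize_t and np.intp are both the platform's signed, pointer-sized integer, so the label values fit either way; a quick check of that assumption on a typical build:

```python
import sys
import numpy as np

# np.intp spans the same range as Py_ssize_t on common platforms, which is
# why a label read from an intp_t[:] buffer fits in a Py_ssize_t variable.
print(np.iinfo(np.intp).max == sys.maxsize)  # True on typical CPython builds
```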

@@ -321,7 +321,7 @@ def group_shift_indexer(int64_t[::1] out, const int64_t[:] labels,

@cython.wraparound(False)
@cython.boundscheck(False)
def group_fillna_indexer(ndarray[int64_t] out, ndarray[int64_t] labels,
def group_fillna_indexer(ndarray[int64_t] out, ndarray[intp_t] labels,
ndarray[uint8_t] mask, object direction,
int64_t limit, bint dropna):
"""
@@ -331,8 +331,9 @@ def group_fillna_indexer(ndarray[int64_t] out, ndarray[int64_t] labels,
----------
out : array of int64_t values which this method will write its results to
Missing values will be written to with a value of -1
labels : array containing unique label for each group, with its ordering
matching up to the corresponding record in `values`
labels : np.ndarray[np.intp]
Array containing unique label for each group, with its ordering
matching up to the corresponding record in `values`.
mask : array of int64_t values where a 1 indicates a missing value
direction : {'ffill', 'bfill'}
Direction for fill to be applied (forwards or backwards, respectively)
@@ -344,17 +345,18 @@ def group_fillna_indexer(ndarray[int64_t] out, ndarray[int64_t] labels,
This method modifies the `out` parameter rather than returning an object
"""
cdef:
Py_ssize_t i, N
int64_t[:] sorted_labels
int64_t idx, curr_fill_idx=-1, filled_vals=0
Py_ssize_t i, N, idx
intp_t[:] sorted_labels
intp_t curr_fill_idx=-1
int64_t filled_vals = 0

N = len(out)

# Make sure all arrays are the same size
assert N == len(labels) == len(mask)

sorted_labels = np.argsort(labels, kind='mergesort').astype(
np.int64, copy=False)
np.intp, copy=False)
if direction == 'bfill':
sorted_labels = sorted_labels[::-1]
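group_fillna_indexer sorts the labels with np.argsort, which already returns a platform-sized integer array, so the astype(np.intp, copy=False) above only pins the declared type without forcing a copy; the old cast to int64 could force a real conversion on 32-bit builds. A small demonstration:

```python
import numpy as np

labels = np.array([1, 0, 1, 0, -1], dtype=np.intp)
order = np.argsort(labels, kind="mergesort")
print(order.dtype)                    # intp
same = order.astype(np.intp, copy=False)
print(same is order)                  # True: no copy was made
```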

@@ -385,7 +387,7 @@ def group_fillna_indexer(ndarray[int64_t] out, ndarray[int64_t] labels,
@cython.wraparound(False)
def group_any_all(uint8_t[::1] out,
const uint8_t[::1] values,
const int64_t[:] labels,
const intp_t[:] labels,
const uint8_t[::1] mask,
object val_test,
bint skipna):
@@ -395,7 +397,8 @@ def group_any_all(uint8_t[::1] out,
Parameters
----------
out : array of values which this method will write its results to
labels : array containing unique label for each group, with its
labels : np.ndarray[np.intp]
Array containing unique label for each group, with its
ordering matching up to the corresponding record in `values`
values : array containing the truth value of each element
mask : array indicating whether a value is na or not
@@ -411,7 +414,7 @@ def group_any_all(uint8_t[::1] out,
"""
cdef:
Py_ssize_t i, N = len(labels)
int64_t lab
intp_t lab
uint8_t flag_val

if val_test == 'all':
@@ -455,7 +458,7 @@ ctypedef fused complexfloating_t:
def _group_add(complexfloating_t[:, ::1] out,
int64_t[::1] counts,
ndarray[complexfloating_t, ndim=2] values,
const int64_t[:] labels,
const intp_t[:] labels,
Py_ssize_t min_count=0):
"""
Only aggregates on axis=0 using Kahan summation
@@ -514,7 +517,7 @@ group_add_complex128 = _group_add['double complex']
def _group_prod(floating[:, ::1] out,
int64_t[::1] counts,
ndarray[floating, ndim=2] values,
const int64_t[:] labels,
const intp_t[:] labels,
Py_ssize_t min_count=0):
"""
Only aggregates on axis=0
@@ -567,7 +570,7 @@ group_prod_float64 = _group_prod['double']
def _group_var(floating[:, ::1] out,
int64_t[::1] counts,
ndarray[floating, ndim=2] values,
const int64_t[:] labels,
const intp_t[:] labels,
Py_ssize_t min_count=-1,
int64_t ddof=1):
cdef:
@@ -625,7 +628,7 @@ group_var_float64 = _group_var['double']
def _group_mean(floating[:, ::1] out,
int64_t[::1] counts,
ndarray[floating, ndim=2] values,
const int64_t[::1] labels,
const intp_t[::1] labels,
Py_ssize_t min_count=-1):
cdef:
Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
@@ -681,7 +684,7 @@ group_mean_float64 = _group_mean['double']
def _group_ohlc(floating[:, ::1] out,
int64_t[::1] counts,
ndarray[floating, ndim=2] values,
const int64_t[:] labels,
const intp_t[:] labels,
Py_ssize_t min_count=-1):
"""
Only aggregates on axis=0
@@ -732,7 +735,7 @@ group_ohlc_float64 = _group_ohlc['double']
@cython.wraparound(False)
def group_quantile(ndarray[float64_t] out,
ndarray[numeric, ndim=1] values,
ndarray[int64_t] labels,
ndarray[intp_t] labels,
ndarray[uint8_t] mask,
float64_t q,
object interpolation):
@@ -743,7 +746,7 @@ def group_quantile(ndarray[float64_t] out,
----------
out : ndarray
Array of aggregated values that will be written to.
labels : ndarray
labels : ndarray[np.intp]
Array containing the unique group labels.
values : ndarray
Array containing the values to apply the function against.
@@ -758,7 +761,7 @@ def group_quantile(ndarray[float64_t] out,
cdef:
Py_ssize_t i, N=len(labels), ngroups, grp_sz, non_na_sz
Py_ssize_t grp_start=0, idx=0
int64_t lab
intp_t lab
uint8_t interp
float64_t q_idx, frac, val, next_val
ndarray[int64_t] counts, non_na_counts, sort_arr
@@ -875,7 +878,7 @@ cdef inline bint _treat_as_na(rank_t val, bint is_datetimelike) nogil:
def group_last(rank_t[:, ::1] out,
int64_t[::1] counts,
ndarray[rank_t, ndim=2] values,
const int64_t[:] labels,
const intp_t[:] labels,
Py_ssize_t min_count=-1):
"""
Only aggregates on axis=0
@@ -967,7 +970,7 @@ def group_last(rank_t[:, ::1] out,
def group_nth(rank_t[:, ::1] out,
int64_t[::1] counts,
ndarray[rank_t, ndim=2] values,
const int64_t[:] labels,
const intp_t[:] labels,
int64_t min_count=-1, int64_t rank=1
):
"""
@@ -1059,7 +1062,7 @@ def group_nth(rank_t[:, ::1] out,
@cython.wraparound(False)
def group_rank(float64_t[:, ::1] out,
ndarray[rank_t, ndim=2] values,
const int64_t[:] labels,
const intp_t[:] labels,
int ngroups,
bint is_datetimelike, object ties_method="average",
bint ascending=True, bint pct=False, object na_option="keep"):
@@ -1070,7 +1073,8 @@ def group_rank(float64_t[:, ::1] out,
----------
out : array of float64_t values which this method will write its results to
values : array of rank_t values to be ranked
labels : array containing unique label for each group, with its ordering
labels : np.ndarray[np.intp]
Array containing unique label for each group, with its ordering
matching up to the corresponding record in `values`
ngroups : int
This parameter is not used, is needed to match signatures of other
@@ -1131,7 +1135,7 @@ ctypedef fused groupby_t:
cdef group_min_max(groupby_t[:, ::1] out,
int64_t[::1] counts,
ndarray[groupby_t, ndim=2] values,
const int64_t[:] labels,
const intp_t[:] labels,
Py_ssize_t min_count=-1,
bint compute_max=True):
"""
@@ -1145,7 +1149,7 @@ cdef group_min_max(groupby_t[:, ::1] out,
Input as a zeroed array, populated by group sizes during algorithm
values : array
Values to find column-wise min/max of.
labels : int64 array
labels : np.ndarray[np.intp]
Labels to group by.
min_count : Py_ssize_t, default -1
The minimum number of non-NA group elements, NA result if threshold
@@ -1230,7 +1234,7 @@ cdef group_min_max(groupby_t[:, ::1] out,
def group_max(groupby_t[:, ::1] out,
int64_t[::1] counts,
ndarray[groupby_t, ndim=2] values,
const int64_t[:] labels,
const intp_t[:] labels,
Py_ssize_t min_count=-1):
"""See group_min_max.__doc__"""
group_min_max(out, counts, values, labels, min_count=min_count, compute_max=True)
@@ -1241,7 +1245,7 @@ def group_max(groupby_t[:, ::1] out,
def group_min(groupby_t[:, ::1] out,
int64_t[::1] counts,
ndarray[groupby_t, ndim=2] values,
const int64_t[:] labels,
const intp_t[:] labels,
Py_ssize_t min_count=-1):
"""See group_min_max.__doc__"""
group_min_max(out, counts, values, labels, min_count=min_count, compute_max=False)
@@ -1251,7 +1255,7 @@ def group_min(groupby_t[:, ::1] out,
@cython.wraparound(False)
def group_cummin_max(groupby_t[:, ::1] out,
ndarray[groupby_t, ndim=2] values,
const int64_t[:] labels,
const intp_t[:] labels,
int ngroups,
bint is_datetimelike,
bint compute_max):
@@ -1264,7 +1268,7 @@ def group_cummin_max(groupby_t[:, ::1] out,
Array to store cummin/max in.
values : array
Values to take cummin/max of.
labels : int64 array
labels : np.ndarray[np.intp]
Labels to group by.
ngroups : int
Number of groups, larger than all entries of `labels`.
@@ -1282,7 +1286,7 @@ def group_cummin_max(groupby_t[:, ::1] out,
Py_ssize_t i, j, N, K, size
groupby_t val, mval
ndarray[groupby_t, ndim=2] accum
int64_t lab
intp_t lab

N, K = (<object>values).shape
accum = np.empty((ngroups, K), dtype=np.asarray(values).dtype)
@@ -1319,7 +1323,7 @@ def group_cummin_max(groupby_t[:, ::1] out,
@cython.wraparound(False)
def group_cummin(groupby_t[:, ::1] out,
ndarray[groupby_t, ndim=2] values,
const int64_t[:] labels,
const intp_t[:] labels,
int ngroups,
bint is_datetimelike):
"""See group_cummin_max.__doc__"""
@@ -1330,7 +1334,7 @@ def group_cummin(groupby_t[:, ::1] out,
@cython.wraparound(False)
def group_cummax(groupby_t[:, ::1] out,
ndarray[groupby_t, ndim=2] values,
const int64_t[:] labels,
const intp_t[:] labels,
int ngroups,
bint is_datetimelike):
"""See group_cummin_max.__doc__"""