TYP: libgroupby int64->intp #40635

Merged: 3 commits, Mar 26, 2021
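The diff below retypes the group-label arrays threaded through pandas/_libs/algos.pyx and pandas/_libs/groupby.pyx from int64 to the platform indexing integer intp. As a minimal illustration of the dtype involved, using only public NumPy/pandas APIs (the variable names are illustrative, not taken from this PR):

```python
import numpy as np
import pandas as pd

# np.intp is the platform's pointer-sized signed integer: 64 bits on 64-bit
# builds (where it coincides with int64) and 32 bits on 32-bit builds.
print(np.dtype(np.intp).itemsize, np.dtype(np.int64).itemsize)

# Group labels are just integer codes per row; after this change the Cython
# routines expect them as intp rather than int64.
keys = pd.Series(["a", "b", "a", "b", "a"])
codes, uniques = pd.factorize(keys)        # codes may come back as int64
labels = np.asarray(codes, dtype=np.intp)  # ...so cast to platform ints
ngroups = len(uniques)
print(labels, ngroups)                     # [0 1 0 1 0] 2
```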
9 changes: 5 additions & 4 deletions pandas/_libs/algos.pyx
@@ -490,7 +490,7 @@ def nancorr_kendall(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarray
int64_t total_discordant = 0
float64_t kendall_tau
int64_t n_obs
const int64_t[:] labels_n
const intp_t[:] labels_n

N, K = (<object>mat).shape

@@ -499,7 +499,7 @@ def nancorr_kendall(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarray

ranked_mat = np.empty((N, K), dtype=np.float64)
# For compatibility when calling rank_1d
labels_n = np.zeros(N, dtype=np.int64)
labels_n = np.zeros(N, dtype=np.intp)

for i in range(K):
ranked_mat[:, i] = rank_1d(mat[:, i], labels_n)
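nancorr_kendall builds a dummy all-zeros label array purely so it can reuse rank_1d, which after this change expects intp labels. A rough sketch of that call pattern, mirroring the hunk above; rank_1d is an internal Cython routine rather than a public API, and its signature here is assumed from this diff:

```python
import numpy as np
from pandas._libs.algos import rank_1d  # internal; not a stable interface

values = np.array([3.0, 1.0, 2.0])
# Outside of a groupby there is effectively one group, so the labels are
# all zeros; after this PR they must be platform ints (intp), not int64.
labels_n = np.zeros(len(values), dtype=np.intp)
print(rank_1d(values, labels_n))  # array([3., 1., 2.]) -- average ranks
```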
@@ -959,7 +959,7 @@ ctypedef fused rank_t:
@cython.boundscheck(False)
def rank_1d(
ndarray[rank_t, ndim=1] values,
const int64_t[:] labels,
const intp_t[:] labels,
ties_method="average",
bint ascending=True,
bint pct=False,
@@ -971,7 +971,7 @@ def rank_1d(
Parameters
----------
values : array of rank_t values to be ranked
labels : array containing unique label for each group, with its ordering
labels : np.ndarray[np.intp]
Array containing unique label for each group, with its ordering
matching up to the corresponding record in `values`. If not called
from a groupby operation, will be an array of 0's
ties_method : {'average', 'min', 'max', 'first', 'dense'}, default
88 changes: 46 additions & 42 deletions pandas/_libs/groupby.pyx
@@ -105,7 +105,7 @@ cdef inline float64_t median_linear(float64_t* a, int n) nogil:
def group_median_float64(ndarray[float64_t, ndim=2] out,
ndarray[int64_t] counts,
ndarray[float64_t, ndim=2] values,
ndarray[int64_t] labels,
ndarray[intp_t] labels,
Py_ssize_t min_count=-1):
"""
Only aggregates on axis=0
@@ -122,7 +122,7 @@ def group_median_float64(ndarray[float64_t, ndim=2] out,
ngroups = len(counts)
N, K = (<object>values).shape

indexer, _counts = groupsort_indexer(ensure_platform_int(labels), ngroups)
indexer, _counts = groupsort_indexer(labels, ngroups)
counts[:] = _counts[1:]

data = np.empty((K, N), dtype=np.float64)
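group_median_float64 no longer wraps labels in ensure_platform_int before handing them to groupsort_indexer, because the labels now arrive as intp already. For orientation, a simplified pure-Python sketch of what a groupsort-style indexer computes (a re-implementation for illustration, not the pandas function itself):

```python
import numpy as np

def groupsort_indexer_sketch(labels, ngroups):
    # Counting sort over labels: rows with label -1 (missing keys) come
    # first, then the rows of group 0, group 1, ... in stable order.
    counts = np.zeros(ngroups + 1, dtype=np.intp)
    for lab in labels:
        counts[lab + 1] += 1
    starts = np.zeros(ngroups + 1, dtype=np.intp)
    starts[1:] = np.cumsum(counts)[:-1]
    indexer = np.empty(len(labels), dtype=np.intp)
    for i, lab in enumerate(labels):
        indexer[starts[lab + 1]] = i
        starts[lab + 1] += 1
    return indexer, counts  # counts[0] is the count of unlabelled rows

labels = np.array([1, 0, 1, -1], dtype=np.intp)
print(groupsort_indexer_sketch(labels, ngroups=2))
# (array([3, 1, 0, 2]), array([1, 1, 2]))
```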
@@ -145,7 +145,7 @@ def group_median_float64(ndarray[float64_t, ndim=2] out,
@cython.wraparound(False)
def group_cumprod_float64(float64_t[:, ::1] out,
const float64_t[:, :] values,
const int64_t[:] labels,
const intp_t[:] labels,
int ngroups,
bint is_datetimelike,
bint skipna=True):
@@ -158,7 +158,7 @@ def group_cumprod_float64(float64_t[:, ::1] out,
Array to store cumprod in.
values : float64 array
Values to take cumprod of.
labels : int64 array
labels : np.ndarray[np.intp]
Labels to group by.
ngroups : int
Number of groups, larger than all entries of `labels`.
@@ -175,7 +175,7 @@ def group_cumprod_float64(float64_t[:, ::1] out,
Py_ssize_t i, j, N, K, size
float64_t val
float64_t[:, ::1] accum
int64_t lab
intp_t lab

N, K = (<object>values).shape
accum = np.ones((ngroups, K), dtype=np.float64)
@@ -202,7 +202,7 @@ def group_cumprod_float64(float64_t[:, ::1] out,
@cython.wraparound(False)
def group_cumsum(numeric[:, ::1] out,
ndarray[numeric, ndim=2] values,
const int64_t[:] labels,
const intp_t[:] labels,
int ngroups,
is_datetimelike,
bint skipna=True):
@@ -215,7 +215,7 @@ def group_cumsum(numeric[:, ::1] out,
Array to store cumsum in.
values : array
Values to take cumsum of.
labels : int64 array
labels : np.ndarray[np.intp]
Labels to group by.
ngroups : int
Number of groups, larger than all entries of `labels`.
@@ -232,7 +232,7 @@ def group_cumsum(numeric[:, ::1] out,
Py_ssize_t i, j, N, K, size
numeric val, y, t
numeric[:, ::1] accum, compensation
int64_t lab
intp_t lab

N, K = (<object>values).shape
accum = np.zeros((ngroups, K), dtype=np.asarray(values).dtype)
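group_cumsum keeps a per-group running total in accum (plus a compensation buffer for Kahan summation, the val/y/t temporaries above). Stripped of the Kahan correction and the skipna/NaN handling, the loop amounts to something like this sketch:

```python
import numpy as np

def grouped_cumsum_sketch(values, labels, ngroups):
    # Running sum per label; label -1 means the row belongs to no group,
    # which this simplified version marks with NaN.
    out = np.empty(len(values), dtype=np.float64)
    accum = np.zeros(ngroups, dtype=np.float64)
    for i, (lab, val) in enumerate(zip(labels, values)):
        if lab < 0:
            out[i] = np.nan
            continue
        accum[lab] += val
        out[i] = accum[lab]
    return out

labels = np.array([0, 1, 0, 1, 0], dtype=np.intp)
print(grouped_cumsum_sketch([1.0, 2.0, 3.0, 4.0, 5.0], labels, ngroups=2))
# [1. 2. 4. 6. 9.]
```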
@@ -269,12 +269,12 @@ def group_cumsum(numeric[:, ::1] out,

@cython.boundscheck(False)
@cython.wraparound(False)
def group_shift_indexer(int64_t[::1] out, const int64_t[:] labels,
def group_shift_indexer(int64_t[::1] out, const intp_t[:] labels,
int ngroups, int periods):
cdef:
Py_ssize_t N, i, j, ii
Py_ssize_t N, i, j, ii, lab
int offset = 0, sign
int64_t lab, idxer, idxer_slot
int64_t idxer, idxer_slot
int64_t[::1] label_seen = np.zeros(ngroups, dtype=np.int64)
int64_t[:, ::1] label_indexer
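In group_shift_indexer the loop variable lab moves from the int64_t declaration into the Py_ssize_t one. Py_ssize_t and np.intp are both the platform's signed, pointer-sized integer, so the label values fit either way; a quick check of that assumption on a typical build:

```python
import sys
import numpy as np

# np.intp spans the same range as Py_ssize_t on common platforms, which is
# why a label read from an intp_t[:] buffer fits in a Py_ssize_t variable.
print(np.iinfo(np.intp).max == sys.maxsize)  # True on typical CPython builds
```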

@@ -321,7 +321,7 @@ def group_shift_indexer(int64_t[::1] out, const int64_t[:] labels,

@cython.wraparound(False)
@cython.boundscheck(False)
def group_fillna_indexer(ndarray[int64_t] out, ndarray[int64_t] labels,
def group_fillna_indexer(ndarray[int64_t] out, ndarray[intp_t] labels,
ndarray[uint8_t] mask, object direction,
int64_t limit, bint dropna):
"""
@@ -331,8 +331,9 @@ def group_fillna_indexer(ndarray[int64_t] out, ndarray[int64_t] labels,
----------
out : array of int64_t values which this method will write its results to
Missing values will be written to with a value of -1
labels : array containing unique label for each group, with its ordering
matching up to the corresponding record in `values`
labels : np.ndarray[np.intp]
Array containing unique label for each group, with its ordering
matching up to the corresponding record in `values`.
mask : array of int64_t values where a 1 indicates a missing value
direction : {'ffill', 'bfill'}
Direction for fill to be applied (forwards or backwards, respectively)
@@ -344,17 +345,18 @@ def group_fillna_indexer(ndarray[int64_t] out, ndarray[int64_t] labels,
This method modifies the `out` parameter rather than returning an object
"""
cdef:
Py_ssize_t i, N
int64_t[:] sorted_labels
int64_t idx, curr_fill_idx=-1, filled_vals=0
Py_ssize_t i, N, idx
intp_t[:] sorted_labels
intp_t curr_fill_idx=-1
int64_t filled_vals = 0

N = len(out)

# Make sure all arrays are the same size
assert N == len(labels) == len(mask)

sorted_labels = np.argsort(labels, kind='mergesort').astype(
np.int64, copy=False)
np.intp, copy=False)
if direction == 'bfill':
sorted_labels = sorted_labels[::-1]
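group_fillna_indexer sorts the labels with np.argsort, which already returns a platform-sized integer array, so the astype(np.intp, copy=False) above only pins the declared type without forcing a copy; the old cast to int64 could force a real conversion on 32-bit builds. A small demonstration:

```python
import numpy as np

labels = np.array([1, 0, 1, 0, -1], dtype=np.intp)
order = np.argsort(labels, kind="mergesort")
print(order.dtype)                    # intp
same = order.astype(np.intp, copy=False)
print(same is order)                  # True: no copy was made
```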

@@ -385,7 +387,7 @@ def group_fillna_indexer(ndarray[int64_t] out, ndarray[int64_t] labels,
@cython.wraparound(False)
def group_any_all(uint8_t[::1] out,
const uint8_t[::1] values,
const int64_t[:] labels,
const intp_t[:] labels,
const uint8_t[::1] mask,
object val_test,
bint skipna):
@@ -395,7 +397,8 @@ def group_any_all(uint8_t[::1] out,
Parameters
----------
out : array of values which this method will write its results to
labels : array containing unique label for each group, with its
labels : np.ndarray[np.intp]
Array containing unique label for each group, with its
ordering matching up to the corresponding record in `values`
values : array containing the truth value of each element
mask : array indicating whether a value is na or not
@@ -411,7 +414,7 @@ def group_any_all(uint8_t[::1] out,
"""
cdef:
Py_ssize_t i, N = len(labels)
int64_t lab
intp_t lab
uint8_t flag_val

if val_test == 'all':
@@ -455,7 +458,7 @@ ctypedef fused complexfloating_t:
def _group_add(complexfloating_t[:, ::1] out,
int64_t[::1] counts,
ndarray[complexfloating_t, ndim=2] values,
const int64_t[:] labels,
const intp_t[:] labels,
Py_ssize_t min_count=0):
"""
Only aggregates on axis=0 using Kahan summation
@@ -514,7 +517,7 @@ group_add_complex128 = _group_add['double complex']
def _group_prod(floating[:, ::1] out,
int64_t[::1] counts,
ndarray[floating, ndim=2] values,
const int64_t[:] labels,
const intp_t[:] labels,
Py_ssize_t min_count=0):
"""
Only aggregates on axis=0
@@ -567,7 +570,7 @@ group_prod_float64 = _group_prod['double']
def _group_var(floating[:, ::1] out,
int64_t[::1] counts,
ndarray[floating, ndim=2] values,
const int64_t[:] labels,
const intp_t[:] labels,
Py_ssize_t min_count=-1,
int64_t ddof=1):
cdef:
@@ -625,7 +628,7 @@ group_var_float64 = _group_var['double']
def _group_mean(floating[:, ::1] out,
int64_t[::1] counts,
ndarray[floating, ndim=2] values,
const int64_t[::1] labels,
const intp_t[::1] labels,
Py_ssize_t min_count=-1):
cdef:
Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
@@ -681,7 +684,7 @@ group_mean_float64 = _group_mean['double']
def _group_ohlc(floating[:, ::1] out,
int64_t[::1] counts,
ndarray[floating, ndim=2] values,
const int64_t[:] labels,
const intp_t[:] labels,
Py_ssize_t min_count=-1):
"""
Only aggregates on axis=0
@@ -732,7 +735,7 @@ group_ohlc_float64 = _group_ohlc['double']
@cython.wraparound(False)
def group_quantile(ndarray[float64_t] out,
ndarray[numeric, ndim=1] values,
ndarray[int64_t] labels,
ndarray[intp_t] labels,
ndarray[uint8_t] mask,
float64_t q,
object interpolation):
@@ -743,7 +746,7 @@ def group_quantile(ndarray[float64_t] out,
----------
out : ndarray
Array of aggregated values that will be written to.
labels : ndarray
labels : ndarray[np.intp]
Array containing the unique group labels.
values : ndarray
Array containing the values to apply the function against.
@@ -758,7 +761,7 @@ def group_quantile(ndarray[float64_t] out,
cdef:
Py_ssize_t i, N=len(labels), ngroups, grp_sz, non_na_sz
Py_ssize_t grp_start=0, idx=0
int64_t lab
intp_t lab
uint8_t interp
float64_t q_idx, frac, val, next_val
ndarray[int64_t] counts, non_na_counts, sort_arr
@@ -875,7 +878,7 @@ cdef inline bint _treat_as_na(rank_t val, bint is_datetimelike) nogil:
def group_last(rank_t[:, ::1] out,
int64_t[::1] counts,
ndarray[rank_t, ndim=2] values,
const int64_t[:] labels,
const intp_t[:] labels,
Py_ssize_t min_count=-1):
"""
Only aggregates on axis=0
@@ -967,7 +970,7 @@ def group_last(rank_t[:, ::1] out,
def group_nth(rank_t[:, ::1] out,
int64_t[::1] counts,
ndarray[rank_t, ndim=2] values,
const int64_t[:] labels,
const intp_t[:] labels,
int64_t min_count=-1, int64_t rank=1
):
"""
@@ -1059,7 +1062,7 @@ def group_nth(rank_t[:, ::1] out,
@cython.wraparound(False)
def group_rank(float64_t[:, ::1] out,
ndarray[rank_t, ndim=2] values,
const int64_t[:] labels,
const intp_t[:] labels,
int ngroups,
bint is_datetimelike, object ties_method="average",
bint ascending=True, bint pct=False, object na_option="keep"):
@@ -1070,7 +1073,8 @@ def group_rank(float64_t[:, ::1] out,
----------
out : array of float64_t values which this method will write its results to
values : array of rank_t values to be ranked
labels : array containing unique label for each group, with its ordering
labels : np.ndarray[np.intp]
Array containing unique label for each group, with its ordering
matching up to the corresponding record in `values`
ngroups : int
This parameter is not used, is needed to match signatures of other
@@ -1131,7 +1135,7 @@ ctypedef fused groupby_t:
cdef group_min_max(groupby_t[:, ::1] out,
int64_t[::1] counts,
ndarray[groupby_t, ndim=2] values,
const int64_t[:] labels,
const intp_t[:] labels,
Py_ssize_t min_count=-1,
bint compute_max=True):
"""
@@ -1145,7 +1149,7 @@ cdef group_min_max(groupby_t[:, ::1] out,
Input as a zeroed array, populated by group sizes during algorithm
values : array
Values to find column-wise min/max of.
labels : int64 array
labels : np.ndarray[np.intp]
Labels to group by.
min_count : Py_ssize_t, default -1
The minimum number of non-NA group elements, NA result if threshold
@@ -1230,7 +1234,7 @@ cdef group_min_max(groupby_t[:, ::1] out,
def group_max(groupby_t[:, ::1] out,
int64_t[::1] counts,
ndarray[groupby_t, ndim=2] values,
const int64_t[:] labels,
const intp_t[:] labels,
Py_ssize_t min_count=-1):
"""See group_min_max.__doc__"""
group_min_max(out, counts, values, labels, min_count=min_count, compute_max=True)
@@ -1241,7 +1245,7 @@ def group_max(groupby_t[:, ::1] out,
def group_min(groupby_t[:, ::1] out,
int64_t[::1] counts,
ndarray[groupby_t, ndim=2] values,
const int64_t[:] labels,
const intp_t[:] labels,
Py_ssize_t min_count=-1):
"""See group_min_max.__doc__"""
group_min_max(out, counts, values, labels, min_count=min_count, compute_max=False)
@@ -1251,7 +1255,7 @@ def group_min(groupby_t[:, ::1] out,
@cython.wraparound(False)
def group_cummin_max(groupby_t[:, ::1] out,
ndarray[groupby_t, ndim=2] values,
const int64_t[:] labels,
const intp_t[:] labels,
int ngroups,
bint is_datetimelike,
bint compute_max):
@@ -1264,7 +1268,7 @@ def group_cummin_max(groupby_t[:, ::1] out,
Array to store cummin/max in.
values : array
Values to take cummin/max of.
labels : int64 array
labels : np.ndarray[np.intp]
Labels to group by.
ngroups : int
Number of groups, larger than all entries of `labels`.
@@ -1282,7 +1286,7 @@ def group_cummin_max(groupby_t[:, ::1] out,
Py_ssize_t i, j, N, K, size
groupby_t val, mval
ndarray[groupby_t, ndim=2] accum
int64_t lab
intp_t lab

N, K = (<object>values).shape
accum = np.empty((ngroups, K), dtype=np.asarray(values).dtype)
@@ -1319,7 +1323,7 @@ def group_cummin_max(groupby_t[:, ::1] out,
@cython.wraparound(False)
def group_cummin(groupby_t[:, ::1] out,
ndarray[groupby_t, ndim=2] values,
const int64_t[:] labels,
const intp_t[:] labels,
int ngroups,
bint is_datetimelike):
"""See group_cummin_max.__doc__"""
@@ -1330,7 +1334,7 @@ def group_cummin(groupby_t[:, ::1] out,
@cython.wraparound(False)
def group_cummax(groupby_t[:, ::1] out,
ndarray[groupby_t, ndim=2] values,
const int64_t[:] labels,
const intp_t[:] labels,
int ngroups,
bint is_datetimelike):
"""See group_cummin_max.__doc__"""