From 15a1d68edf95030ca9b38701061de3a22357ed59 Mon Sep 17 00:00:00 2001
From: Brock
Date: Thu, 25 Mar 2021 08:41:46 -0700
Subject: [PATCH 1/3] TYP: libgroupby int64->intp

---
 pandas/_libs/groupby.pyx                 | 88 +++++++++++++-----------
 pandas/_libs/lib.pyx                     |  8 +--
 pandas/_libs/reduction.pyx               |  6 +-
 pandas/core/frame.py                     |  3 +-
 pandas/core/groupby/generic.py           |  2 -
 pandas/core/groupby/groupby.py           | 35 +++++-----
 pandas/core/groupby/ops.py               | 11 ++-
 pandas/core/sorting.py                   |  8 ++-
 pandas/tests/groupby/test_bin_groupby.py |  4 +-
 pandas/tests/groupby/test_grouping.py    |  2 +-
 pandas/tests/test_algos.py               |  2 +-
 11 files changed, 86 insertions(+), 83 deletions(-)

diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index 545d6a10232ab..7ddc087df9b11 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -105,7 +105,7 @@ cdef inline float64_t median_linear(float64_t* a, int n) nogil:
 def group_median_float64(ndarray[float64_t, ndim=2] out,
                          ndarray[int64_t] counts,
                          ndarray[float64_t, ndim=2] values,
-                         ndarray[int64_t] labels,
+                         ndarray[intp_t] labels,
                          Py_ssize_t min_count=-1):
     """
     Only aggregates on axis=0
@@ -122,7 +122,7 @@ def group_median_float64(ndarray[float64_t, ndim=2] out,
     ngroups = len(counts)
     N, K = (<object>values).shape

-    indexer, _counts = groupsort_indexer(ensure_platform_int(labels), ngroups)
+    indexer, _counts = groupsort_indexer(labels, ngroups)
     counts[:] = _counts[1:]

     data = np.empty((K, N), dtype=np.float64)
@@ -145,7 +145,7 @@ def group_median_float64(ndarray[float64_t, ndim=2] out,
 @cython.wraparound(False)
 def group_cumprod_float64(float64_t[:, ::1] out,
                           const float64_t[:, :] values,
-                          const int64_t[:] labels,
+                          const intp_t[:] labels,
                           int ngroups,
                           bint is_datetimelike,
                           bint skipna=True):
@@ -158,7 +158,7 @@ def group_cumprod_float64(float64_t[:, ::1] out,
         Array to store cumprod in.
     values : float64 array
         Values to take cumprod of.
-    labels : int64 array
+    labels : np.ndarray[np.intp]
         Labels to group by.
     ngroups : int
         Number of groups, larger than all entries of `labels`.
@@ -175,7 +175,7 @@ def group_cumprod_float64(float64_t[:, ::1] out,
         Py_ssize_t i, j, N, K, size
         float64_t val
         float64_t[:, ::1] accum
-        int64_t lab
+        intp_t lab

     N, K = (<object>values).shape
     accum = np.ones((ngroups, K), dtype=np.float64)
@@ -202,7 +202,7 @@ def group_cumprod_float64(float64_t[:, ::1] out,
 @cython.wraparound(False)
 def group_cumsum(numeric[:, ::1] out,
                  ndarray[numeric, ndim=2] values,
-                 const int64_t[:] labels,
+                 const intp_t[:] labels,
                  int ngroups,
                  is_datetimelike,
                  bint skipna=True):
@@ -215,7 +215,7 @@ def group_cumsum(numeric[:, ::1] out,
         Array to store cumsum in.
     values : array
         Values to take cumsum of.
-    labels : int64 array
+    labels : np.ndarray[np.intp]
         Labels to group by.
     ngroups : int
         Number of groups, larger than all entries of `labels`.
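For context on the hunks above: the `ensure_platform_int` call in the @@ -122 hunk can be dropped because `labels` now arrives as np.intp, the dtype NumPy itself uses for positional indexing. A minimal sketch of why intp is the natural dtype for group labels (illustrative only, not part of the patch; the array contents are made up):

    import numpy as np

    labels = np.array([1, 0, 1, 2, 0], dtype=np.intp)
    # argsort and take already speak intp, so sorting or reordering group
    # labels involves no platform-dependent casting (intp is 32-bit on
    # 32-bit builds, 64-bit on 64-bit builds):
    sorter = np.argsort(labels, kind="mergesort")
    assert sorter.dtype == np.intp
    assert labels.take(sorter).dtype == np.intp

On 64-bit platforms the dtype change is effectively a no-op; the benefit is correctness and cast-avoidance on 32-bit builds.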
@@ -232,7 +232,7 @@ def group_cumsum(numeric[:, ::1] out,
         Py_ssize_t i, j, N, K, size
         numeric val, y, t
         numeric[:, ::1] accum, compensation
-        int64_t lab
+        intp_t lab

     N, K = (<object>values).shape
     accum = np.zeros((ngroups, K), dtype=np.asarray(values).dtype)
@@ -269,12 +269,12 @@ def group_cumsum(numeric[:, ::1] out,

 @cython.boundscheck(False)
 @cython.wraparound(False)
-def group_shift_indexer(int64_t[::1] out, const int64_t[:] labels,
+def group_shift_indexer(int64_t[::1] out, const intp_t[:] labels,
                         int ngroups, int periods):
     cdef:
-        Py_ssize_t N, i, j, ii
+        Py_ssize_t N, i, j, ii, lab
         int offset = 0, sign
-        int64_t lab, idxer, idxer_slot
+        int64_t idxer, idxer_slot
         int64_t[::1] label_seen = np.zeros(ngroups, dtype=np.int64)
         int64_t[:, ::1] label_indexer
@@ -321,7 +321,7 @@ def group_shift_indexer(int64_t[::1] out, const int64_t[:] labels,

 @cython.wraparound(False)
 @cython.boundscheck(False)
-def group_fillna_indexer(ndarray[int64_t] out, ndarray[int64_t] labels,
+def group_fillna_indexer(ndarray[int64_t] out, ndarray[intp_t] labels,
                          ndarray[uint8_t] mask, object direction,
                          int64_t limit, bint dropna):
     """
@@ -331,8 +331,9 @@ def group_fillna_indexer(ndarray[int64_t] out, ndarray[int64_t] labels,
     ----------
     out : array of int64_t values which this method will write its results to
         Missing values will be written to with a value of -1
-    labels : array containing unique label for each group, with its ordering
-        matching up to the corresponding record in `values`
+    labels : np.ndarray[np.intp]
+        Array containing unique label for each group, with its ordering
+        matching up to the corresponding record in `values`.
     mask : array of int64_t values where a 1 indicates a missing value
     direction : {'ffill', 'bfill'}
         Direction for fill to be applied (forwards or backwards, respectively)
@@ -344,9 +345,10 @@ def group_fillna_indexer(ndarray[int64_t] out, ndarray[int64_t] labels,
     This method modifies the `out` parameter rather than returning an object
     """
     cdef:
-        Py_ssize_t i, N
-        int64_t[:] sorted_labels
-        int64_t idx, curr_fill_idx=-1, filled_vals=0
+        Py_ssize_t i, N, idx
+        intp_t[:] sorted_labels
+        intp_t curr_fill_idx=-1
+        int64_t filled_vals = 0

     N = len(out)
@@ -354,7 +356,7 @@ def group_fillna_indexer(ndarray[int64_t] out, ndarray[int64_t] labels,
     assert N == len(labels) == len(mask)

     sorted_labels = np.argsort(labels, kind='mergesort').astype(
-        np.int64, copy=False)
+        np.intp, copy=False)
     if direction == 'bfill':
         sorted_labels = sorted_labels[::-1]
@@ -385,7 +387,7 @@ def group_fillna_indexer(ndarray[int64_t] out, ndarray[int64_t] labels,
 @cython.wraparound(False)
 def group_any_all(uint8_t[::1] out,
                   const uint8_t[::1] values,
-                  const int64_t[:] labels,
+                  const intp_t[:] labels,
                   const uint8_t[::1] mask,
                   object val_test,
                   bint skipna):
@@ -395,7 +397,8 @@ def group_any_all(uint8_t[::1] out,
     Parameters
     ----------
     out : array of values which this method will write its results to
-    labels : array containing unique label for each group, with its
+    labels : np.ndarray[np.intp]
+        Array containing unique label for each group, with its
         ordering matching up to the corresponding record in `values`
     values : array containing the truth value of each element
     mask : array indicating whether a value is na or not
@@ -411,7 +414,7 @@ def group_any_all(uint8_t[::1] out,
     """
     cdef:
         Py_ssize_t i, N = len(labels)
-        int64_t lab
+        intp_t lab
         uint8_t flag_val

     if val_test == 'all':
@@ -455,7 +458,7 @@ ctypedef fused complexfloating_t:
 def _group_add(complexfloating_t[:, ::1] out,
                int64_t[::1] counts,
                ndarray[complexfloating_t, ndim=2] values,
-               const int64_t[:] labels,
+               const intp_t[:] labels,
                Py_ssize_t min_count=0):
     """
     Only aggregates on axis=0 using Kahan summation
@@ -514,7 +517,7 @@ group_add_complex128 = _group_add['double complex']
 def _group_prod(floating[:, ::1] out,
                 int64_t[::1] counts,
                 ndarray[floating, ndim=2] values,
-                const int64_t[:] labels,
+                const intp_t[:] labels,
                 Py_ssize_t min_count=0):
     """
     Only aggregates on axis=0
@@ -567,7 +570,7 @@ group_prod_float64 = _group_prod['double']
 def _group_var(floating[:, ::1] out,
                int64_t[::1] counts,
                ndarray[floating, ndim=2] values,
-               const int64_t[:] labels,
+               const intp_t[:] labels,
                Py_ssize_t min_count=-1,
                int64_t ddof=1):
     cdef:
@@ -625,7 +628,7 @@ group_var_float64 = _group_var['double']
 def _group_mean(floating[:, ::1] out,
                 int64_t[::1] counts,
                 ndarray[floating, ndim=2] values,
-                const int64_t[::1] labels,
+                const intp_t[::1] labels,
                 Py_ssize_t min_count=-1):
     cdef:
         Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
@@ -681,7 +684,7 @@ group_mean_float64 = _group_mean['double']
 def _group_ohlc(floating[:, ::1] out,
                 int64_t[::1] counts,
                 ndarray[floating, ndim=2] values,
-                const int64_t[:] labels,
+                const intp_t[:] labels,
                 Py_ssize_t min_count=-1):
     """
     Only aggregates on axis=0
@@ -732,7 +735,7 @@ group_ohlc_float64 = _group_ohlc['double']
 @cython.wraparound(False)
 def group_quantile(ndarray[float64_t] out,
                    ndarray[numeric, ndim=1] values,
-                   ndarray[int64_t] labels,
+                   ndarray[intp_t] labels,
                    ndarray[uint8_t] mask,
                    float64_t q,
                    object interpolation):
@@ -743,7 +746,7 @@ def group_quantile(ndarray[float64_t] out,
     ----------
     out : ndarray
         Array of aggregated values that will be written to.
-    labels : ndarray
+    labels : ndarray[np.intp]
         Array containing the unique group labels.
     values : ndarray
         Array containing the values to apply the function against.
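For readers unfamiliar with the kernel above: `_group_add`'s docstring says it aggregates "using Kahan summation". A rough pure-Python sketch of the compensated per-group accumulation the Cython kernel performs (a hypothetical helper for illustration; it omits the kernel's NaN and min_count handling):

    import numpy as np

    def kahan_group_add(values, labels, ngroups):
        # Each group carries a running sum plus a compensation term that
        # recaptures low-order bits lost to floating-point rounding.
        sums = np.zeros(ngroups)
        comp = np.zeros(ngroups)
        for val, lab in zip(values, labels):
            if lab < 0:  # -1 marks rows that belong to no group
                continue
            y = val - comp[lab]
            t = sums[lab] + y
            comp[lab] = (t - sums[lab]) - y
            sums[lab] = t
        return sums

The labels-dtype change does not touch this logic; only the declared dtype of `labels` moves from int64 to intp.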
@@ -758,7 +761,7 @@ def group_quantile(ndarray[float64_t] out,
     cdef:
         Py_ssize_t i, N=len(labels), ngroups, grp_sz, non_na_sz
         Py_ssize_t grp_start=0, idx=0
-        int64_t lab
+        intp_t lab
         uint8_t interp
         float64_t q_idx, frac, val, next_val
         ndarray[int64_t] counts, non_na_counts, sort_arr
@@ -875,7 +878,7 @@ cdef inline bint _treat_as_na(rank_t val, bint is_datetimelike) nogil:
 def group_last(rank_t[:, ::1] out,
                int64_t[::1] counts,
                ndarray[rank_t, ndim=2] values,
-               const int64_t[:] labels,
+               const intp_t[:] labels,
                Py_ssize_t min_count=-1):
     """
     Only aggregates on axis=0
@@ -967,7 +970,7 @@ def group_last(rank_t[:, ::1] out,
 def group_nth(rank_t[:, ::1] out,
               int64_t[::1] counts,
               ndarray[rank_t, ndim=2] values,
-              const int64_t[:] labels,
+              const intp_t[:] labels,
              int64_t min_count=-1, int64_t rank=1
              ):
     """
@@ -1059,7 +1062,7 @@ def group_nth(rank_t[:, ::1] out,
 @cython.wraparound(False)
 def group_rank(float64_t[:, ::1] out,
                ndarray[rank_t, ndim=2] values,
-               const int64_t[:] labels,
+               const intp_t[:] labels,
                int ngroups,
                bint is_datetimelike, object ties_method="average",
                bint ascending=True, bint pct=False, object na_option="keep"):
@@ -1070,7 +1073,8 @@ def group_rank(float64_t[:, ::1] out,
     ----------
     out : array of float64_t values which this method will write its results to
     values : array of rank_t values to be ranked
-    labels : array containing unique label for each group, with its ordering
+    labels : np.ndarray[np.intp]
+        Array containing unique label for each group, with its ordering
         matching up to the corresponding record in `values`
     ngroups : int
         This parameter is not used, is needed to match signatures of other
@@ -1131,7 +1135,7 @@ ctypedef fused groupby_t:
 cdef group_min_max(groupby_t[:, ::1] out,
                    int64_t[::1] counts,
                    ndarray[groupby_t, ndim=2] values,
-                   const int64_t[:] labels,
+                   const intp_t[:] labels,
                    Py_ssize_t min_count=-1,
                    bint compute_max=True):
     """
@@ -1145,7 +1149,7 @@ cdef group_min_max(groupby_t[:, ::1] out,
         Input as a zeroed array, populated by group sizes during algorithm
     values : array
         Values to find column-wise min/max of.
-    labels : int64 array
+    labels : np.ndarray[np.intp]
         Labels to group by.
     min_count : Py_ssize_t, default -1
         The minimum number of non-NA group elements, NA result if threshold
@@ -1230,7 +1234,7 @@ cdef group_min_max(groupby_t[:, ::1] out,
 def group_max(groupby_t[:, ::1] out,
               int64_t[::1] counts,
              ndarray[groupby_t, ndim=2] values,
-              const int64_t[:] labels,
+              const intp_t[:] labels,
              Py_ssize_t min_count=-1):
     """See group_min_max.__doc__"""
     group_min_max(out, counts, values, labels, min_count=min_count, compute_max=True)
@@ -1241,7 +1245,7 @@ def group_max(groupby_t[:, ::1] out,
 def group_min(groupby_t[:, ::1] out,
               int64_t[::1] counts,
              ndarray[groupby_t, ndim=2] values,
-              const int64_t[:] labels,
+              const intp_t[:] labels,
              Py_ssize_t min_count=-1):
     """See group_min_max.__doc__"""
     group_min_max(out, counts, values, labels, min_count=min_count, compute_max=False)
@@ -1251,7 +1255,7 @@ def group_min(groupby_t[:, ::1] out,
 @cython.wraparound(False)
 def group_cummin_max(groupby_t[:, ::1] out,
                      ndarray[groupby_t, ndim=2] values,
-                     const int64_t[:] labels,
+                     const intp_t[:] labels,
                      int ngroups,
                      bint is_datetimelike,
                      bint compute_max):
@@ -1264,7 +1268,7 @@ def group_cummin_max(groupby_t[:, ::1] out,
         Array to store cummin/max in.
     values : array
         Values to take cummin/max of.
-    labels : int64 array
+    labels : np.ndarray[np.intp]
         Labels to group by.
     ngroups : int
         Number of groups, larger than all entries of `labels`.
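Aside on `group_cummin_max`: it keeps a per-group accumulator holding the best value seen so far. Ignoring the NaN and datetimelike handling in the real kernel, the cummax case behaves roughly like this sketch (a hypothetical helper for illustration; the real kernel leaves unlabeled rows untouched rather than writing NaN):

    import numpy as np

    def naive_group_cummax(values, labels, ngroups):
        # accum[g] holds the running maximum for group g.
        accum = np.full(ngroups, -np.inf)
        out = np.empty(len(values))
        for i, (val, lab) in enumerate(zip(values, labels)):
            if lab < 0:
                out[i] = np.nan  # simplification: row belongs to no group
                continue
            accum[lab] = max(accum[lab], val)
            out[i] = accum[lab]
        return out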
@@ -1282,7 +1286,7 @@ def group_cummin_max(groupby_t[:, ::1] out,
         Py_ssize_t i, j, N, K, size
         groupby_t val, mval
         ndarray[groupby_t, ndim=2] accum
-        int64_t lab
+        intp_t lab

     N, K = (<object>values).shape
     accum = np.empty((ngroups, K), dtype=np.asarray(values).dtype)
@@ -1319,7 +1323,7 @@ def group_cummin_max(groupby_t[:, ::1] out,
 @cython.wraparound(False)
 def group_cummin(groupby_t[:, ::1] out,
                  ndarray[groupby_t, ndim=2] values,
-                 const int64_t[:] labels,
+                 const intp_t[:] labels,
                  int ngroups,
                  bint is_datetimelike):
     """See group_cummin_max.__doc__"""
@@ -1330,7 +1334,7 @@ def group_cummin(groupby_t[:, ::1] out,
 @cython.wraparound(False)
 def group_cummax(groupby_t[:, ::1] out,
                  ndarray[groupby_t, ndim=2] values,
-                 const int64_t[:] labels,
+                 const intp_t[:] labels,
                  int ngroups,
                  bint is_datetimelike):
     """See group_cummin_max.__doc__"""
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index 9ef3c859633c2..94a4d586b4f13 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -852,7 +852,7 @@ def get_level_sorter(
 @cython.boundscheck(False)
 @cython.wraparound(False)
 def count_level_2d(ndarray[uint8_t, ndim=2, cast=True] mask,
-                   const int64_t[:] labels,
+                   const intp_t[:] labels,
                    Py_ssize_t max_bin,
                    int axis):
     cdef:
@@ -881,10 +881,10 @@ def count_level_2d(ndarray[uint8_t, ndim=2, cast=True] mask,
     return counts


-def generate_slices(const int64_t[:] labels, Py_ssize_t ngroups):
+def generate_slices(const intp_t[:] labels, Py_ssize_t ngroups):
     cdef:
         Py_ssize_t i, group_size, n, start
-        int64_t lab
+        intp_t lab
         object slobj
         ndarray[int64_t] starts, ends
@@ -910,7 +910,7 @@ def generate_slices(const int64_t[:] labels, Py_ssize_t ngroups):
     return starts, ends


-def indices_fast(ndarray index, const int64_t[:] labels, list keys,
+def indices_fast(ndarray[intp_t] index, const int64_t[:] labels, list keys,
                  list sorted_labels) -> dict:
     """
     Parameters
diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx
index 5649d1378cda3..9acff1cac305c 100644
--- a/pandas/_libs/reduction.pyx
+++ b/pandas/_libs/reduction.pyx
@@ -10,6 +10,7 @@ import numpy as np
 cimport numpy as cnp
 from numpy cimport (
     int64_t,
+    intp_t,
     ndarray,
 )
@@ -200,7 +201,7 @@ cdef class SeriesGrouper(_BaseGrouper):
         ndarray arr, index, dummy_arr, dummy_index
         object f, labels, values, typ, ityp, name

-    def __init__(self, object series, object f, object labels,
+    def __init__(self, object series, object f, ndarray[intp_t] labels,
                  Py_ssize_t ngroups):

         if len(series) == 0:
@@ -228,7 +229,8 @@ cdef class SeriesGrouper(_BaseGrouper):
         cdef:
             # Define result to avoid UnboundLocalError
             ndarray arr, result = None
-            ndarray[int64_t] labels, counts
+            ndarray[intp_t] labels
+            ndarray[int64_t] counts
             Py_ssize_t i, n, group_size, lab, start, end
             object res
             bint initialized = 0
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 32439af6db238..eab6bdb3b2d66 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -104,7 +104,6 @@
     validate_numeric_casting,
 )
 from pandas.core.dtypes.common import (
-    ensure_int64,
     ensure_platform_int,
     infer_dtype_from_object,
     is_bool_dtype,
@@ -9493,7 +9492,7 @@ def _count_level(self, level: Level, axis: int = 0, numeric_only: bool = False):
         level_name = count_axis._names[level]
         level_index = count_axis.levels[level]._rename(name=level_name)

-        level_codes = ensure_int64(count_axis.codes[level])
+        level_codes = ensure_platform_int(count_axis.codes[level])
         counts = lib.count_level_2d(mask, level_codes, len(level_index), axis=axis)

         if axis == 1:
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index b407212fe6a50..142654bbffb18 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -56,7 +56,6 @@
 )
 from pandas.core.dtypes.common import (
     ensure_int64,
-    ensure_platform_int,
     is_bool,
     is_categorical_dtype,
     is_dict_like,
@@ -896,7 +895,6 @@ def count(self) -> Series:
         val = self.obj._values

         mask = (ids != -1) & ~isna(val)
-        ids = ensure_platform_int(ids)
         minlength = ngroups or 0
         out = np.bincount(ids[mask], minlength=minlength)
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index f33833193e4e0..aeebd585dc20e 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -1140,24 +1140,31 @@ def _cython_agg_general(
         )

     @final
-    def _transform_with_numba(self, data, func, *args, engine_kwargs=None, **kwargs):
-        """
-        Perform groupby transform routine with the numba engine.
-
-        This routine mimics the data splitting routine of the DataSplitter class
-        to generate the indices of each group in the sorted data and then passes the
-        data and indices into a Numba jitted function.
-        """
+    def _numba_prep(self, func, data):
         if not callable(func):
             raise NotImplementedError(
                 "Numba engine can only be used with a single function."
             )
-        group_keys = self.grouper._get_group_keys()
         labels, _, n_groups = self.grouper.group_info
         sorted_index = get_group_index_sorter(labels, n_groups)
         sorted_labels = labels.take(sorted_index)
+        sorted_data = data.take(sorted_index, axis=self.axis).to_numpy()
+        starts, ends = lib.generate_slices(sorted_labels, n_groups)
+        return starts, ends, sorted_index, sorted_data
+
+    @final
+    def _transform_with_numba(self, data, func, *args, engine_kwargs=None, **kwargs):
+        """
+        Perform groupby transform routine with the numba engine.
+
+        This routine mimics the data splitting routine of the DataSplitter class
+        to generate the indices of each group in the sorted data and then passes the
+        data and indices into a Numba jitted function.
+        """
+        starts, ends, sorted_index, sorted_data = self._numba_prep(func, data)
+        group_keys = self.grouper._get_group_keys()

         numba_transform_func = numba_.generate_numba_transform_func(
             tuple(args), kwargs, func, engine_kwargs
@@ -1183,16 +1190,8 @@ def _aggregate_with_numba(self, data, func, *args, engine_kwargs=None, **kwargs)
         to generate the indices of each group in the sorted data and then passes the
         data and indices into a Numba jitted function.
         """
-        if not callable(func):
-            raise NotImplementedError(
-                "Numba engine can only be used with a single function."
-            )
+        starts, ends, sorted_index, sorted_data = self._numba_prep(func, data)
         group_keys = self.grouper._get_group_keys()
-        labels, _, n_groups = self.grouper.group_info
-        sorted_index = get_group_index_sorter(labels, n_groups)
-        sorted_labels = labels.take(sorted_index)
-        sorted_data = data.take(sorted_index, axis=self.axis).to_numpy()
-        starts, ends = lib.generate_slices(sorted_labels, n_groups)

         numba_agg_func = numba_.generate_numba_agg_func(
             tuple(args), kwargs, func, engine_kwargs
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index c5d36d1588a5f..60f7c54cfeaaf 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -353,7 +353,6 @@ def size(self) -> Series:
         Compute group sizes.
""" ids, _, ngroup = self.group_info - ids = ensure_platform_int(ids) if ngroup: out = np.bincount(ids[ids != -1], minlength=ngroup) else: @@ -381,7 +380,7 @@ def group_info(self): comp_ids, obs_group_ids = self._get_compressed_codes() ngroups = len(obs_group_ids) - comp_ids = ensure_int64(comp_ids) + comp_ids = ensure_platform_int(comp_ids) return comp_ids, obs_group_ids, ngroups @final @@ -707,7 +706,7 @@ def _transform( self, result, values, comp_ids, transform_func, is_datetimelike: bool, **kwargs ): - comp_ids, _, ngroups = self.group_info + _, _, ngroups = self.group_info transform_func(result, values, comp_ids, ngroups, is_datetimelike, **kwargs) return result @@ -918,7 +917,7 @@ def group_info(self): comp_ids = np.repeat(np.r_[-1, np.arange(ngroups)], rep) return ( - comp_ids.astype("int64", copy=False), + ensure_platform_int(comp_ids), obs_group_ids.astype("int64", copy=False), ngroups, ) @@ -981,14 +980,14 @@ def _is_indexed_like(obj, axes, axis: int) -> bool: class DataSplitter(Generic[FrameOrSeries]): def __init__(self, data: FrameOrSeries, labels, ngroups: int, axis: int = 0): self.data = data - self.labels = ensure_int64(labels) + self.labels = ensure_platform_int(labels) # _should_ already be np.intp self.ngroups = ngroups self.axis = axis assert isinstance(axis, int), axis @cache_readonly - def slabels(self) -> np.ndarray: + def slabels(self) -> np.ndarray: # np.ndarray[np.intp] # Sorted labels return self.labels.take(self._sort_idx) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 02c41538ca123..856bd7d159c71 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -237,6 +237,7 @@ def decons_obs_group_ids(comp_ids, obs_ids, shape, labels, xnull: bool): Parameters ---------- + comp_ids : np.ndarray[np.intp] xnull : bool If nulls are excluded; i.e. -1 labels are passed through. 
""" @@ -249,7 +250,8 @@ def decons_obs_group_ids(comp_ids, obs_ids, shape, labels, xnull: bool): out = decons_group_index(obs_ids, shape) return out if xnull or not lift.any() else [x - y for x, y in zip(out, lift)] - i = unique_label_indices(comp_ids) + # TODO: unique_label_indices only used here, should take ndarray[np.intp] + i = unique_label_indices(ensure_int64(comp_ids)) i8copy = lambda a: a.astype("i8", subok=False, copy=True) return [i8copy(lab[i]) for lab in labels] @@ -517,7 +519,7 @@ def ensure_key_mapped(values, key: Optional[Callable], levels=None): def get_flattened_list( - comp_ids: np.ndarray, + comp_ids: np.ndarray, # np.ndarray[np.intp] ngroups: int, levels: Iterable[Index], labels: Iterable[np.ndarray], @@ -584,7 +586,7 @@ def get_group_index_sorter( Parameters ---------- - group_index : np.ndarray + group_index : np.ndarray[np.intp] signed integer dtype ngroups : int or None, default None diff --git a/pandas/tests/groupby/test_bin_groupby.py b/pandas/tests/groupby/test_bin_groupby.py index bb541739c7f44..5fcf4a73479a5 100644 --- a/pandas/tests/groupby/test_bin_groupby.py +++ b/pandas/tests/groupby/test_bin_groupby.py @@ -15,7 +15,7 @@ def test_series_grouper(): obj = Series(np.random.randn(10)) - labels = np.array([-1, -1, -1, 0, 0, 0, 1, 1, 1, 1], dtype=np.int64) + labels = np.array([-1, -1, -1, 0, 0, 0, 1, 1, 1, 1], dtype=np.intp) grouper = libreduction.SeriesGrouper(obj, np.mean, labels, 2) result, counts = grouper.get_result() @@ -31,7 +31,7 @@ def test_series_grouper_requires_nonempty_raises(): # GH#29500 obj = Series(np.random.randn(10)) dummy = obj.iloc[:0] - labels = np.array([-1, -1, -1, 0, 0, 0, 1, 1, 1, 1], dtype=np.int64) + labels = np.array([-1, -1, -1, 0, 0, 0, 1, 1, 1, 1], dtype=np.intp) with pytest.raises(ValueError, match="SeriesGrouper requires non-empty `series`"): libreduction.SeriesGrouper(dummy, np.mean, labels, 2) diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 2d10dd8d18dc1..3d02e784d83b0 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -663,7 +663,7 @@ def test_groupby_empty(self): # check group properties assert len(gr.grouper.groupings) == 1 tm.assert_numpy_array_equal( - gr.grouper.group_info[0], np.array([], dtype=np.dtype("int64")) + gr.grouper.group_info[0], np.array([], dtype=np.dtype(np.intp)) ) tm.assert_numpy_array_equal( diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index cd800b3f3a452..deee006c4a416 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1712,7 +1712,7 @@ def test_quantile(): def test_unique_label_indices(): - a = np.random.randint(1, 1 << 10, 1 << 15).astype("i8") + a = np.random.randint(1, 1 << 10, 1 << 15).astype("int64") left = ht.unique_label_indices(a) right = np.unique(a, return_index=True)[1] From 97495798c16b11de9427b30b1e9950eb71ec585d Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 25 Mar 2021 10:48:52 -0700 Subject: [PATCH 2/3] update rank_1d int64->intp --- pandas/_libs/algos.pyx | 9 +++++---- pandas/core/algorithms.py | 2 +- pandas/tests/groupby/test_libgroupby.py | 20 ++++++++++---------- pandas/tests/test_algos.py | 2 +- 4 files changed, 17 insertions(+), 16 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 122a014604bf0..b5b90b4987a66 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -490,7 +490,7 @@ def nancorr_kendall(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarra int64_t total_discordant = 0 
        float64_t kendall_tau
        int64_t n_obs
-        const int64_t[:] labels_n
+        const intp_t[:] labels_n

    N, K = (<object>mat).shape
@@ -499,7 +499,7 @@ def nancorr_kendall(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarra
    ranked_mat = np.empty((N, K), dtype=np.float64)

    # For compatibility when calling rank_1d
-    labels_n = np.zeros(N, dtype=np.int64)
+    labels_n = np.zeros(N, dtype=np.intp)

    for i in range(K):
        ranked_mat[:, i] = rank_1d(mat[:, i], labels_n)
@@ -959,7 +959,7 @@ ctypedef fused rank_t:
 @cython.boundscheck(False)
 def rank_1d(
    ndarray[rank_t, ndim=1] values,
-    const int64_t[:] labels,
+    const intp_t[:] labels,
    ties_method="average",
    bint ascending=True,
    bint pct=False,
@@ -971,7 +971,8 @@ def rank_1d(
    Parameters
    ----------
    values : array of rank_t values to be ranked
-    labels : array containing unique label for each group, with its ordering
+    labels : np.ndarray[np.intp]
+        Array containing unique label for each group, with its ordering
        matching up to the corresponding record in `values`. If not called
        from a groupby operation, will be an array of 0's
    ties_method : {'average', 'min', 'max', 'first', 'dense'}, default
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 0ad1b4da03c70..77b5a0148905e 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -1035,7 +1035,7 @@ def rank(
        values = _get_values_for_rank(values)
        ranks = algos.rank_1d(
            values,
-            labels=np.zeros(len(values), dtype=np.int64),
+            labels=np.zeros(len(values), dtype=np.intp),
            ties_method=method,
            ascending=ascending,
            na_option=na_option,
diff --git a/pandas/tests/groupby/test_libgroupby.py b/pandas/tests/groupby/test_libgroupby.py
index 30b0edf8a139e..febc12edf0b32 100644
--- a/pandas/tests/groupby/test_libgroupby.py
+++ b/pandas/tests/groupby/test_libgroupby.py
@@ -8,7 +8,7 @@
    group_var_float64,
 )

-from pandas.core.dtypes.common import ensure_int64
+from pandas.core.dtypes.common import ensure_platform_int
 from pandas import isna
 import pandas._testing as tm
@@ -21,7 +21,7 @@ def test_group_var_generic_1d(self):
        out = (np.nan * np.ones((5, 1))).astype(self.dtype)
        counts = np.zeros(5, dtype="int64")
        values = 10 * prng.rand(15, 1).astype(self.dtype)
-        labels = np.tile(np.arange(5), (3,)).astype("int64")
+        labels = np.tile(np.arange(5), (3,)).astype("intp")

        expected_out = (
            np.squeeze(values).reshape((5, 3), order="F").std(axis=1, ddof=1) ** 2
@@ -38,7 +38,7 @@ def test_group_var_generic_1d_flat_labels(self):
        out = (np.nan * np.ones((1, 1))).astype(self.dtype)
        counts = np.zeros(1, dtype="int64")
        values = 10 * prng.rand(5, 1).astype(self.dtype)
-        labels = np.zeros(5, dtype="int64")
+        labels = np.zeros(5, dtype="intp")

        expected_out = np.array([[values.std(ddof=1) ** 2]])
        expected_counts = counts + 5
@@ -54,7 +54,7 @@ def test_group_var_generic_2d_all_finite(self):
        out = (np.nan * np.ones((5, 2))).astype(self.dtype)
        counts = np.zeros(5, dtype="int64")
        values = 10 * prng.rand(10, 2).astype(self.dtype)
-        labels = np.tile(np.arange(5), (2,)).astype("int64")
+        labels = np.tile(np.arange(5), (2,)).astype("intp")

        expected_out = np.std(values.reshape(2, 5, 2), ddof=1, axis=0) ** 2
        expected_counts = counts + 2
@@ -70,7 +70,7 @@ def test_group_var_generic_2d_some_nan(self):
        counts = np.zeros(5, dtype="int64")
        values = 10 * prng.rand(10, 2).astype(self.dtype)
        values[:, 1] = np.nan
-        labels = np.tile(np.arange(5), (2,)).astype("int64")
+        labels = np.tile(np.arange(5), (2,)).astype("intp")

        expected_out = np.vstack(
            [
@@ -90,7 +90,7 @@ def test_group_var_constant(self):
        out = np.array([[np.nan]], dtype=self.dtype)
        counts = np.array([0], dtype="int64")
        values = 0.832845131556193 * np.ones((3, 1), dtype=self.dtype)
-        labels = np.zeros(3, dtype="int64")
+        labels = np.zeros(3, dtype="intp")

        self.algo(out, counts, values, labels)
@@ -113,7 +113,7 @@ def test_group_var_large_inputs(self):
        counts = np.array([0], dtype="int64")
        values = (prng.rand(10 ** 6) + 10 ** 12).astype(self.dtype)
        values.shape = (10 ** 6, 1)
-        labels = np.zeros(10 ** 6, dtype="int64")
+        labels = np.zeros(10 ** 6, dtype="intp")

        self.algo(out, counts, values, labels)
@@ -136,7 +136,7 @@ def _check(dtype):
    bins = np.array([6, 12, 20])
    out = np.zeros((3, 4), dtype)
    counts = np.zeros(len(out), dtype=np.int64)
-    labels = ensure_int64(np.repeat(np.arange(3), np.diff(np.r_[0, bins])))
+    labels = ensure_platform_int(np.repeat(np.arange(3), np.diff(np.r_[0, bins])))

    func = getattr(libgroupby, f"group_ohlc_{dtype}")
    func(out, counts, obj[:, None], labels)
@@ -178,7 +178,7 @@ def _check_cython_group_transform_cumulative(pd_op, np_op, dtype):
    data = np.array([[1], [2], [3], [4]], dtype=dtype)
    answer = np.zeros_like(data)

-    labels = np.array([0, 0, 0, 0], dtype=np.int64)
+    labels = np.array([0, 0, 0, 0], dtype=np.intp)
    ngroups = 1
    pd_op(answer, data, labels, ngroups, is_datetimelike)
@@ -204,7 +204,7 @@ def test_cython_group_transform_algos():
    is_datetimelike = False

    # with nans
-    labels = np.array([0, 0, 0, 0, 0], dtype=np.int64)
+    labels = np.array([0, 0, 0, 0, 0], dtype=np.intp)
    ngroups = 1

    data = np.array([[1], [2], [3], [np.nan], [4]], dtype="float64")
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
index deee006c4a416..b9d7e59ea9716 100644
--- a/pandas/tests/test_algos.py
+++ b/pandas/tests/test_algos.py
@@ -1733,7 +1733,7 @@ def test_scipy_compat(self):
    def _check(arr):
        mask = ~np.isfinite(arr)
        arr = arr.copy()
-        result = libalgos.rank_1d(arr, labels=np.zeros(len(arr), dtype=np.int64))
+        result = libalgos.rank_1d(arr, labels=np.zeros(len(arr), dtype=np.intp))
        arr[mask] = np.inf
        exp = rankdata(arr)
        exp[mask] = np.nan

From 4305c855d0134e86bbbc39d654cff4e6cd04d909 Mon Sep 17 00:00:00 2001
From: Brock
Date: Thu, 25 Mar 2021 12:04:49 -0700
Subject: [PATCH 3/3] ngroup return int64

---
 pandas/core/groupby/groupby.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index aeebd585dc20e..76170d95b83df 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -2424,7 +2424,9 @@ def ngroup(self, ascending: bool = True):
        """
        with group_selection_context(self):
            index = self._selected_obj.index
-            result = self._obj_1d_constructor(self.grouper.group_info[0], index)
+            result = self._obj_1d_constructor(
+                self.grouper.group_info[0], index, dtype=np.int64
+            )
            if not ascending:
                result = self.ngroups - 1 - result
            return result
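For context on the final hunk: `group_info[0]` is now platform intp, which is int32 on 32-bit builds, so `ngroup` pins the constructor to int64 to keep its user-facing dtype stable across platforms. A quick check of the intended behavior (a toy frame, for illustration):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"key": ["a", "b", "a", "b"]})
    out = df.groupby("key").ngroup()
    # The internal codes may be 32-bit on some platforms,
    # but the result keeps the documented int64 dtype:
    assert out.dtype == np.int64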