From e32b789c06ab6ecb175b332fa1d1da4cdb340155 Mon Sep 17 00:00:00 2001 From: richard Date: Wed, 16 Aug 2023 14:19:33 -0400 Subject: [PATCH 01/31] REF: Compute correct result_index upfront in groupby --- pandas/core/groupby/generic.py | 24 +- pandas/core/groupby/groupby.py | 160 ++--------- pandas/core/groupby/grouper.py | 86 ++---- pandas/core/groupby/ops.py | 275 +++++++++++-------- pandas/core/indexes/base.py | 2 +- pandas/core/reshape/pivot.py | 2 + pandas/tests/groupby/aggregate/test_other.py | 2 +- pandas/tests/groupby/methods/test_size.py | 6 +- pandas/tests/groupby/test_categorical.py | 29 +- pandas/tests/groupby/test_groupby.py | 4 +- pandas/tests/groupby/test_groupby_dropna.py | 2 +- pandas/tests/groupby/test_grouping.py | 8 +- pandas/tests/groupby/test_raises.py | 11 +- pandas/tests/groupby/test_timegrouper.py | 2 +- pandas/tests/reshape/test_pivot.py | 4 +- 15 files changed, 257 insertions(+), 360 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 1bdba5a3e71fb..e92931d52093d 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -416,7 +416,6 @@ def _wrap_applied_output( # GH #823 #24880 index = self.grouper.result_index res_df = self.obj._constructor_expanddim(values, index=index) - res_df = self._reindex_output(res_df) # if self.observed is False, # keep all-NaN rows created while re-indexing res_ser = res_df.stack(future_stack=True) @@ -442,7 +441,7 @@ def _wrap_applied_output( if not self.as_index: result = self._insert_inaxis_grouper(result) result.index = default_index(len(result)) - return self._reindex_output(result) + return result def _aggregate_named(self, func, *args, **kwargs): # Note: this is very similar to _aggregate_series_pure_python, @@ -672,7 +671,7 @@ def nunique(self, dropna: bool = True) -> Series | DataFrame: 2023-02-01 1 Freq: MS, dtype: int64 """ - ids, _, _ = self.grouper.group_info + ids, _ = self.grouper.group_info val = self.obj._values @@ -721,7 +720,7 @@ def nunique(self, dropna: bool = True) -> Series | DataFrame: if not self.as_index: result = self._insert_inaxis_grouper(result) result.index = default_index(len(result)) - return self._reindex_output(result, fill_value=0) + return result @doc(Series.describe) def describe(self, percentiles=None, include=None, exclude=None) -> Series: @@ -749,7 +748,7 @@ def value_counts( from pandas.core.reshape.merge import get_join_indexers from pandas.core.reshape.tile import cut - ids, _, _ = self.grouper.group_info + ids, _ = self.grouper.group_info val = self.obj._values index_names = self.grouper.names + [self.obj.name] @@ -819,9 +818,18 @@ def value_counts( rep = partial(np.repeat, repeats=np.add.reduceat(inc, idx)) # multi-index components - codes = self.grouper.reconstructed_codes + if isinstance(self.grouper.result_index, MultiIndex): + codes = list(self.grouper.result_index.codes) + else: + codes = [ + algorithms.factorize( + self.grouper.result_index, + sort=self.grouper._sort, + use_na_sentinel=self.grouper.dropna, + )[0] + ] codes = [rep(level_codes) for level_codes in codes] + [llab(lab, inc)] - levels = [ping.group_index for ping in self.grouper.groupings] + [lev] + levels = self.grouper.levels + [lev] if dropna: mask = codes[-1] != -1 @@ -1686,7 +1694,7 @@ def _wrap_applied_output_series( if not self.as_index: result = self._insert_inaxis_grouper(result) - return self._reindex_output(result) + return result def _cython_transform( self, diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 
b9b69d4ef0c87..8c2703eceec69 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -54,7 +54,6 @@ class providing the base-class of operations. NDFrameT, PositionalIndexer, RandomState, - Scalar, T, npt, ) @@ -788,7 +787,7 @@ def __repr__(self) -> str: @final @property - def groups(self) -> dict[Hashable, np.ndarray]: + def groups(self) -> dict[Hashable, Index]: """ Dict {group name -> group labels}. @@ -1505,7 +1504,7 @@ def _set_result_index_ordered( return result # row order is scrambled => sort the rows by position in original index - original_positions = Index(self.grouper.result_ilocs()) + original_positions = Index(self.grouper.result_ilocs) result = result.set_axis(original_positions, axis=self.axis, copy=False) result = result.sort_index(axis=self.axis) if self.grouper.has_dropped_na: @@ -1599,7 +1598,7 @@ def _wrap_aggregated_output( # error: Argument 1 to "_maybe_transpose_result" of "GroupBy" has # incompatible type "Union[Series, DataFrame]"; expected "NDFrameT" res = self._maybe_transpose_result(result) # type: ignore[arg-type] - return self._reindex_output(res, qs=qs) + return res def _wrap_applied_output( self, @@ -1615,8 +1614,8 @@ def _wrap_applied_output( @final def _numba_prep(self, data: DataFrame): - ids, _, ngroups = self.grouper.group_info - sorted_index = self.grouper._sort_idx + ids, ngroups = self.grouper.group_info + sorted_index = self.grouper.result_ilocs sorted_ids = self.grouper._sorted_ids sorted_data = data.take(sorted_index, axis=self.axis).to_numpy() @@ -1669,7 +1668,7 @@ def _numba_agg_general( ) # Pass group ids to kernel directly if it can handle it # (This is faster since it doesn't require a sort) - ids, _, _ = self.grouper.group_info + ids, _ = self.grouper.group_info ngroups = self.grouper.ngroups res_mgr = df._mgr.apply( @@ -2043,7 +2042,7 @@ def _wrap_transform_fast_result(self, result: NDFrameT) -> NDFrameT: obj = self._obj_with_exclusions # for each col, reshape to size of original frame by take operation - ids, _, _ = self.grouper.group_info + ids = self.grouper.result_index_and_codes[1] result = result.reindex(self.grouper.result_index, axis=self.axis, copy=False) if self.obj.ndim == 1: @@ -2096,7 +2095,7 @@ def _cumcount_array(self, ascending: bool = True) -> np.ndarray: this is currently implementing sort=False (though the default is sort=True) for groupby in general """ - ids, _, ngroups = self.grouper.group_info + ids, ngroups = self.grouper.group_info sorter = get_group_index_sorter(ids, ngroups) ids, count = ids[sorter], len(ids) @@ -2305,7 +2304,7 @@ def count(self) -> NDFrameT: Freq: MS, dtype: int64 """ data = self._get_data_to_aggregate() - ids, _, ngroups = self.grouper.group_info + ids, ngroups = self.grouper.group_info mask = ids != -1 is_series = data.ndim == 1 @@ -2335,15 +2334,9 @@ def hfunc(bvalues: ArrayLike) -> ArrayLike: new_mgr = data.grouped_reduce(hfunc) new_obj = self._wrap_agged_manager(new_mgr) + result = self._wrap_aggregated_output(new_obj) - # If we are grouping on categoricals we want unobserved categories to - # return zero, rather than the default of NaN which the reindexing in - # _wrap_aggregated_output() returns. GH 35028 - # e.g. 
test_dataframe_groupby_on_2_categoricals_when_observed_is_false - with com.temp_setattr(self, "observed", True): - result = self._wrap_aggregated_output(new_obj) - - return self._reindex_output(result, fill_value=0) + return result @final @Substitution(name="groupby") @@ -2820,7 +2813,7 @@ def _value_counts( and not grouping._observed for grouping in groupings ): - levels_list = [ping.result_index for ping in groupings] + levels_list = gb.grouper.levels multi_index, _ = MultiIndex.from_product( levels_list, names=[ping.name for ping in groupings] ).sortlevel() @@ -3043,10 +3036,6 @@ def size(self) -> DataFrame | Series: dtype_backend=dtype_backend, ) - with com.temp_setattr(self, "as_index", True): - # size already has the desired behavior in GH#49519, but this makes the - # as_index=False path of _reindex_output fail on categorical groupers. - result = self._reindex_output(result, fill_value=0) if not self.as_index: # error: Incompatible types in assignment (expression has # type "DataFrame", variable has type "Series") @@ -3124,7 +3113,7 @@ def sum( npfunc=np.sum, ) - return self._reindex_output(result, fill_value=0) + return result @final @doc( @@ -3522,7 +3511,7 @@ def ohlc(self) -> DataFrame: result = self.obj._constructor_expanddim( res_values, index=self.grouper.result_index, columns=agg_names ) - return self._reindex_output(result) + return result result = self._apply_to_column_groupbys(lambda sgb: sgb.ohlc()) return result @@ -3907,7 +3896,7 @@ def _fill(self, direction: Literal["ffill", "bfill"], limit: int | None = None): if limit is None: limit = -1 - ids, _, _ = self.grouper.group_info + ids, _ = self.grouper.group_info sorted_labels = np.argsort(ids, kind="mergesort").astype(np.intp, copy=False) if direction == "bfill": sorted_labels = sorted_labels[::-1] @@ -4238,7 +4227,7 @@ def _nth( if not dropna: mask = self._make_mask_from_positional_indexer(n) - ids, _, _ = self.grouper.group_info + ids, _ = self.grouper.group_info # Drop NA values in grouping mask = mask & (ids != -1) @@ -4449,12 +4438,13 @@ def post_processor( qs = np.array([q], dtype=np.float64) pass_qs = None - ids, _, ngroups = self.grouper.group_info + ids, ngroups = self.grouper.group_info + ids = ids[ids >= 0] nqs = len(qs) func = partial( libgroupby.group_quantile, - labels=ids, + labels=ids[ids >= 0], qs=qs, interpolation=interpolation, starts=starts, @@ -5169,7 +5159,7 @@ def shift( else: if fill_value is lib.no_default: fill_value = None - ids, _, ngroups = self.grouper.group_info + ids, ngroups = self.grouper.group_info res_indexer = np.zeros(len(ids), dtype=np.int64) libgroupby.group_shift_indexer(res_indexer, ids, ngroups, period) @@ -5516,104 +5506,6 @@ def _mask_selected_obj(self, mask: npt.NDArray[np.bool_]) -> NDFrameT: else: return self._selected_obj.iloc[:, mask] - @final - def _reindex_output( - self, - output: OutputFrameOrSeries, - fill_value: Scalar = np.nan, - qs: npt.NDArray[np.float64] | None = None, - ) -> OutputFrameOrSeries: - """ - If we have categorical groupers, then we might want to make sure that - we have a fully re-indexed output to the levels. This means expanding - the output space to accommodate all values in the cartesian product of - our groups, regardless of whether they were observed in the data or - not. This will expand the output space if there are missing groups. - - The method returns early without modifying the input if the number of - groupings is less than 2, self.observed == True or none of the groupers - are categorical. 
- - Parameters - ---------- - output : Series or DataFrame - Object resulting from grouping and applying an operation. - fill_value : scalar, default np.nan - Value to use for unobserved categories if self.observed is False. - qs : np.ndarray[float64] or None, default None - quantile values, only relevant for quantile. - - Returns - ------- - Series or DataFrame - Object (potentially) re-indexed to include all possible groups. - """ - groupings = self.grouper.groupings - if len(groupings) == 1: - return output - - # if we only care about the observed values - # we are done - elif self.observed: - return output - - # reindexing only applies to a Categorical grouper - elif not any( - isinstance(ping.grouping_vector, (Categorical, CategoricalIndex)) - for ping in groupings - ): - return output - - levels_list = [ping.group_index for ping in groupings] - names = self.grouper.names - if qs is not None: - # error: Argument 1 to "append" of "list" has incompatible type - # "ndarray[Any, dtype[floating[_64Bit]]]"; expected "Index" - levels_list.append(qs) # type: ignore[arg-type] - names = names + [None] - index = MultiIndex.from_product(levels_list, names=names) - if self.sort: - index = index.sort_values() - - if self.as_index: - # Always holds for SeriesGroupBy unless GH#36507 is implemented - d = { - self.obj._get_axis_name(self.axis): index, - "copy": False, - "fill_value": fill_value, - } - return output.reindex(**d) # type: ignore[arg-type] - - # GH 13204 - # Here, the categorical in-axis groupers, which need to be fully - # expanded, are columns in `output`. An idea is to do: - # output = output.set_index(self.grouper.names) - # .reindex(index).reset_index() - # but special care has to be taken because of possible not-in-axis - # groupers. - # So, we manually select and drop the in-axis grouper columns, - # reindex `output`, and then reset the in-axis grouper columns. - - # Select in-axis groupers - in_axis_grps = [ - (i, ping.name) for (i, ping) in enumerate(groupings) if ping.in_axis - ] - if len(in_axis_grps) > 0: - g_nums, g_names = zip(*in_axis_grps) - output = output.drop(labels=list(g_names), axis=1) - - # Set a temp index and reindex (possibly expanding) - output = output.set_index(self.grouper.result_index).reindex( - index, copy=False, fill_value=fill_value - ) - - # Reset in-axis grouper columns - # (using level numbers `g_nums` because level names may not be unique) - if len(in_axis_grps) > 0: - output = output.reset_index(level=g_nums) - - return output.reset_index(drop=True) - @final def sample( self, @@ -5785,14 +5677,10 @@ def _idxmax_idxmin( if not self.observed and any( ping._passed_categorical for ping in self.grouper.groupings ): - expected_len = np.prod( - [len(ping.group_index) for ping in self.grouper.groupings] - ) - if len(self.grouper.groupings) == 1: - result_len = len(self.grouper.groupings[0].grouping_vector.unique()) - else: - # result_index only contains observed groups in this case - result_len = len(self.grouper.result_index) + expected_len = len(self.grouper.result_index) + # TODO: Better way to find # of observed groups? 
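        # (Illustrative aside, not part of the change: assuming the ids
        # from grouper.group_info label observed groups 0..n-1 and use
        # -1 for dropped rows, the observed-group count could also be
        # read off the distinct ids, e.g.
        #   >>> import numpy as np
        #   >>> ids = np.array([0, 2, 2, -1, 0])
        #   >>> len(np.unique(ids[ids >= 0]))
        #   2
        # which agrees with counting the nonzero entries of size().)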
+ group_sizes = self.grouper.size() + result_len = group_sizes[group_sizes > 0].shape[0] assert result_len <= expected_len has_unobserved = result_len < expected_len diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 06e6755079a22..ff249e24d0c49 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -36,7 +36,6 @@ from pandas.core.groupby import ops from pandas.core.groupby.categorical import recode_for_groupby from pandas.core.indexes.api import ( - CategoricalIndex, Index, MultiIndex, ) @@ -676,7 +675,7 @@ def _ilevel(self) -> int | None: @property def ngroups(self) -> int: - return len(self.group_index) + return len(self._codes_and_uniques[1]) @cache_readonly def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]: @@ -691,56 +690,6 @@ def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]: def codes(self) -> npt.NDArray[np.signedinteger]: return self._codes_and_uniques[0] - @cache_readonly - def group_arraylike(self) -> ArrayLike: - """ - Analogous to result_index, but holding an ArrayLike to ensure - we can retain ExtensionDtypes. - """ - if self._all_grouper is not None: - # retain dtype for categories, including unobserved ones - return self.result_index._values - - elif self._passed_categorical: - return self.group_index._values - - return self._codes_and_uniques[1] - - @cache_readonly - def result_index(self) -> Index: - # result_index retains dtype for categories, including unobserved ones, - # which group_index does not - if self._all_grouper is not None: - group_idx = self.group_index - assert isinstance(group_idx, CategoricalIndex) - cats = self._orig_cats - # set_categories is dynamically added - return group_idx.set_categories(cats) # type: ignore[attr-defined] - return self.group_index - - @cache_readonly - def group_index(self) -> Index: - codes, uniques = self._codes_and_uniques - if not self._dropna and self._passed_categorical: - assert isinstance(uniques, Categorical) - if self._sort and (codes == len(uniques)).any(): - # Add NA value on the end when sorting - uniques = Categorical.from_codes( - np.append(uniques.codes, [-1]), uniques.categories, validate=False - ) - elif len(codes) > 0: - # Need to determine proper placement of NA value when not sorting - cat = self.grouping_vector - na_idx = (cat.codes < 0).argmax() - if cat.codes[na_idx] < 0: - # count number of unique codes that comes before the nan value - na_unique_idx = algorithms.nunique_ints(cat.codes[:na_idx]) - new_codes = np.insert(uniques.codes, na_unique_idx, -1) - uniques = Categorical.from_codes( - new_codes, uniques.categories, validate=False - ) - return Index._with_infer(uniques, name=self.name) - @cache_readonly def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]: uniques: ArrayLike @@ -759,27 +708,34 @@ def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]: else: ucodes = np.arange(len(categories)) - uniques = Categorical.from_codes( - codes=ucodes, categories=categories, ordered=cat.ordered, validate=False - ) - - codes = cat.codes if not self._dropna: - na_mask = codes < 0 + na_mask = cat.codes < 0 if np.any(na_mask): if self._sort: # Replace NA codes with `largest code + 1` na_code = len(categories) - codes = np.where(na_mask, na_code, codes) else: # Insert NA code into the codes based on first appearance # A negative code must exist, no need to check codes[na_idx] < 0 na_idx = na_mask.argmax() # count number of unique codes that comes before the nan value - na_code = 
algorithms.nunique_ints(codes[:na_idx]) + na_code = algorithms.nunique_ints(cat.codes[:na_idx]) + ucodes = np.insert(ucodes, na_code, -1) + + uniques = Categorical.from_codes( + codes=ucodes, categories=categories, ordered=cat.ordered, validate=False + ) + codes = cat.codes + if not self._dropna: + na_mask = codes < 0 + if np.any(na_mask): + if self._sort: + codes = np.where(na_mask, na_code, codes) + else: codes = np.where(codes >= na_code, codes + 1, codes) codes = np.where(na_mask, na_code, codes) + # TODO: Can this be removed? if not self._observed: uniques = uniques.reorder_categories(self._orig_cats) @@ -805,8 +761,10 @@ def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]: return codes, uniques @cache_readonly - def groups(self) -> dict[Hashable, np.ndarray]: - cats = Categorical.from_codes(self.codes, self.group_index, validate=False) + def groups(self) -> dict[Hashable, Index]: + codes, uniques = self._codes_and_uniques + uniques = Index._with_infer(uniques, name=self.name) + cats = Categorical.from_codes(codes, uniques, validate=False) return self._index.groupby(cats) @@ -1040,7 +998,9 @@ def is_in_obj(gpr) -> bool: groupings.append(Grouping(Index([], dtype="int"), np.array([], dtype=np.intp))) # create the internals grouper - grouper = ops.BaseGrouper(group_axis, groupings, sort=sort, dropna=dropna) + grouper = ops.BaseGrouper( + group_axis, groupings, sort=sort, dropna=dropna, observed=observed + ) return grouper, frozenset(exclusions), obj diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 466bbac641077..1f180a76f7f79 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -24,6 +24,7 @@ ) import pandas._libs.groupby as libgroupby from pandas._typing import ( + AnyArrayLike, ArrayLike, AxisInt, NDFrameT, @@ -50,6 +51,7 @@ maybe_fill, ) +from pandas.core.arrays import Categorical from pandas.core.frame import DataFrame from pandas.core.groupby import grouper from pandas.core.indexes.api import ( @@ -60,10 +62,6 @@ ) from pandas.core.series import Series from pandas.core.sorting import ( - compress_group_index, - decons_obs_group_ids, - get_flattened_list, - get_group_index, get_group_index_sorter, get_indexer_dict, ) @@ -581,6 +579,7 @@ def __init__( groupings: Sequence[grouper.Grouping], sort: bool = True, dropna: bool = True, + observed: bool = True, ) -> None: assert isinstance(axis, Index), axis @@ -588,6 +587,7 @@ def __init__( self._groupings: list[grouper.Grouping] = list(groupings) self._sort = sort self.dropna = dropna + self.observed = observed @property def groupings(self) -> list[grouper.Grouping]: @@ -616,7 +616,8 @@ def get_iterator( for each group """ splitter = self._get_splitter(data, axis=axis) - keys = self.group_keys_seq + # TODO: Skip unobserved for transform? 
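        # (A sketch of the observable effect, assuming a single observed=False
        # categorical grouper: keys now come from result_index, which keeps
        # unobserved categories, so iteration yields empty splits for them:
        #   >>> import pandas as pd
        #   >>> cat = pd.Categorical(["x"], categories=["x", "y"])
        #   >>> df = pd.DataFrame({"a": cat, "b": [1]})
        #   >>> [(k, len(g)) for k, g in df.groupby("a", observed=False)]
        #   [('x', 1), ('y', 0)]
        # hence the TODO above about skipping unobserved groups in transforms.)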
+ keys = self.result_index_and_codes[0] yield from zip(keys, splitter) @final @@ -626,27 +627,17 @@ def _get_splitter(self, data: NDFrame, axis: AxisInt = 0) -> DataSplitter: ------- Generator yielding subsetted objects """ - ids, _, ngroups = self.group_info + ids, ngroups = self.group_info + ids = self.result_index_and_codes[1] return _get_splitter( data, ids, ngroups, sorted_ids=self._sorted_ids, - sort_idx=self._sort_idx, + sort_idx=self.result_ilocs, axis=axis, ) - @final - @cache_readonly - def group_keys_seq(self): - if len(self.groupings) == 1: - return self.levels[0] - else: - ids, _, ngroups = self.group_info - - # provide "flattened" iterator for multi-group setting - return get_flattened_list(ids, ngroups, self.levels, self.codes) - @cache_readonly def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]: """dict {group name -> group indices}""" @@ -654,10 +645,10 @@ def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]: # This shows unused categories in indices GH#38642 return self.groupings[0].indices codes_list = [ping.codes for ping in self.groupings] - keys = [ping.group_index for ping in self.groupings] - return get_indexer_dict(codes_list, keys) + return get_indexer_dict(codes_list, self.levels) @final + @cache_readonly def result_ilocs(self) -> npt.NDArray[np.intp]: """ Get the original integer locations of result_index in the input. @@ -665,10 +656,7 @@ def result_ilocs(self) -> npt.NDArray[np.intp]: # Original indices are where group_index would go via sorting. # But when dropna is true, we need to remove null values while accounting for # any gaps that then occur because of them. - group_index = get_group_index( - self.codes, self.shape, sort=self._sort, xnull=True - ) - group_index, _ = compress_group_index(group_index, sort=self._sort) + group_index = self.result_index_and_codes[1] if self.has_dropped_na: mask = np.where(group_index >= 0) @@ -684,14 +672,17 @@ def result_ilocs(self) -> npt.NDArray[np.intp]: return result - @final @property def codes(self) -> list[npt.NDArray[np.signedinteger]]: return [ping.codes for ping in self.groupings] @property def levels(self) -> list[Index]: - return [ping.group_index for ping in self.groupings] + if len(self.groupings) > 1: + # mypy doesn't know result_index must be a MultiIndex + return list(self.result_index.levels) # type: ignore[attr-defined] + else: + return [self.result_index] @property def names(self) -> list[Hashable]: @@ -702,7 +693,7 @@ def size(self) -> Series: """ Compute group sizes. 
""" - ids, _, ngroups = self.group_info + ids, ngroups = self.group_info out: np.ndarray | list if ngroups: out = np.bincount(ids[ids != -1], minlength=ngroups) @@ -711,20 +702,26 @@ def size(self) -> Series: return Series(out, index=self.result_index, dtype="int64") @cache_readonly - def groups(self) -> dict[Hashable, np.ndarray]: + def groups(self) -> dict[Hashable, Index]: """dict {group name -> group labels}""" if len(self.groupings) == 1: - return self.groupings[0].groups + result = self.groupings[0].groups + return result + + if len(self.result_index) == 0: + index = self.result_index else: - to_groupby = [] - for ping in self.groupings: - gv = ping.grouping_vector - if not isinstance(gv, BaseGrouper): - to_groupby.append(gv) - else: - to_groupby.append(gv.groupings[0].grouping_vector) - index = MultiIndex.from_arrays(to_groupby) - return self.axis.groupby(index) + index = self.result_index.take(self.result_index_and_codes[1]) + categories = ( + self.result_index._values + if isinstance(self.result_index, MultiIndex) + else self.result_index + ) + values = index._values if isinstance(index, MultiIndex) else index + cats = Categorical(values, categories) + result = {k: self.axis.take(v) for k, v in cats._reverse_indexer().items()} + + return result @final @cache_readonly @@ -738,73 +735,141 @@ def has_dropped_na(self) -> bool: """ Whether grouper has null value(s) that are dropped. """ - return bool((self.group_info[0] < 0).any()) + return bool((self.result_index_and_codes[1] < 0).any()) @cache_readonly - def group_info(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]: - comp_ids, obs_group_ids = self._get_compressed_codes() - - ngroups = len(obs_group_ids) - comp_ids = ensure_platform_int(comp_ids) - - return comp_ids, obs_group_ids, ngroups + def group_info(self) -> tuple[npt.NDArray[np.intp], int]: + result_index, codes = self.result_index_and_codes + ngroups = len(result_index) + return codes, ngroups @cache_readonly def codes_info(self) -> npt.NDArray[np.intp]: # return the codes of items in original grouped axis - ids, _, _ = self.group_info + ids, _ = self.group_info return ids - @final - def _get_compressed_codes( - self, - ) -> tuple[npt.NDArray[np.signedinteger], npt.NDArray[np.intp]]: - # The first returned ndarray may have any signed integer dtype - if len(self.groupings) > 1: - group_index = get_group_index(self.codes, self.shape, sort=True, xnull=True) - return compress_group_index(group_index, sort=self._sort) - # FIXME: compress_group_index's second return value is int64, not intp - - ping = self.groupings[0] - return ping.codes, np.arange(len(ping.group_index), dtype=np.intp) - @final @cache_readonly def ngroups(self) -> int: return len(self.result_index) @property - def reconstructed_codes(self) -> list[npt.NDArray[np.intp]]: - codes = self.codes - ids, obs_ids, _ = self.group_info - return decons_obs_group_ids(ids, obs_ids, self.shape, codes, xnull=True) + def result_index(self) -> Index: + return self.result_index_and_codes[0] @cache_readonly - def result_index(self) -> Index: + def result_index_and_codes(self) -> tuple[Index, np.ndarray]: + from pandas.core.sorting import ( + compress_group_index, + decons_obs_group_ids, + get_group_index, + ) + + codes_and_uniques = [ping._codes_and_uniques for ping in self.groupings] + + codes = [e[0] for e in codes_and_uniques] + levels = [e[1] for e in codes_and_uniques] + # TODO: Modify in Grouping.groups instead? 
+ for k, (ping, level) in enumerate(zip(self.groupings, levels)): + if ping._passed_categorical: + # set_categories is dynamically added + levels[k] = level.set_categories( # type: ignore[union-attr] + ping._orig_cats + ) + names = self.names + + obs = [ + ping._observed or not ping._passed_categorical for ping in self.groupings + ] + ob_indices = [k for k, e in enumerate(obs) if e] + if len(self.groupings) == 1: - return self.groupings[0].result_index.rename(self.names[0]) + result_index: AnyArrayLike = levels[0] + if not isinstance(result_index, Index): + result_index = Index(levels[0], name=names[0]) + else: + result_index.name = names[0] + ids = codes[0].astype("intp") + return result_index, ids + elif any(obs): + ob_codes = [e for e, o in zip(codes, obs) if o] + ob_levels = [e for e, o in zip(levels, obs) if o] + ob_names = [e for e, o in zip(names, obs) if o] + + shape = tuple(len(level) for level in ob_levels) + group_index = get_group_index(ob_codes, shape, sort=True, xnull=True) + ob_ids, obs_group_ids = compress_group_index(group_index, sort=self._sort) + ob_ids = ensure_platform_int(ob_ids) + ids, obs_ids = ob_ids, obs_group_ids + ob_index_codes = decons_obs_group_ids( + ids, obs_ids, shape, ob_codes, xnull=True + ) - codes = self.reconstructed_codes - levels = [ping.result_index for ping in self.groupings] - return MultiIndex( - levels=levels, codes=codes, verify_integrity=False, names=self.names - ) + ob_index = MultiIndex( + levels=ob_levels, + codes=ob_index_codes, + names=ob_names, + verify_integrity=False, + ) + if not all(obs): + unob_codes = [e for e, o in zip(codes, obs) if not o] + unob_levels = [e for e, o in zip(levels, obs) if not o] + unob_names = [e for e, o in zip(names, obs) if not o] + + shape = tuple(len(level) for level in unob_levels) + unob_ids = get_group_index(unob_codes, shape, sort=True, xnull=True) + + unob_index = MultiIndex.from_product(unob_levels, names=unob_names) + + if all(obs): + result_index = ob_index + ids = ob_ids + elif not any(obs): + result_index = unob_index + ids = unob_ids + else: + ob_indices = [k for k, e in enumerate(obs) if e] + unob_indices = [k for k, e in enumerate(obs) if not e] + _, index, inverse = np.unique( + unob_indices + ob_indices, return_index=True, return_inverse=True + ) + result_index_codes = np.concatenate( + [ + np.tile(unob_index.codes, len(ob_index)), + np.repeat(ob_index.codes, len(unob_index), axis=1), + ], + axis=0, + ) + result_index = MultiIndex( + levels=[levels[k] for k in inverse], + codes=result_index_codes, + names=[names[k] for k in inverse], + ).reorder_levels(index) + + ids = len(unob_index) * ob_ids + unob_ids + sorter = result_index.argsort() + result_index = result_index.take(sorter) + _, inverse = np.unique(sorter, return_index=True) + ids = inverse.take(ids) + + if len(levels) == 1: + result_index = result_index.get_level_values(0) + + return result_index, ids + # TODO: How is this different from .levels? 
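        # (Worked example of the id combination above, illustrative numbers
        # only: with len(unob_index) == 2,
        #   >>> import numpy as np
        #   >>> ob_ids = np.array([0, 1, 1])
        #   >>> unob_ids = np.array([1, 0, 1])
        #   >>> (2 * ob_ids + unob_ids).tolist()
        #   [1, 2, 3]
        # each (observed, unobserved) pair lands in a distinct slot of the
        # cartesian product, with the observed id varying slowest.)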
@final - def get_group_levels(self) -> list[ArrayLike]: + def get_group_levels(self) -> list[Index]: # Note: only called from _insert_inaxis_grouper, which # is only called for BaseGrouper, never for BinGrouper + result_index = self.result_index_and_codes[0] if len(self.groupings) == 1: - return [self.groupings[0].group_arraylike] - - name_list = [] - for ping, codes in zip(self.groupings, self.reconstructed_codes): - codes = ensure_platform_int(codes) - levels = ping.group_arraylike.take(codes) - - name_list.append(levels) - - return name_list + return [result_index] + return [ + result_index.get_level_values(level) + for level in range(result_index.nlevels) + ] # ------------------------------------------------------------ # Aggregation functions @@ -826,7 +891,7 @@ def _cython_operation( cy_op = WrappedCythonOp(kind=kind, how=how, has_dropped_na=self.has_dropped_na) - ids, _, _ = self.group_info + _, ids = self.result_index_and_codes ngroups = self.ngroups return cy_op.cython_operation( values=values, @@ -881,7 +946,7 @@ def agg_series( def _aggregate_series_pure_python( self, obj: Series, func: Callable ) -> npt.NDArray[np.object_]: - _, _, ngroups = self.group_info + _, ngroups = self.group_info result = np.empty(ngroups, dtype="O") initialized = False @@ -907,7 +972,7 @@ def apply_groupwise( ) -> tuple[list, bool]: mutated = False splitter = self._get_splitter(data, axis=axis) - group_keys = self.group_keys_seq + group_keys = self.result_index result_values = [] # This calls DataSplitter.__iter__ @@ -944,17 +1009,13 @@ def apply_groupwise( # Methods for sorting subsets of our GroupBy's object @final - @cache_readonly - def _sort_idx(self) -> npt.NDArray[np.intp]: - # Counting sort indexer - ids, _, ngroups = self.group_info - return get_group_index_sorter(ids, ngroups) - - @final - @cache_readonly + @property def _sorted_ids(self) -> npt.NDArray[np.intp]: - ids, _, _ = self.group_info - return ids.take(self._sort_idx) + ids = self.result_index_and_codes[1] + result = ids.take(self.result_ilocs) + if getattr(self, "dropna", True): + result = result[result >= 0] + return result class BinGrouper(BaseGrouper): @@ -1025,7 +1086,7 @@ def nkeys(self) -> int: @cache_readonly def codes_info(self) -> npt.NDArray[np.intp]: # return the codes of items in original grouped axis - ids, _, _ = self.group_info + ids, _ = self.group_info if self.indexer is not None: sorter = np.lexsort((ids, self.indexer)) ids = ids[sorter] @@ -1069,9 +1130,8 @@ def indices(self): return indices @cache_readonly - def group_info(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]: + def group_info(self) -> tuple[npt.NDArray[np.intp], int]: ngroups = self.ngroups - obs_group_ids = np.arange(ngroups, dtype=np.intp) rep = np.diff(np.r_[0, self.bins]) rep = ensure_platform_int(rep) @@ -1080,16 +1140,7 @@ def group_info(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]: else: comp_ids = np.repeat(np.r_[-1, np.arange(ngroups)], rep) - return ( - ensure_platform_int(comp_ids), - obs_group_ids, - ngroups, - ) - - @cache_readonly - def reconstructed_codes(self) -> list[np.ndarray]: - # get unique result indices, and prepend 0 as groupby starts from the first - return [np.r_[0, np.flatnonzero(self.bins[1:] != self.bins[:-1]) + 1]] + return (ensure_platform_int(comp_ids), ngroups) @cache_readonly def result_index(self) -> Index: @@ -1098,6 +1149,14 @@ def result_index(self) -> Index: return self.binlabels + @cache_readonly + def codes(self) -> list[npt.NDArray[np.intp]]: + return [self.group_info[0]] + + 
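    # (Worked example of the bin expansion in group_info above, illustrative:
    # bin edges [2, 5] over five rows give two groups of sizes 2 and 3,
    #   >>> import numpy as np
    #   >>> bins = np.array([2, 5])
    #   >>> rep = np.diff(np.r_[0, bins])
    #   >>> np.repeat(np.arange(len(bins)), rep).tolist()
    #   [0, 0, 1, 1, 1]
    # in the else branch there is one more bin than label, so the leading
    # bin's rows are tagged -1.)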
@cache_readonly + def result_index_and_codes(self): + return self.result_index, self.group_info[0] + @property def levels(self) -> list[Index]: return [self.binlabels] diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 761f5df3bb4e0..03524fe49e6ee 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -6317,7 +6317,7 @@ def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: return True @final - def groupby(self, values) -> PrettyDict[Hashable, np.ndarray]: + def groupby(self, values) -> PrettyDict[Hashable, Index]: """ Group the index labels by a given array of values. diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 79354fdd12a2d..5380c6c7eeada 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -717,6 +717,8 @@ def crosstab( margins=margins, margins_name=margins_name, dropna=dropna, + # TODO: Not sure if this is okay + observed=dropna, **kwargs, # type: ignore[arg-type] ) diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index 398e9b09693e6..a5021bf72662c 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -519,7 +519,7 @@ def test_sum_uint64_overflow(): df = df + 9223372036854775807 index = Index( - [9223372036854775808, 9223372036854775810, 9223372036854775812], dtype=np.uint64 + [9223372036854775808, 9223372036854775810, 9223372036854775812], dtype=object ) expected = DataFrame( {1: [9223372036854775809, 9223372036854775811, 9223372036854775813]}, diff --git a/pandas/tests/groupby/methods/test_size.py b/pandas/tests/groupby/methods/test_size.py index 93a4e743d0d71..e83001372af4e 100644 --- a/pandas/tests/groupby/methods/test_size.py +++ b/pandas/tests/groupby/methods/test_size.py @@ -42,12 +42,14 @@ def test_size_axis_1(df, axis_1, by, sort, dropna): expected = expected.sort_index() if is_integer_dtype(expected.index.dtype) and not any(x is None for x in by): expected.index = expected.index.astype(int) - + if any(x is None for x in by): + expected.index = expected.index.astype(object) msg = "DataFrame.groupby with axis=1 is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): grouped = df.groupby(by=by, axis=axis_1, sort=sort, dropna=dropna) result = grouped.size() - tm.assert_series_equal(result, expected) + # TODO: Comes through as int-nan; expects float-nan + tm.assert_series_equal(result, expected, check_index_type=False) @pytest.mark.parametrize("by", ["A", "B", ["A", "B"]]) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 939dd176ae90e..844eb40e3ea2a 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -42,8 +42,8 @@ def f(a): # These expected values can be used across several tests (i.e. they are # the same for SeriesGroupBy and DataFrameGroupBy) but they should only be # hardcoded in one place. 
- "all": np.nan, - "any": np.nan, + "all": True, + "any": False, "count": 0, "corrwith": np.nan, "first": np.nan, @@ -56,7 +56,7 @@ def f(a): "min": np.nan, "nth": np.nan, "nunique": 0, - "prod": np.nan, + "prod": 1, "quantile": np.nan, "sem": np.nan, "size": 0, @@ -1275,11 +1275,7 @@ def test_seriesgroupby_observed_false_or_none(df_cat, observed, operation): names=["A", "B"], ).sortlevel() - expected = Series(data=[2, 4, np.nan, 1, np.nan, 3], index=index, name="C") - if operation == "agg": - msg = "The 'downcast' keyword in fillna is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - expected = expected.fillna(0, downcast="infer") + expected = Series(data=[2, 4, 0, 1, 0, 3], index=index, name="C") grouped = df_cat.groupby(["A", "B"], observed=observed)["C"] msg = "using SeriesGroupBy.sum" if operation == "agg" else "using np.sum" with tm.assert_produces_warning(FutureWarning, match=msg): @@ -1470,6 +1466,7 @@ def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans( zero_or_nan = _results_for_groupbys_with_missing_categories[reduction_func] + # TODO: Clean this up.. zero_or_nan can be bool and 1 for idx in unobserved: val = result.loc[idx] assert (pd.isna(zero_or_nan) and pd.isna(val)) or (val == zero_or_nan) @@ -1478,7 +1475,12 @@ def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans( # Except for .sum(). If the observed categories sum to dtype=float (i.e. their # sums have decimals), then the zeros for the missing categories should also be # floats. - if zero_or_nan == 0 and reduction_func != "sum": + if ( + zero_or_nan == 0 + and reduction_func != "sum" + and reduction_func != "any" + and reduction_func != "all" + ): assert np.issubdtype(result.dtype, np.integer) @@ -2124,15 +2126,6 @@ def test_agg_list(request, as_index, observed, reduction_func, test_series, keys elif reduction_func == "corrwith": msg = "GH#32293: attempts to call SeriesGroupBy.corrwith" request.applymarker(pytest.mark.xfail(reason=msg)) - elif ( - reduction_func == "nunique" - and not test_series - and len(keys) != 1 - and not observed - and not as_index - ): - msg = "GH#52848 - raises a ValueError" - request.applymarker(pytest.mark.xfail(reason=msg)) df = DataFrame({"a1": [0, 0, 1], "a2": [2, 3, 3], "b": [4, 5, 6]}) df = df.astype({"a1": "category", "a2": "category"}) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index de72afccbf1c0..66e5f2bf133f9 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -141,7 +141,7 @@ def test_groupby_nonobject_dtype(mframe, df_mixed_floats): expected = mframe.groupby(key.astype("O")).sum() assert result.index.dtype == np.int8 - assert expected.index.dtype == np.int64 + assert expected.index.dtype == object tm.assert_frame_equal(result, expected, check_index_type=False) # GH 3911, mixed frame non-conversion @@ -320,7 +320,7 @@ def test_len(): df = DataFrame({"a": [np.nan] * 3, "b": [1, 2, 3]}) assert len(df.groupby("a")) == 0 assert len(df.groupby("b")) == 3 - assert len(df.groupby(["a", "b"])) == 3 + assert len(df.groupby(["a", "b"])) == 0 def test_basic_regression(): diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index b40c8f45b6d19..464eb7c09cb47 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -494,7 +494,7 @@ def test_null_is_null_for_dtype( obj = df["a"] if test_series else df gb = obj.groupby(groups, dropna=False, sort=sort) 
result = gb.sum() - index = pd.Index([na_value_for_dtype(groups.dtype)]) + index = pd.Index([na_value_for_dtype(groups.dtype)], dtype=groups.dtype) expected = pd.DataFrame({"a": [3]}, index=index) if test_series: tm.assert_series_equal(result, expected["a"]) diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 8c2b95ba631ee..d3efd209f4f70 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -725,7 +725,7 @@ def test_level_preserve_order(self, sort, labels, mframe): # GH 17537 grouped = mframe.groupby(level=0, sort=sort) exp_labels = np.array(labels, np.intp) - tm.assert_almost_equal(grouped.grouper.codes[0], exp_labels) + tm.assert_almost_equal(grouped.grouper.result_index_and_codes[1], exp_labels) def test_grouping_labels(self, mframe): grouped = mframe.groupby(mframe.index.get_level_values(0)) @@ -794,11 +794,7 @@ def test_groupby_empty(self): gr.grouper.group_info[0], np.array([], dtype=np.dtype(np.intp)) ) - tm.assert_numpy_array_equal( - gr.grouper.group_info[1], np.array([], dtype=np.dtype(np.intp)) - ) - - assert gr.grouper.group_info[2] == 0 + assert gr.grouper.group_info[1] == 0 # check name assert s.groupby(s).grouper.names == ["name"] diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py index 0f4a73e4e2a38..7943e1ff3a81a 100644 --- a/pandas/tests/groupby/test_raises.py +++ b/pandas/tests/groupby/test_raises.py @@ -543,6 +543,7 @@ def test_groupby_raises_category_np( @pytest.mark.parametrize("how", ["method", "agg", "transform"]) def test_groupby_raises_category_on_category( + request, how, by, groupby_series, @@ -569,16 +570,6 @@ def test_groupby_raises_category_on_category( return empty_groups = not observed and any(group.empty for group in gb.groups.values()) - if ( - not observed - and how != "transform" - and isinstance(by, list) - and isinstance(by[0], str) - and by == ["a", "b"] - ): - assert not empty_groups - # TODO: empty_groups should be true due to unobserved categorical combinations - empty_groups = True if how == "transform": # empty groups will be ignored empty_groups = False diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index 80860d5192857..76eab9690b62e 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -68,7 +68,7 @@ def groupby_with_truncated_bingrouper(frame_for_truncated_bingrouper): gb = df.groupby(tdg) # check we're testing the case we're interested in - assert len(gb.grouper.result_index) != len(gb.grouper.group_keys_seq) + assert len(gb.grouper.result_index) != len(gb.grouper.codes) return gb diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 2d41b6d355ead..0728af588d256 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -261,9 +261,7 @@ def test_pivot_with_non_observable_dropna(self, dropna): expected = DataFrame( {"B": values}, index=Index( - Categorical.from_codes( - codes, categories=["low", "high"], ordered=dropna - ), + Categorical.from_codes(codes, categories=["low", "high"], ordered=True), name="A", ), ) From 31a7c928474b91fe0a4a2875408b36bfb31c11fb Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Mon, 30 Oct 2023 19:58:37 -0400 Subject: [PATCH 02/31] Refinements --- pandas/core/groupby/groupby.py | 3 +- pandas/core/groupby/ops.py | 50 +++++++++++++-------------- pandas/tests/groupby/test_grouping.py | 2 +- 3 files changed, 27 
insertions(+), 28 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 8c2703eceec69..e7e077e611359 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2042,7 +2042,7 @@ def _wrap_transform_fast_result(self, result: NDFrameT) -> NDFrameT: obj = self._obj_with_exclusions # for each col, reshape to size of original frame by take operation - ids = self.grouper.result_index_and_codes[1] + ids = self.grouper.ids result = result.reindex(self.grouper.result_index, axis=self.axis, copy=False) if self.obj.ndim == 1: @@ -4439,6 +4439,7 @@ def post_processor( pass_qs = None ids, ngroups = self.grouper.group_info + # TODO: Is this hit? ids = ids[ids >= 0] nqs = len(qs) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 1f180a76f7f79..3d1ca78f70a8c 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -579,6 +579,7 @@ def __init__( groupings: Sequence[grouper.Grouping], sort: bool = True, dropna: bool = True, + # TODO: Is this still needed? observed: bool = True, ) -> None: assert isinstance(axis, Index), axis @@ -616,8 +617,8 @@ def get_iterator( for each group """ splitter = self._get_splitter(data, axis=axis) - # TODO: Skip unobserved for transform? - keys = self.result_index_and_codes[0] + # TODO: Would be more efficient to skip unobserved for transforms + keys = self.result_index yield from zip(keys, splitter) @final @@ -628,7 +629,6 @@ def _get_splitter(self, data: NDFrame, axis: AxisInt = 0) -> DataSplitter: Generator yielding subsetted objects """ ids, ngroups = self.group_info - ids = self.result_index_and_codes[1] return _get_splitter( data, ids, @@ -656,15 +656,15 @@ def result_ilocs(self) -> npt.NDArray[np.intp]: # Original indices are where group_index would go via sorting. # But when dropna is true, we need to remove null values while accounting for # any gaps that then occur because of them. - group_index = self.result_index_and_codes[1] + ids = self.ids if self.has_dropped_na: - mask = np.where(group_index >= 0) + mask = np.where(ids >= 0) # Count how many gaps are caused by previous null values for each position - null_gaps = np.cumsum(group_index == -1)[mask] - group_index = group_index[mask] + null_gaps = np.cumsum(ids == -1)[mask] + ids = ids[mask] - result = get_group_index_sorter(group_index, self.ngroups) + result = get_group_index_sorter(ids, self.ngroups) if self.has_dropped_na: # Shift by the number of prior null gaps @@ -705,13 +705,11 @@ def size(self) -> Series: def groups(self) -> dict[Hashable, Index]: """dict {group name -> group labels}""" if len(self.groupings) == 1: - result = self.groupings[0].groups - return result - + return self.groupings[0].groups if len(self.result_index) == 0: index = self.result_index else: - index = self.result_index.take(self.result_index_and_codes[1]) + index = self.result_index.take(self.ids) categories = ( self.result_index._values if isinstance(self.result_index, MultiIndex) @@ -735,13 +733,13 @@ def has_dropped_na(self) -> bool: """ Whether grouper has null value(s) that are dropped. 
""" - return bool((self.result_index_and_codes[1] < 0).any()) + return bool((self.ids < 0).any()) @cache_readonly def group_info(self) -> tuple[npt.NDArray[np.intp], int]: - result_index, codes = self.result_index_and_codes + result_index, ids = self.result_index_and_ids ngroups = len(result_index) - return codes, ngroups + return ids, ngroups @cache_readonly def codes_info(self) -> npt.NDArray[np.intp]: @@ -756,10 +754,14 @@ def ngroups(self) -> int: @property def result_index(self) -> Index: - return self.result_index_and_codes[0] + return self.result_index_and_ids[0] + + @property + def ids(self) -> np.ndarray: + return self.result_index_and_ids[1] @cache_readonly - def result_index_and_codes(self) -> tuple[Index, np.ndarray]: + def result_index_and_ids(self) -> tuple[Index, np.ndarray]: from pandas.core.sorting import ( compress_group_index, decons_obs_group_ids, @@ -782,7 +784,6 @@ def result_index_and_codes(self) -> tuple[Index, np.ndarray]: obs = [ ping._observed or not ping._passed_categorical for ping in self.groupings ] - ob_indices = [k for k, e in enumerate(obs) if e] if len(self.groupings) == 1: result_index: AnyArrayLike = levels[0] @@ -863,7 +864,7 @@ def result_index_and_codes(self) -> tuple[Index, np.ndarray]: def get_group_levels(self) -> list[Index]: # Note: only called from _insert_inaxis_grouper, which # is only called for BaseGrouper, never for BinGrouper - result_index = self.result_index_and_codes[0] + result_index = self.result_index if len(self.groupings) == 1: return [result_index] return [ @@ -891,14 +892,12 @@ def _cython_operation( cy_op = WrappedCythonOp(kind=kind, how=how, has_dropped_na=self.has_dropped_na) - _, ids = self.result_index_and_codes - ngroups = self.ngroups return cy_op.cython_operation( values=values, axis=axis, min_count=min_count, - comp_ids=ids, - ngroups=ngroups, + comp_ids=self.ids, + ngroups=self.ngroups, **kwargs, ) @@ -1011,8 +1010,7 @@ def apply_groupwise( @final @property def _sorted_ids(self) -> npt.NDArray[np.intp]: - ids = self.result_index_and_codes[1] - result = ids.take(self.result_ilocs) + result = self.ids.take(self.result_ilocs) if getattr(self, "dropna", True): result = result[result >= 0] return result @@ -1154,7 +1152,7 @@ def codes(self) -> list[npt.NDArray[np.intp]]: return [self.group_info[0]] @cache_readonly - def result_index_and_codes(self): + def result_index_and_ids(self): return self.result_index, self.group_info[0] @property diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index d3efd209f4f70..ac4d57eeb28b8 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -725,7 +725,7 @@ def test_level_preserve_order(self, sort, labels, mframe): # GH 17537 grouped = mframe.groupby(level=0, sort=sort) exp_labels = np.array(labels, np.intp) - tm.assert_almost_equal(grouped.grouper.result_index_and_codes[1], exp_labels) + tm.assert_almost_equal(grouped.grouper.ids, exp_labels) def test_grouping_labels(self, mframe): grouped = mframe.groupby(mframe.index.get_level_values(0)) From 8ce08d15503c0e5189422b75ee07c4d6201acfbb Mon Sep 17 00:00:00 2001 From: richard Date: Tue, 31 Oct 2023 22:00:37 -0400 Subject: [PATCH 03/31] Refinements --- pandas/core/groupby/groupby.py | 6 +++--- pandas/core/groupby/grouper.py | 8 +------- pandas/core/groupby/ops.py | 4 ---- 3 files changed, 4 insertions(+), 14 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index e7e077e611359..bcdc67f612873 100644 --- 
a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -4439,13 +4439,13 @@ def post_processor( pass_qs = None ids, ngroups = self.grouper.group_info - # TODO: Is this hit? - ids = ids[ids >= 0] + if self.dropna: + ids = ids[ids >= 0] nqs = len(qs) func = partial( libgroupby.group_quantile, - labels=ids[ids >= 0], + labels=ids, qs=qs, interpolation=interpolation, starts=starts, diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index ff249e24d0c49..a9abcc787e774 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -735,10 +735,6 @@ def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]: codes = np.where(codes >= na_code, codes + 1, codes) codes = np.where(na_mask, na_code, codes) - # TODO: Can this be removed? - if not self._observed: - uniques = uniques.reorder_categories(self._orig_cats) - return codes, uniques elif isinstance(self.grouping_vector, ops.BaseGrouper): @@ -998,9 +994,7 @@ def is_in_obj(gpr) -> bool: groupings.append(Grouping(Index([], dtype="int"), np.array([], dtype=np.intp))) # create the internals grouper - grouper = ops.BaseGrouper( - group_axis, groupings, sort=sort, dropna=dropna, observed=observed - ) + grouper = ops.BaseGrouper(group_axis, groupings, sort=sort, dropna=dropna) return grouper, frozenset(exclusions), obj diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 3d1ca78f70a8c..e2e9be685bcc9 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -579,8 +579,6 @@ def __init__( groupings: Sequence[grouper.Grouping], sort: bool = True, dropna: bool = True, - # TODO: Is this still needed? - observed: bool = True, ) -> None: assert isinstance(axis, Index), axis @@ -588,7 +586,6 @@ def __init__( self._groupings: list[grouper.Grouping] = list(groupings) self._sort = sort self.dropna = dropna - self.observed = observed @property def groupings(self) -> list[grouper.Grouping]: @@ -859,7 +856,6 @@ def result_index_and_ids(self) -> tuple[Index, np.ndarray]: return result_index, ids - # TODO: How is this different from .levels? @final def get_group_levels(self) -> list[Index]: # Note: only called from _insert_inaxis_grouper, which From 6296f4afecb45ca114afc72253409269bc45228b Mon Sep 17 00:00:00 2001 From: richard Date: Wed, 1 Nov 2023 18:10:54 -0400 Subject: [PATCH 04/31] Refinements --- pandas/core/groupby/ops.py | 2 +- pandas/tests/groupby/aggregate/test_other.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index e2e9be685bcc9..a2df7887b1be2 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -769,9 +769,9 @@ def result_index_and_ids(self) -> tuple[Index, np.ndarray]: codes = [e[0] for e in codes_and_uniques] levels = [e[1] for e in codes_and_uniques] - # TODO: Modify in Grouping.groups instead? for k, (ping, level) in enumerate(zip(self.groupings, levels)): if ping._passed_categorical: + # TODO: Modify in Grouping.groups instead? 
# set_categories is dynamically added levels[k] = level.set_categories( # type: ignore[union-attr] ping._orig_cats diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index a5021bf72662c..8f0f14e1d02af 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -513,8 +513,8 @@ def test_agg_timezone_round_trip(): def test_sum_uint64_overflow(): - # see gh-14758 - # Convert to uint64 and don't overflow + # GH#14758 + # Don't coerce object to more specific dtype df = DataFrame([[1, 2], [3, 4], [5, 6]], dtype=object) df = df + 9223372036854775807 From 714142518acc35d84c8a31607cbe0252dd21a501 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 5 Nov 2023 08:15:43 -0500 Subject: [PATCH 05/31] Restore inferring index dtype --- pandas/core/groupby/ops.py | 2 +- pandas/tests/groupby/aggregate/test_other.py | 2 +- pandas/tests/groupby/test_groupby.py | 2 +- pandas/tests/groupby/test_groupby_dropna.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 3d1ca78f70a8c..ace50de847e90 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -771,7 +771,7 @@ def result_index_and_ids(self) -> tuple[Index, np.ndarray]: codes_and_uniques = [ping._codes_and_uniques for ping in self.groupings] codes = [e[0] for e in codes_and_uniques] - levels = [e[1] for e in codes_and_uniques] + levels = [Index._with_infer(e[1]) for e in codes_and_uniques] # TODO: Modify in Grouping.groups instead? for k, (ping, level) in enumerate(zip(self.groupings, levels)): if ping._passed_categorical: diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index a5021bf72662c..398e9b09693e6 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -519,7 +519,7 @@ def test_sum_uint64_overflow(): df = df + 9223372036854775807 index = Index( - [9223372036854775808, 9223372036854775810, 9223372036854775812], dtype=object + [9223372036854775808, 9223372036854775810, 9223372036854775812], dtype=np.uint64 ) expected = DataFrame( {1: [9223372036854775809, 9223372036854775811, 9223372036854775813]}, diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 8a878936946f8..50dc0589c43c8 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -141,7 +141,7 @@ def test_groupby_nonobject_dtype(mframe, df_mixed_floats): expected = mframe.groupby(key.astype("O")).sum() assert result.index.dtype == np.int8 - assert expected.index.dtype == object + assert expected.index.dtype == np.int64 tm.assert_frame_equal(result, expected, check_index_type=False) # GH 3911, mixed frame non-conversion diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 464eb7c09cb47..b40c8f45b6d19 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -494,7 +494,7 @@ def test_null_is_null_for_dtype( obj = df["a"] if test_series else df gb = obj.groupby(groups, dropna=False, sort=sort) result = gb.sum() - index = pd.Index([na_value_for_dtype(groups.dtype)], dtype=groups.dtype) + index = pd.Index([na_value_for_dtype(groups.dtype)]) expected = pd.DataFrame({"a": [3]}, index=index) if test_series: tm.assert_series_equal(result, expected["a"]) From e39cbc8e1ac042c53de51aec0627c587bc49baa7 Mon 
Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 5 Nov 2023 08:29:52 -0500 Subject: [PATCH 06/31] Test fixups --- pandas/tests/groupby/aggregate/test_other.py | 4 ++-- pandas/tests/groupby/methods/test_size.py | 6 ++---- pandas/tests/groupby/test_categorical.py | 16 +++++++--------- pandas/tests/groupby/test_raises.py | 1 - 4 files changed, 11 insertions(+), 16 deletions(-) diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index fd441d5761883..398e9b09693e6 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -513,8 +513,8 @@ def test_agg_timezone_round_trip(): def test_sum_uint64_overflow(): - # GH#14758 - # Don't coerce object to more specific dtype + # see gh-14758 + # Convert to uint64 and don't overflow df = DataFrame([[1, 2], [3, 4], [5, 6]], dtype=object) df = df + 9223372036854775807 diff --git a/pandas/tests/groupby/methods/test_size.py b/pandas/tests/groupby/methods/test_size.py index e83001372af4e..93a4e743d0d71 100644 --- a/pandas/tests/groupby/methods/test_size.py +++ b/pandas/tests/groupby/methods/test_size.py @@ -42,14 +42,12 @@ def test_size_axis_1(df, axis_1, by, sort, dropna): expected = expected.sort_index() if is_integer_dtype(expected.index.dtype) and not any(x is None for x in by): expected.index = expected.index.astype(int) - if any(x is None for x in by): - expected.index = expected.index.astype(object) + msg = "DataFrame.groupby with axis=1 is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): grouped = df.groupby(by=by, axis=axis_1, sort=sort, dropna=dropna) result = grouped.size() - # TODO: Comes through as int-nan; expects float-nan - tm.assert_series_equal(result, expected, check_index_type=False) + tm.assert_series_equal(result, expected) @pytest.mark.parametrize("by", ["A", "B", ["A", "B"]]) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 844eb40e3ea2a..8ee2303193d03 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -1464,24 +1464,22 @@ def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans( result = agg(*args) - zero_or_nan = _results_for_groupbys_with_missing_categories[reduction_func] + missing_fillin = _results_for_groupbys_with_missing_categories[reduction_func] # TODO: Clean this up.. zero_or_nan can be bool and 1 for idx in unobserved: val = result.loc[idx] - assert (pd.isna(zero_or_nan) and pd.isna(val)) or (val == zero_or_nan) + assert (pd.isna(missing_fillin) and pd.isna(val)) or (val == missing_fillin) # If we expect unobserved values to be zero, we also expect the dtype to be int. # Except for .sum(). If the observed categories sum to dtype=float (i.e. their # sums have decimals), then the zeros for the missing categories should also be # floats. 
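    # (Illustrative of the dtype point above, a sketch:
    #   >>> import pandas as pd
    #   >>> cat = pd.Categorical(["x", "x"], categories=["x", "y"])
    #   >>> pd.Series([1.5, 2.5]).groupby(cat, observed=False).sum()
    #   x    4.0
    #   y    0.0
    #   dtype: float64
    # the unobserved category's zero takes the float dtype of the sums.)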
- if ( - zero_or_nan == 0 - and reduction_func != "sum" - and reduction_func != "any" - and reduction_func != "all" - ): - assert np.issubdtype(result.dtype, np.integer) + if missing_fillin == 0: + if reduction_func in ["count", "nunique", "size"]: + assert np.issubdtype(result.dtype, np.integer) + else: + assert reduction_func in ["sum", "any", "all"] def test_dataframe_groupby_on_2_categoricals_when_observed_is_true(reduction_func): diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py index 7943e1ff3a81a..45a527cf7813d 100644 --- a/pandas/tests/groupby/test_raises.py +++ b/pandas/tests/groupby/test_raises.py @@ -543,7 +543,6 @@ def test_groupby_raises_category_np( @pytest.mark.parametrize("how", ["method", "agg", "transform"]) def test_groupby_raises_category_on_category( - request, how, by, groupby_series, From c82bd65fbf6a46e6ffd1d2fb948540b23f421d1a Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 5 Nov 2023 08:34:47 -0500 Subject: [PATCH 07/31] Refinements --- pandas/core/groupby/ops.py | 10 ++-------- pandas/tests/groupby/test_categorical.py | 3 +-- 2 files changed, 3 insertions(+), 10 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index e9cee2a04e856..930f0f8cba17a 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -772,10 +772,7 @@ def result_index_and_ids(self) -> tuple[Index, np.ndarray]: for k, (ping, level) in enumerate(zip(self.groupings, levels)): if ping._passed_categorical: # TODO: Modify in Grouping.groups instead? - # set_categories is dynamically added - levels[k] = level.set_categories( # type: ignore[union-attr] - ping._orig_cats - ) + levels[k] = level.set_categories(ping._orig_cats) names = self.names obs = [ @@ -784,10 +781,7 @@ def result_index_and_ids(self) -> tuple[Index, np.ndarray]: if len(self.groupings) == 1: result_index: AnyArrayLike = levels[0] - if not isinstance(result_index, Index): - result_index = Index(levels[0], name=names[0]) - else: - result_index.name = names[0] + result_index.name = names[0] ids = codes[0].astype("intp") return result_index, ids elif any(obs): diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 8ee2303193d03..064364047e03e 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -1466,7 +1466,6 @@ def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans( missing_fillin = _results_for_groupbys_with_missing_categories[reduction_func] - # TODO: Clean this up.. 
zero_or_nan can be bool and 1 for idx in unobserved: val = result.loc[idx] assert (pd.isna(missing_fillin) and pd.isna(val)) or (val == missing_fillin) @@ -1479,7 +1478,7 @@ def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans( if reduction_func in ["count", "nunique", "size"]: assert np.issubdtype(result.dtype, np.integer) else: - assert reduction_func in ["sum", "any", "all"] + assert reduction_func in ["sum", "any"] def test_dataframe_groupby_on_2_categoricals_when_observed_is_true(reduction_func): From 3a9892d566c9b0cf3abf8a504614521a8fe5fc9f Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 5 Nov 2023 08:49:57 -0500 Subject: [PATCH 08/31] Refinements --- pandas/core/groupby/groupby.py | 1 + pandas/core/groupby/ops.py | 3 ++- pandas/core/reshape/pivot.py | 2 -- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index bcdc67f612873..f3878edacfb87 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -4440,6 +4440,7 @@ def post_processor( ids, ngroups = self.grouper.group_info if self.dropna: + # splitter drops NA groups, we need to do the same ids = ids[ids >= 0] nqs = len(qs) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 930f0f8cba17a..78fa4a58883ad 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -998,10 +998,11 @@ def apply_groupwise( # Methods for sorting subsets of our GroupBy's object @final - @property + @cache_readonly def _sorted_ids(self) -> npt.NDArray[np.intp]: result = self.ids.take(self.result_ilocs) if getattr(self, "dropna", True): + # BinGrouper has no dropna result = result[result >= 0] return result diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 5380c6c7eeada..79354fdd12a2d 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -717,8 +717,6 @@ def crosstab( margins=margins, margins_name=margins_name, dropna=dropna, - # TODO: Not sure if this is okay - observed=dropna, **kwargs, # type: ignore[arg-type] ) From 25770beef05b7d19af4e7a2482d6680a36864797 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 5 Nov 2023 11:46:22 -0500 Subject: [PATCH 09/31] fixup --- pandas/core/groupby/ops.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 78fa4a58883ad..db7f957df4c58 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -24,7 +24,6 @@ ) import pandas._libs.groupby as libgroupby from pandas._typing import ( - AnyArrayLike, ArrayLike, AxisInt, NDFrameT, @@ -767,7 +766,7 @@ def result_index_and_ids(self) -> tuple[Index, np.ndarray]: codes_and_uniques = [ping._codes_and_uniques for ping in self.groupings] - codes = [e[0] for e in codes_and_uniques] + codes = [e[0].astype("intp", copy=False) for e in codes_and_uniques] levels = [Index._with_infer(e[1]) for e in codes_and_uniques] for k, (ping, level) in enumerate(zip(self.groupings, levels)): if ping._passed_categorical: @@ -780,9 +779,9 @@ def result_index_and_ids(self) -> tuple[Index, np.ndarray]: ] if len(self.groupings) == 1: - result_index: AnyArrayLike = levels[0] + result_index = levels[0] result_index.name = names[0] - ids = codes[0].astype("intp") + ids = codes[0] return result_index, ids elif any(obs): ob_codes = [e for e, o in zip(codes, obs) if o] From a338efc9539330ad210c35a3e6bf901761fa54b8 Mon Sep 17 00:00:00 2001 From: richard Date: Sun, 5 Nov 2023 18:22:41 
-0500 Subject: [PATCH 10/31] fixup --- pandas/core/groupby/ops.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index db7f957df4c58..b00e7bf7b961a 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -766,7 +766,7 @@ def result_index_and_ids(self) -> tuple[Index, np.ndarray]: codes_and_uniques = [ping._codes_and_uniques for ping in self.groupings] - codes = [e[0].astype("intp", copy=False) for e in codes_and_uniques] + codes = [e[0] for e in codes_and_uniques] levels = [Index._with_infer(e[1]) for e in codes_and_uniques] for k, (ping, level) in enumerate(zip(self.groupings, levels)): if ping._passed_categorical: @@ -810,6 +810,7 @@ def result_index_and_ids(self) -> tuple[Index, np.ndarray]: shape = tuple(len(level) for level in unob_levels) unob_ids = get_group_index(unob_codes, shape, sort=True, xnull=True) + unob_ids = ensure_platform_int(unob_ids) unob_index = MultiIndex.from_product(unob_levels, names=unob_names) From dbdec9f6dcdeeb69b712924f235474f5269e1a29 Mon Sep 17 00:00:00 2001 From: richard Date: Sun, 5 Nov 2023 18:23:13 -0500 Subject: [PATCH 11/31] fixup --- pandas/core/reshape/pivot.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 79354fdd12a2d..1793ceea15f2a 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -717,6 +717,7 @@ def crosstab( margins=margins, margins_name=margins_name, dropna=dropna, + observed=dropna, **kwargs, # type: ignore[arg-type] ) From 0ae70b78a0b54f43214217cf499683266c6d7994 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 12 Nov 2023 09:50:54 -0500 Subject: [PATCH 12/31] Fix sorting and non-sorting --- pandas/core/groupby/groupby.py | 35 ++++------ pandas/core/groupby/grouper.py | 4 ++ pandas/core/groupby/ops.py | 70 +++++++++---------- .../groupby/methods/test_value_counts.py | 22 +++--- 4 files changed, 62 insertions(+), 69 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index f3878edacfb87..77a140f78ccf9 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -101,7 +101,6 @@ class providing the base-class of operations. from pandas.core.arrays import ( ArrowExtensionArray, BaseMaskedArray, - Categorical, ExtensionArray, FloatingArray, IntegerArray, @@ -130,7 +129,6 @@ class providing the base-class of operations. 
GroupByNthSelector, ) from pandas.core.indexes.api import ( - CategoricalIndex, Index, MultiIndex, RangeIndex, @@ -2806,18 +2804,20 @@ def _value_counts( result_series = cast(Series, gb.size()) result_series.name = name - # GH-46357 Include non-observed categories - # of non-grouping columns regardless of `observed` - if any( - isinstance(grouping.grouping_vector, (Categorical, CategoricalIndex)) - and not grouping._observed - for grouping in groupings - ): - levels_list = gb.grouper.levels - multi_index, _ = MultiIndex.from_product( - levels_list, names=[ping.name for ping in groupings] - ).sortlevel() - result_series = result_series.reindex(multi_index, fill_value=0) + if sort: + # Sort the values and then resort by the main grouping + # TODO: HACK - sort_index gets confused if index names are integers + names = result_series.index.names + result_series.index.names = range(len(names)) + index_level = list(range(len(self.grouper.groupings))) + result_series = result_series.sort_values( + ascending=ascending, kind="stable" + ) + if self.sort: + result_series = result_series.sort_index( + level=index_level, sort_remaining=False + ) + result_series.index.names = names if normalize: # Normalize the results by dividing by the original group sizes. @@ -2838,13 +2838,6 @@ def _value_counts( # Handle groups of non-observed categories result_series = result_series.fillna(0.0) - if sort: - # Sort the values and then resort by the main grouping - index_level = range(len(self.grouper.groupings)) - result_series = result_series.sort_values(ascending=ascending).sort_index( - level=index_level, sort_remaining=False - ) - result: Series | DataFrame if self.as_index: result = result_series diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index a9abcc787e774..68aef3e98ac7c 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -690,6 +690,10 @@ def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]: def codes(self) -> npt.NDArray[np.signedinteger]: return self._codes_and_uniques[0] + @property + def uniques(self) -> ArrayLike: + return self._codes_and_uniques[1] + @cache_readonly def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]: uniques: ArrayLike diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index b00e7bf7b961a..b1ddec6eb59fa 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -61,6 +61,9 @@ ) from pandas.core.series import Series from pandas.core.sorting import ( + compress_group_index, + decons_obs_group_ids, + get_group_index, get_group_index_sorter, get_indexer_dict, ) @@ -758,51 +761,42 @@ def ids(self) -> np.ndarray: @cache_readonly def result_index_and_ids(self) -> tuple[Index, np.ndarray]: - from pandas.core.sorting import ( - compress_group_index, - decons_obs_group_ids, - get_group_index, - ) - - codes_and_uniques = [ping._codes_and_uniques for ping in self.groupings] - - codes = [e[0] for e in codes_and_uniques] - levels = [Index._with_infer(e[1]) for e in codes_and_uniques] - for k, (ping, level) in enumerate(zip(self.groupings, levels)): - if ping._passed_categorical: - # TODO: Modify in Grouping.groups instead? 
- levels[k] = level.set_categories(ping._orig_cats) names = self.names - + codes = [ping.codes for ping in self.groupings] + levels = [Index._with_infer(ping.uniques) for ping in self.groupings] obs = [ ping._observed or not ping._passed_categorical for ping in self.groupings ] + # When passed a categorical grouping, keep all categories + for k, (ping, level) in enumerate(zip(self.groupings, levels)): + if ping._passed_categorical: + levels[k] = level.set_categories(ping._orig_cats) if len(self.groupings) == 1: result_index = levels[0] result_index.name = names[0] - ids = codes[0] + ids = codes[0].astype("intp", copy=False) return result_index, ids - elif any(obs): - ob_codes = [e for e, o in zip(codes, obs) if o] - ob_levels = [e for e, o in zip(levels, obs) if o] - ob_names = [e for e, o in zip(names, obs) if o] + + if any(obs): + ob_codes = [code for code, ob in zip(codes, obs) if ob] + ob_levels = [level for level, ob in zip(levels, obs) if ob] + ob_names = [name for name, ob in zip(names, obs) if ob] shape = tuple(len(level) for level in ob_levels) group_index = get_group_index(ob_codes, shape, sort=True, xnull=True) ob_ids, obs_group_ids = compress_group_index(group_index, sort=self._sort) ob_ids = ensure_platform_int(ob_ids) - ids, obs_ids = ob_ids, obs_group_ids ob_index_codes = decons_obs_group_ids( - ids, obs_ids, shape, ob_codes, xnull=True + ob_ids, obs_group_ids, shape, ob_codes, xnull=True ) - ob_index = MultiIndex( levels=ob_levels, codes=ob_index_codes, names=ob_names, verify_integrity=False, ) + if not all(obs): unob_codes = [e for e, o in zip(codes, obs) if not o] unob_levels = [e for e, o in zip(levels, obs) if not o] @@ -811,7 +805,6 @@ def result_index_and_ids(self) -> tuple[Index, np.ndarray]: shape = tuple(len(level) for level in unob_levels) unob_ids = get_group_index(unob_codes, shape, sort=True, xnull=True) unob_ids = ensure_platform_int(unob_ids) - unob_index = MultiIndex.from_product(unob_levels, names=unob_names) if all(obs): @@ -821,11 +814,9 @@ def result_index_and_ids(self) -> tuple[Index, np.ndarray]: result_index = unob_index ids = unob_ids else: - ob_indices = [k for k, e in enumerate(obs) if e] + # Combine unobserved and observed parts of result_index unob_indices = [k for k, e in enumerate(obs) if not e] - _, index, inverse = np.unique( - unob_indices + ob_indices, return_index=True, return_inverse=True - ) + ob_indices = [k for k, e in enumerate(obs) if e] result_index_codes = np.concatenate( [ np.tile(unob_index.codes, len(ob_index)), @@ -833,20 +824,25 @@ def result_index_and_ids(self) -> tuple[Index, np.ndarray]: ], axis=0, ) + _, index = np.unique(unob_indices + ob_indices, return_index=True) result_index = MultiIndex( - levels=[levels[k] for k in inverse], + levels=list(unob_index.levels) + list(ob_index.levels), codes=result_index_codes, - names=[names[k] for k in inverse], + names=list(unob_index.names) + list(ob_index.names), ).reorder_levels(index) - ids = len(unob_index) * ob_ids + unob_ids - sorter = result_index.argsort() - result_index = result_index.take(sorter) - _, inverse = np.unique(sorter, return_index=True) - ids = inverse.take(ids) - if len(levels) == 1: - result_index = result_index.get_level_values(0) + if self._sort: + sorter = result_index.argsort() + result_index = result_index.take(sorter) + _, inverse = np.unique(sorter, return_index=True) + ids = inverse.take(ids) + else: + ids, uniques = compress_group_index(ids, sort=False) + taker = np.concatenate( + [uniques, np.delete(np.arange(len(result_index)), uniques)] + ) + 
result_index = result_index.take(taker) return result_index, ids diff --git a/pandas/tests/groupby/methods/test_value_counts.py b/pandas/tests/groupby/methods/test_value_counts.py index b82908ef2aa21..3760cbdc0ce1a 100644 --- a/pandas/tests/groupby/methods/test_value_counts.py +++ b/pandas/tests/groupby/methods/test_value_counts.py @@ -385,8 +385,8 @@ def test_against_frame_and_seriesgroupby( "sort, ascending, expected_rows, expected_count, expected_group_size", [ (False, None, [0, 1, 2, 3, 4], [1, 1, 1, 2, 1], [1, 3, 1, 3, 1]), - (True, False, [4, 3, 1, 2, 0], [1, 2, 1, 1, 1], [1, 3, 3, 1, 1]), - (True, True, [4, 1, 3, 2, 0], [1, 1, 2, 1, 1], [1, 3, 3, 1, 1]), + (True, False, [3, 0, 1, 2, 4], [2, 1, 1, 1, 1], [3, 1, 3, 1, 1]), + (True, True, [0, 1, 2, 4, 3], [1, 1, 1, 1, 2], [1, 3, 1, 1, 3]), ], ) def test_compound( @@ -617,7 +617,7 @@ def test_categorical_single_grouper_with_only_observed_categories( ) gp = education_df.astype("category").groupby( - "country", as_index=as_index, observed=observed + "country", as_index=as_index, observed=observed, sort=True ) result = gp.value_counts(normalize=normalize) @@ -811,19 +811,19 @@ def test_categorical_single_grouper_observed_false( ("FR", "female", "high"), ("FR", "male", "medium"), ("FR", "female", "low"), - ("FR", "male", "high"), ("FR", "female", "medium"), + ("FR", "male", "high"), ("US", "female", "high"), ("US", "male", "low"), - ("US", "male", "medium"), - ("US", "male", "high"), - ("US", "female", "medium"), ("US", "female", "low"), - ("ASIA", "male", "low"), - ("ASIA", "male", "high"), - ("ASIA", "female", "medium"), - ("ASIA", "female", "low"), + ("US", "female", "medium"), + ("US", "male", "high"), + ("US", "male", "medium"), ("ASIA", "female", "high"), + ("ASIA", "female", "low"), + ("ASIA", "female", "medium"), + ("ASIA", "male", "high"), + ("ASIA", "male", "low"), ("ASIA", "male", "medium"), ] From 99d2bebdac4875a79f0ef20874af453f8eb3ef12 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 12 Nov 2023 09:53:00 -0500 Subject: [PATCH 13/31] Cleanup --- pandas/core/groupby/ops.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index b1ddec6eb59fa..db1b06ff7d40e 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -762,7 +762,7 @@ def ids(self) -> np.ndarray: @cache_readonly def result_index_and_ids(self) -> tuple[Index, np.ndarray]: names = self.names - codes = [ping.codes for ping in self.groupings] + codes = [ensure_platform_int(ping.codes) for ping in self.groupings] levels = [Index._with_infer(ping.uniques) for ping in self.groupings] obs = [ ping._observed or not ping._passed_categorical for ping in self.groupings @@ -775,7 +775,7 @@ def result_index_and_ids(self) -> tuple[Index, np.ndarray]: if len(self.groupings) == 1: result_index = levels[0] result_index.name = names[0] - ids = codes[0].astype("intp", copy=False) + ids = codes[0] return result_index, ids if any(obs): @@ -786,7 +786,6 @@ def result_index_and_ids(self) -> tuple[Index, np.ndarray]: shape = tuple(len(level) for level in ob_levels) group_index = get_group_index(ob_codes, shape, sort=True, xnull=True) ob_ids, obs_group_ids = compress_group_index(group_index, sort=self._sort) - ob_ids = ensure_platform_int(ob_ids) ob_index_codes = decons_obs_group_ids( ob_ids, obs_group_ids, shape, ob_codes, xnull=True ) @@ -804,7 +803,6 @@ def result_index_and_ids(self) -> tuple[Index, np.ndarray]: shape = tuple(len(level) for level in unob_levels) unob_ids = 
get_group_index(unob_codes, shape, sort=True, xnull=True) - unob_ids = ensure_platform_int(unob_ids) unob_index = MultiIndex.from_product(unob_levels, names=unob_names) if all(obs): From a477dc06bcb151c1aeac0a28e2688e3d7f4ff17c Mon Sep 17 00:00:00 2001 From: richard Date: Mon, 13 Nov 2023 18:35:13 -0500 Subject: [PATCH 14/31] Call ensure_platform_int last --- pandas/core/groupby/ops.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index db1b06ff7d40e..d59027f281c29 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -762,7 +762,7 @@ def ids(self) -> np.ndarray: @cache_readonly def result_index_and_ids(self) -> tuple[Index, np.ndarray]: names = self.names - codes = [ensure_platform_int(ping.codes) for ping in self.groupings] + codes = [ping.codes for ping in self.groupings] levels = [Index._with_infer(ping.uniques) for ping in self.groupings] obs = [ ping._observed or not ping._passed_categorical for ping in self.groupings @@ -807,10 +807,10 @@ def result_index_and_ids(self) -> tuple[Index, np.ndarray]: if all(obs): result_index = ob_index - ids = ob_ids + ids = ensure_platform_int(ob_ids) elif not any(obs): result_index = unob_index - ids = unob_ids + ids = ensure_platform_int(unob_ids) else: # Combine unobserved and observed parts of result_index unob_indices = [k for k, e in enumerate(obs) if not e] @@ -841,6 +841,7 @@ def result_index_and_ids(self) -> tuple[Index, np.ndarray]: [uniques, np.delete(np.arange(len(result_index)), uniques)] ) result_index = result_index.take(taker) + ids = ensure_platform_int(ids) return result_index, ids From 7fb7ca61a67fa9a1ab9b7a7d641706dc72ae079b Mon Sep 17 00:00:00 2001 From: richard Date: Mon, 13 Nov 2023 22:02:25 -0500 Subject: [PATCH 15/31] fixup --- pandas/core/groupby/ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index d59027f281c29..525a58699c3b6 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -775,7 +775,7 @@ def result_index_and_ids(self) -> tuple[Index, np.ndarray]: if len(self.groupings) == 1: result_index = levels[0] result_index.name = names[0] - ids = codes[0] + ids = ensure_platform_int(codes[0]) return result_index, ids From b79cc850ece10d33c98caa13f43be017265aa497 Mon Sep 17 00:00:00 2001 From: richard Date: Mon, 13 Nov 2023 23:16:55 -0500 Subject: [PATCH 16/31] fixup --- pandas/core/groupby/ops.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 525a58699c3b6..64f5df7489b63 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -834,14 +834,15 @@ def result_index_and_ids(self) -> tuple[Index, np.ndarray]: sorter = result_index.argsort() result_index = result_index.take(sorter) _, inverse = np.unique(sorter, return_index=True) + ids = ensure_platform_int(ids) ids = inverse.take(ids) else: ids, uniques = compress_group_index(ids, sort=False) + ids = ensure_platform_int(ids) taker = np.concatenate( [uniques, np.delete(np.arange(len(result_index)), uniques)] ) result_index = result_index.take(taker) - ids = ensure_platform_int(ids) return result_index, ids From da9169dd3923482c6911a387ca5ee248f390299e Mon Sep 17 00:00:00 2001 From: richard Date: Wed, 16 Aug 2023 14:19:33 -0400 Subject: [PATCH 17/31] REF: Compute correct result_index upfront in groupby --- pandas/core/groupby/generic.py | 24 +-
pandas/core/groupby/groupby.py | 193 +++---------- pandas/core/groupby/grouper.py | 84 ++---- pandas/core/groupby/ops.py | 265 ++++++++++-------- pandas/core/indexes/base.py | 2 +- pandas/core/reshape/pivot.py | 1 + .../groupby/methods/test_value_counts.py | 22 +- pandas/tests/groupby/test_categorical.py | 32 +-- pandas/tests/groupby/test_groupby.py | 2 +- pandas/tests/groupby/test_grouping.py | 8 +- pandas/tests/groupby/test_raises.py | 10 - pandas/tests/groupby/test_timegrouper.py | 2 +- pandas/tests/reshape/test_pivot.py | 4 +- 13 files changed, 259 insertions(+), 390 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 1bdba5a3e71fb..e92931d52093d 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -416,7 +416,6 @@ def _wrap_applied_output( # GH #823 #24880 index = self.grouper.result_index res_df = self.obj._constructor_expanddim(values, index=index) - res_df = self._reindex_output(res_df) # if self.observed is False, # keep all-NaN rows created while re-indexing res_ser = res_df.stack(future_stack=True) @@ -442,7 +441,7 @@ def _wrap_applied_output( if not self.as_index: result = self._insert_inaxis_grouper(result) result.index = default_index(len(result)) - return self._reindex_output(result) + return result def _aggregate_named(self, func, *args, **kwargs): # Note: this is very similar to _aggregate_series_pure_python, @@ -672,7 +671,7 @@ def nunique(self, dropna: bool = True) -> Series | DataFrame: 2023-02-01 1 Freq: MS, dtype: int64 """ - ids, _, _ = self.grouper.group_info + ids, _ = self.grouper.group_info val = self.obj._values @@ -721,7 +720,7 @@ def nunique(self, dropna: bool = True) -> Series | DataFrame: if not self.as_index: result = self._insert_inaxis_grouper(result) result.index = default_index(len(result)) - return self._reindex_output(result, fill_value=0) + return result @doc(Series.describe) def describe(self, percentiles=None, include=None, exclude=None) -> Series: @@ -749,7 +748,7 @@ def value_counts( from pandas.core.reshape.merge import get_join_indexers from pandas.core.reshape.tile import cut - ids, _, _ = self.grouper.group_info + ids, _ = self.grouper.group_info val = self.obj._values index_names = self.grouper.names + [self.obj.name] @@ -819,9 +818,18 @@ def value_counts( rep = partial(np.repeat, repeats=np.add.reduceat(inc, idx)) # multi-index components - codes = self.grouper.reconstructed_codes + if isinstance(self.grouper.result_index, MultiIndex): + codes = list(self.grouper.result_index.codes) + else: + codes = [ + algorithms.factorize( + self.grouper.result_index, + sort=self.grouper._sort, + use_na_sentinel=self.grouper.dropna, + )[0] + ] codes = [rep(level_codes) for level_codes in codes] + [llab(lab, inc)] - levels = [ping.group_index for ping in self.grouper.groupings] + [lev] + levels = self.grouper.levels + [lev] if dropna: mask = codes[-1] != -1 @@ -1686,7 +1694,7 @@ def _wrap_applied_output_series( if not self.as_index: result = self._insert_inaxis_grouper(result) - return self._reindex_output(result) + return result def _cython_transform( self, diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index b9b69d4ef0c87..77a140f78ccf9 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -54,7 +54,6 @@ class providing the base-class of operations. NDFrameT, PositionalIndexer, RandomState, - Scalar, T, npt, ) @@ -102,7 +101,6 @@ class providing the base-class of operations. 
from pandas.core.arrays import ( ArrowExtensionArray, BaseMaskedArray, - Categorical, ExtensionArray, FloatingArray, IntegerArray, @@ -131,7 +129,6 @@ class providing the base-class of operations. GroupByNthSelector, ) from pandas.core.indexes.api import ( - CategoricalIndex, Index, MultiIndex, RangeIndex, @@ -788,7 +785,7 @@ def __repr__(self) -> str: @final @property - def groups(self) -> dict[Hashable, np.ndarray]: + def groups(self) -> dict[Hashable, Index]: """ Dict {group name -> group labels}. @@ -1505,7 +1502,7 @@ def _set_result_index_ordered( return result # row order is scrambled => sort the rows by position in original index - original_positions = Index(self.grouper.result_ilocs()) + original_positions = Index(self.grouper.result_ilocs) result = result.set_axis(original_positions, axis=self.axis, copy=False) result = result.sort_index(axis=self.axis) if self.grouper.has_dropped_na: @@ -1599,7 +1596,7 @@ def _wrap_aggregated_output( # error: Argument 1 to "_maybe_transpose_result" of "GroupBy" has # incompatible type "Union[Series, DataFrame]"; expected "NDFrameT" res = self._maybe_transpose_result(result) # type: ignore[arg-type] - return self._reindex_output(res, qs=qs) + return res def _wrap_applied_output( self, @@ -1615,8 +1612,8 @@ def _wrap_applied_output( @final def _numba_prep(self, data: DataFrame): - ids, _, ngroups = self.grouper.group_info - sorted_index = self.grouper._sort_idx + ids, ngroups = self.grouper.group_info + sorted_index = self.grouper.result_ilocs sorted_ids = self.grouper._sorted_ids sorted_data = data.take(sorted_index, axis=self.axis).to_numpy() @@ -1669,7 +1666,7 @@ def _numba_agg_general( ) # Pass group ids to kernel directly if it can handle it # (This is faster since it doesn't require a sort) - ids, _, _ = self.grouper.group_info + ids, _ = self.grouper.group_info ngroups = self.grouper.ngroups res_mgr = df._mgr.apply( @@ -2043,7 +2040,7 @@ def _wrap_transform_fast_result(self, result: NDFrameT) -> NDFrameT: obj = self._obj_with_exclusions # for each col, reshape to size of original frame by take operation - ids, _, _ = self.grouper.group_info + ids = self.grouper.ids result = result.reindex(self.grouper.result_index, axis=self.axis, copy=False) if self.obj.ndim == 1: @@ -2096,7 +2093,7 @@ def _cumcount_array(self, ascending: bool = True) -> np.ndarray: this is currently implementing sort=False (though the default is sort=True) for groupby in general """ - ids, _, ngroups = self.grouper.group_info + ids, ngroups = self.grouper.group_info sorter = get_group_index_sorter(ids, ngroups) ids, count = ids[sorter], len(ids) @@ -2305,7 +2302,7 @@ def count(self) -> NDFrameT: Freq: MS, dtype: int64 """ data = self._get_data_to_aggregate() - ids, _, ngroups = self.grouper.group_info + ids, ngroups = self.grouper.group_info mask = ids != -1 is_series = data.ndim == 1 @@ -2335,15 +2332,9 @@ def hfunc(bvalues: ArrayLike) -> ArrayLike: new_mgr = data.grouped_reduce(hfunc) new_obj = self._wrap_agged_manager(new_mgr) + result = self._wrap_aggregated_output(new_obj) - # If we are grouping on categoricals we want unobserved categories to - # return zero, rather than the default of NaN which the reindexing in - # _wrap_aggregated_output() returns. GH 35028 - # e.g. 
test_dataframe_groupby_on_2_categoricals_when_observed_is_false - with com.temp_setattr(self, "observed", True): - result = self._wrap_aggregated_output(new_obj) - - return self._reindex_output(result, fill_value=0) + return result @final @Substitution(name="groupby") @@ -2813,18 +2804,20 @@ def _value_counts( result_series = cast(Series, gb.size()) result_series.name = name - # GH-46357 Include non-observed categories - # of non-grouping columns regardless of `observed` - if any( - isinstance(grouping.grouping_vector, (Categorical, CategoricalIndex)) - and not grouping._observed - for grouping in groupings - ): - levels_list = [ping.result_index for ping in groupings] - multi_index, _ = MultiIndex.from_product( - levels_list, names=[ping.name for ping in groupings] - ).sortlevel() - result_series = result_series.reindex(multi_index, fill_value=0) + if sort: + # Sort the values and then resort by the main grouping + # TODO: HACK - sort_index gets confused if index names are integers + names = result_series.index.names + result_series.index.names = range(len(names)) + index_level = list(range(len(self.grouper.groupings))) + result_series = result_series.sort_values( + ascending=ascending, kind="stable" + ) + if self.sort: + result_series = result_series.sort_index( + level=index_level, sort_remaining=False + ) + result_series.index.names = names if normalize: # Normalize the results by dividing by the original group sizes. @@ -2845,13 +2838,6 @@ def _value_counts( # Handle groups of non-observed categories result_series = result_series.fillna(0.0) - if sort: - # Sort the values and then resort by the main grouping - index_level = range(len(self.grouper.groupings)) - result_series = result_series.sort_values(ascending=ascending).sort_index( - level=index_level, sort_remaining=False - ) - result: Series | DataFrame if self.as_index: result = result_series @@ -3043,10 +3029,6 @@ def size(self) -> DataFrame | Series: dtype_backend=dtype_backend, ) - with com.temp_setattr(self, "as_index", True): - # size already has the desired behavior in GH#49519, but this makes the - # as_index=False path of _reindex_output fail on categorical groupers. 
- result = self._reindex_output(result, fill_value=0) if not self.as_index: # error: Incompatible types in assignment (expression has # type "DataFrame", variable has type "Series") @@ -3124,7 +3106,7 @@ def sum( npfunc=np.sum, ) - return self._reindex_output(result, fill_value=0) + return result @final @doc( @@ -3522,7 +3504,7 @@ def ohlc(self) -> DataFrame: result = self.obj._constructor_expanddim( res_values, index=self.grouper.result_index, columns=agg_names ) - return self._reindex_output(result) + return result result = self._apply_to_column_groupbys(lambda sgb: sgb.ohlc()) return result @@ -3907,7 +3889,7 @@ def _fill(self, direction: Literal["ffill", "bfill"], limit: int | None = None): if limit is None: limit = -1 - ids, _, _ = self.grouper.group_info + ids, _ = self.grouper.group_info sorted_labels = np.argsort(ids, kind="mergesort").astype(np.intp, copy=False) if direction == "bfill": sorted_labels = sorted_labels[::-1] @@ -4238,7 +4220,7 @@ def _nth( if not dropna: mask = self._make_mask_from_positional_indexer(n) - ids, _, _ = self.grouper.group_info + ids, _ = self.grouper.group_info # Drop NA values in grouping mask = mask & (ids != -1) @@ -4449,7 +4431,10 @@ def post_processor( qs = np.array([q], dtype=np.float64) pass_qs = None - ids, _, ngroups = self.grouper.group_info + ids, ngroups = self.grouper.group_info + if self.dropna: + # splitter drops NA groups, we need to do the same + ids = ids[ids >= 0] nqs = len(qs) func = partial( @@ -5169,7 +5154,7 @@ def shift( else: if fill_value is lib.no_default: fill_value = None - ids, _, ngroups = self.grouper.group_info + ids, ngroups = self.grouper.group_info res_indexer = np.zeros(len(ids), dtype=np.int64) libgroupby.group_shift_indexer(res_indexer, ids, ngroups, period) @@ -5516,104 +5501,6 @@ def _mask_selected_obj(self, mask: npt.NDArray[np.bool_]) -> NDFrameT: else: return self._selected_obj.iloc[:, mask] - @final - def _reindex_output( - self, - output: OutputFrameOrSeries, - fill_value: Scalar = np.nan, - qs: npt.NDArray[np.float64] | None = None, - ) -> OutputFrameOrSeries: - """ - If we have categorical groupers, then we might want to make sure that - we have a fully re-indexed output to the levels. This means expanding - the output space to accommodate all values in the cartesian product of - our groups, regardless of whether they were observed in the data or - not. This will expand the output space if there are missing groups. - - The method returns early without modifying the input if the number of - groupings is less than 2, self.observed == True or none of the groupers - are categorical. - - Parameters - ---------- - output : Series or DataFrame - Object resulting from grouping and applying an operation. - fill_value : scalar, default np.nan - Value to use for unobserved categories if self.observed is False. - qs : np.ndarray[float64] or None, default None - quantile values, only relevant for quantile. - - Returns - ------- - Series or DataFrame - Object (potentially) re-indexed to include all possible groups. 
- """ - groupings = self.grouper.groupings - if len(groupings) == 1: - return output - - # if we only care about the observed values - # we are done - elif self.observed: - return output - - # reindexing only applies to a Categorical grouper - elif not any( - isinstance(ping.grouping_vector, (Categorical, CategoricalIndex)) - for ping in groupings - ): - return output - - levels_list = [ping.group_index for ping in groupings] - names = self.grouper.names - if qs is not None: - # error: Argument 1 to "append" of "list" has incompatible type - # "ndarray[Any, dtype[floating[_64Bit]]]"; expected "Index" - levels_list.append(qs) # type: ignore[arg-type] - names = names + [None] - index = MultiIndex.from_product(levels_list, names=names) - if self.sort: - index = index.sort_values() - - if self.as_index: - # Always holds for SeriesGroupBy unless GH#36507 is implemented - d = { - self.obj._get_axis_name(self.axis): index, - "copy": False, - "fill_value": fill_value, - } - return output.reindex(**d) # type: ignore[arg-type] - - # GH 13204 - # Here, the categorical in-axis groupers, which need to be fully - # expanded, are columns in `output`. An idea is to do: - # output = output.set_index(self.grouper.names) - # .reindex(index).reset_index() - # but special care has to be taken because of possible not-in-axis - # groupers. - # So, we manually select and drop the in-axis grouper columns, - # reindex `output`, and then reset the in-axis grouper columns. - - # Select in-axis groupers - in_axis_grps = [ - (i, ping.name) for (i, ping) in enumerate(groupings) if ping.in_axis - ] - if len(in_axis_grps) > 0: - g_nums, g_names = zip(*in_axis_grps) - output = output.drop(labels=list(g_names), axis=1) - - # Set a temp index and reindex (possibly expanding) - output = output.set_index(self.grouper.result_index).reindex( - index, copy=False, fill_value=fill_value - ) - - # Reset in-axis grouper columns - # (using level numbers `g_nums` because level names may not be unique) - if len(in_axis_grps) > 0: - output = output.reset_index(level=g_nums) - - return output.reset_index(drop=True) - @final def sample( self, @@ -5785,14 +5672,10 @@ def _idxmax_idxmin( if not self.observed and any( ping._passed_categorical for ping in self.grouper.groupings ): - expected_len = np.prod( - [len(ping.group_index) for ping in self.grouper.groupings] - ) - if len(self.grouper.groupings) == 1: - result_len = len(self.grouper.groupings[0].grouping_vector.unique()) - else: - # result_index only contains observed groups in this case - result_len = len(self.grouper.result_index) + expected_len = len(self.grouper.result_index) + # TODO: Better way to find # of observed groups? 
+ group_sizes = self.grouper.size() + result_len = group_sizes[group_sizes > 0].shape[0] assert result_len <= expected_len has_unobserved = result_len < expected_len diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 06e6755079a22..68aef3e98ac7c 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -36,7 +36,6 @@ from pandas.core.groupby import ops from pandas.core.groupby.categorical import recode_for_groupby from pandas.core.indexes.api import ( - CategoricalIndex, Index, MultiIndex, ) @@ -676,7 +675,7 @@ def _ilevel(self) -> int | None: @property def ngroups(self) -> int: - return len(self.group_index) + return len(self._codes_and_uniques[1]) @cache_readonly def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]: @@ -691,56 +690,10 @@ def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]: def codes(self) -> npt.NDArray[np.signedinteger]: return self._codes_and_uniques[0] - @cache_readonly - def group_arraylike(self) -> ArrayLike: - """ - Analogous to result_index, but holding an ArrayLike to ensure - we can retain ExtensionDtypes. - """ - if self._all_grouper is not None: - # retain dtype for categories, including unobserved ones - return self.result_index._values - - elif self._passed_categorical: - return self.group_index._values - + @property + def uniques(self) -> ArrayLike: return self._codes_and_uniques[1] - @cache_readonly - def result_index(self) -> Index: - # result_index retains dtype for categories, including unobserved ones, - # which group_index does not - if self._all_grouper is not None: - group_idx = self.group_index - assert isinstance(group_idx, CategoricalIndex) - cats = self._orig_cats - # set_categories is dynamically added - return group_idx.set_categories(cats) # type: ignore[attr-defined] - return self.group_index - - @cache_readonly - def group_index(self) -> Index: - codes, uniques = self._codes_and_uniques - if not self._dropna and self._passed_categorical: - assert isinstance(uniques, Categorical) - if self._sort and (codes == len(uniques)).any(): - # Add NA value on the end when sorting - uniques = Categorical.from_codes( - np.append(uniques.codes, [-1]), uniques.categories, validate=False - ) - elif len(codes) > 0: - # Need to determine proper placement of NA value when not sorting - cat = self.grouping_vector - na_idx = (cat.codes < 0).argmax() - if cat.codes[na_idx] < 0: - # count number of unique codes that comes before the nan value - na_unique_idx = algorithms.nunique_ints(cat.codes[:na_idx]) - new_codes = np.insert(uniques.codes, na_unique_idx, -1) - uniques = Categorical.from_codes( - new_codes, uniques.categories, validate=False - ) - return Index._with_infer(uniques, name=self.name) - @cache_readonly def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]: uniques: ArrayLike @@ -759,30 +712,33 @@ def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]: else: ucodes = np.arange(len(categories)) - uniques = Categorical.from_codes( - codes=ucodes, categories=categories, ordered=cat.ordered, validate=False - ) - - codes = cat.codes if not self._dropna: - na_mask = codes < 0 + na_mask = cat.codes < 0 if np.any(na_mask): if self._sort: # Replace NA codes with `largest code + 1` na_code = len(categories) - codes = np.where(na_mask, na_code, codes) else: # Insert NA code into the codes based on first appearance # A negative code must exist, no need to check codes[na_idx] < 0 na_idx = na_mask.argmax() # count number of unique codes that comes 
before the nan value - na_code = algorithms.nunique_ints(codes[:na_idx]) + na_code = algorithms.nunique_ints(cat.codes[:na_idx]) + ucodes = np.insert(ucodes, na_code, -1) + + uniques = Categorical.from_codes( + codes=ucodes, categories=categories, ordered=cat.ordered, validate=False + ) + codes = cat.codes + if not self._dropna: + na_mask = codes < 0 + if np.any(na_mask): + if self._sort: + codes = np.where(na_mask, na_code, codes) + else: codes = np.where(codes >= na_code, codes + 1, codes) codes = np.where(na_mask, na_code, codes) - if not self._observed: - uniques = uniques.reorder_categories(self._orig_cats) - return codes, uniques elif isinstance(self.grouping_vector, ops.BaseGrouper): @@ -805,8 +761,10 @@ def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]: return codes, uniques @cache_readonly - def groups(self) -> dict[Hashable, np.ndarray]: - cats = Categorical.from_codes(self.codes, self.group_index, validate=False) + def groups(self) -> dict[Hashable, Index]: + codes, uniques = self._codes_and_uniques + uniques = Index._with_infer(uniques, name=self.name) + cats = Categorical.from_codes(codes, uniques, validate=False) return self._index.groupby(cats) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 466bbac641077..85f694be62b59 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -50,6 +50,7 @@ maybe_fill, ) +from pandas.core.arrays import Categorical from pandas.core.frame import DataFrame from pandas.core.groupby import grouper from pandas.core.indexes.api import ( @@ -62,7 +63,6 @@ from pandas.core.sorting import ( compress_group_index, decons_obs_group_ids, - get_flattened_list, get_group_index, get_group_index_sorter, get_indexer_dict, @@ -616,7 +616,8 @@ def get_iterator( for each group """ splitter = self._get_splitter(data, axis=axis) - keys = self.group_keys_seq + # TODO: Would be more efficient to skip unobserved for transforms + keys = self.result_index yield from zip(keys, splitter) @final @@ -626,27 +627,16 @@ def _get_splitter(self, data: NDFrame, axis: AxisInt = 0) -> DataSplitter: ------- Generator yielding subsetted objects """ - ids, _, ngroups = self.group_info + ids, ngroups = self.group_info return _get_splitter( data, ids, ngroups, sorted_ids=self._sorted_ids, - sort_idx=self._sort_idx, + sort_idx=self.result_ilocs, axis=axis, ) - @final - @cache_readonly - def group_keys_seq(self): - if len(self.groupings) == 1: - return self.levels[0] - else: - ids, _, ngroups = self.group_info - - # provide "flattened" iterator for multi-group setting - return get_flattened_list(ids, ngroups, self.levels, self.codes) - @cache_readonly def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]: """dict {group name -> group indices}""" @@ -654,10 +644,10 @@ def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]: # This shows unused categories in indices GH#38642 return self.groupings[0].indices codes_list = [ping.codes for ping in self.groupings] - keys = [ping.group_index for ping in self.groupings] - return get_indexer_dict(codes_list, keys) + return get_indexer_dict(codes_list, self.levels) @final + @cache_readonly def result_ilocs(self) -> npt.NDArray[np.intp]: """ Get the original integer locations of result_index in the input. @@ -665,18 +655,15 @@ def result_ilocs(self) -> npt.NDArray[np.intp]: # Original indices are where group_index would go via sorting. # But when dropna is true, we need to remove null values while accounting for # any gaps that then occur because of them. 
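# A pure-NumPy sketch of the comment above (an illustration, not the pandas
# internals): with no dropped NA groups, the original positions are just a
# stable argsort of the group ids, making same-group rows contiguous in
# group order.
import numpy as np

ids = np.array([1, 0, 1, 0, 2])        # hypothetical group id per input row
print(np.argsort(ids, kind="stable"))  # [1 3 0 2 4]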
- group_index = get_group_index( - self.codes, self.shape, sort=self._sort, xnull=True - ) - group_index, _ = compress_group_index(group_index, sort=self._sort) + ids = self.ids if self.has_dropped_na: - mask = np.where(group_index >= 0) + mask = np.where(ids >= 0) # Count how many gaps are caused by previous null values for each position - null_gaps = np.cumsum(group_index == -1)[mask] - group_index = group_index[mask] + null_gaps = np.cumsum(ids == -1)[mask] + ids = ids[mask] - result = get_group_index_sorter(group_index, self.ngroups) + result = get_group_index_sorter(ids, self.ngroups) if self.has_dropped_na: # Shift by the number of prior null gaps @@ -684,14 +671,17 @@ def result_ilocs(self) -> npt.NDArray[np.intp]: return result - @final @property def codes(self) -> list[npt.NDArray[np.signedinteger]]: return [ping.codes for ping in self.groupings] @property def levels(self) -> list[Index]: - return [ping.group_index for ping in self.groupings] + if len(self.groupings) > 1: + # mypy doesn't know result_index must be a MultiIndex + return list(self.result_index.levels) # type: ignore[attr-defined] + else: + return [self.result_index] @property def names(self) -> list[Hashable]: @@ -702,7 +692,7 @@ def size(self) -> Series: """ Compute group sizes. """ - ids, _, ngroups = self.group_info + ids, ngroups = self.group_info out: np.ndarray | list if ngroups: out = np.bincount(ids[ids != -1], minlength=ngroups) @@ -711,20 +701,24 @@ def size(self) -> Series: return Series(out, index=self.result_index, dtype="int64") @cache_readonly - def groups(self) -> dict[Hashable, np.ndarray]: + def groups(self) -> dict[Hashable, Index]: """dict {group name -> group labels}""" if len(self.groupings) == 1: return self.groupings[0].groups + if len(self.result_index) == 0: + index = self.result_index else: - to_groupby = [] - for ping in self.groupings: - gv = ping.grouping_vector - if not isinstance(gv, BaseGrouper): - to_groupby.append(gv) - else: - to_groupby.append(gv.groupings[0].grouping_vector) - index = MultiIndex.from_arrays(to_groupby) - return self.axis.groupby(index) + index = self.result_index.take(self.ids) + categories = ( + self.result_index._values + if isinstance(self.result_index, MultiIndex) + else self.result_index + ) + values = index._values if isinstance(index, MultiIndex) else index + cats = Categorical(values, categories) + result = {k: self.axis.take(v) for k, v in cats._reverse_indexer().items()} + + return result @final @cache_readonly @@ -738,73 +732,132 @@ def has_dropped_na(self) -> bool: """ Whether grouper has null value(s) that are dropped. 
""" - return bool((self.group_info[0] < 0).any()) + return bool((self.ids < 0).any()) @cache_readonly - def group_info(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]: - comp_ids, obs_group_ids = self._get_compressed_codes() - - ngroups = len(obs_group_ids) - comp_ids = ensure_platform_int(comp_ids) - - return comp_ids, obs_group_ids, ngroups + def group_info(self) -> tuple[npt.NDArray[np.intp], int]: + result_index, ids = self.result_index_and_ids + ngroups = len(result_index) + return ids, ngroups @cache_readonly def codes_info(self) -> npt.NDArray[np.intp]: # return the codes of items in original grouped axis - ids, _, _ = self.group_info + ids, _ = self.group_info return ids - @final - def _get_compressed_codes( - self, - ) -> tuple[npt.NDArray[np.signedinteger], npt.NDArray[np.intp]]: - # The first returned ndarray may have any signed integer dtype - if len(self.groupings) > 1: - group_index = get_group_index(self.codes, self.shape, sort=True, xnull=True) - return compress_group_index(group_index, sort=self._sort) - # FIXME: compress_group_index's second return value is int64, not intp - - ping = self.groupings[0] - return ping.codes, np.arange(len(ping.group_index), dtype=np.intp) - @final @cache_readonly def ngroups(self) -> int: return len(self.result_index) @property - def reconstructed_codes(self) -> list[npt.NDArray[np.intp]]: - codes = self.codes - ids, obs_ids, _ = self.group_info - return decons_obs_group_ids(ids, obs_ids, self.shape, codes, xnull=True) + def result_index(self) -> Index: + return self.result_index_and_ids[0] + + @property + def ids(self) -> np.ndarray: + return self.result_index_and_ids[1] @cache_readonly - def result_index(self) -> Index: + def result_index_and_ids(self) -> tuple[Index, np.ndarray]: + names = self.names + codes = [ping.codes for ping in self.groupings] + levels = [Index._with_infer(ping.uniques) for ping in self.groupings] + obs = [ + ping._observed or not ping._passed_categorical for ping in self.groupings + ] + # When passed a categorical grouping, keep all categories + for k, (ping, level) in enumerate(zip(self.groupings, levels)): + if ping._passed_categorical: + levels[k] = level.set_categories(ping._orig_cats) + if len(self.groupings) == 1: - return self.groupings[0].result_index.rename(self.names[0]) + result_index = levels[0] + result_index.name = names[0] + ids = ensure_platform_int(codes[0]) + return result_index, ids + + if any(obs): + ob_codes = [code for code, ob in zip(codes, obs) if ob] + ob_levels = [level for level, ob in zip(levels, obs) if ob] + ob_names = [name for name, ob in zip(names, obs) if ob] + + shape = tuple(len(level) for level in ob_levels) + group_index = get_group_index(ob_codes, shape, sort=True, xnull=True) + ob_ids, obs_group_ids = compress_group_index(group_index, sort=self._sort) + ob_ids = ensure_platform_int(ob_ids) + ob_index_codes = decons_obs_group_ids( + ob_ids, obs_group_ids, shape, ob_codes, xnull=True + ) + ob_index = MultiIndex( + levels=ob_levels, + codes=ob_index_codes, + names=ob_names, + verify_integrity=False, + ) - codes = self.reconstructed_codes - levels = [ping.result_index for ping in self.groupings] - return MultiIndex( - levels=levels, codes=codes, verify_integrity=False, names=self.names - ) + if not all(obs): + unob_codes = [e for e, o in zip(codes, obs) if not o] + unob_levels = [e for e, o in zip(levels, obs) if not o] + unob_names = [e for e, o in zip(names, obs) if not o] + + shape = tuple(len(level) for level in unob_levels) + unob_ids = 
get_group_index(unob_codes, shape, sort=True, xnull=True) + unob_index = MultiIndex.from_product(unob_levels, names=unob_names) + + if all(obs): + result_index = ob_index + ids = ensure_platform_int(ob_ids) + elif not any(obs): + result_index = unob_index + ids = ensure_platform_int(unob_ids) + else: + # Combine unobserved and observed parts of result_index + unob_indices = [k for k, e in enumerate(obs) if not e] + ob_indices = [k for k, e in enumerate(obs) if e] + result_index_codes = np.concatenate( + [ + np.tile(unob_index.codes, len(ob_index)), + np.repeat(ob_index.codes, len(unob_index), axis=1), + ], + axis=0, + ) + _, index = np.unique(unob_indices + ob_indices, return_index=True) + result_index = MultiIndex( + levels=list(unob_index.levels) + list(ob_index.levels), + codes=result_index_codes, + names=list(unob_index.names) + list(ob_index.names), + ).reorder_levels(index) + ids = len(unob_index) * ob_ids + unob_ids + + if self._sort: + sorter = result_index.argsort() + result_index = result_index.take(sorter) + _, index = np.unique(sorter, return_index=True) + ids = ensure_platform_int(ids) + ids = index.take(ids) + else: + ids, uniques = compress_group_index(ids, sort=False) + ids = ensure_platform_int(ids) + taker = np.concatenate( + [uniques, np.delete(np.arange(len(result_index)), uniques)] + ) + result_index = result_index.take(taker) + + return result_index, ids @final - def get_group_levels(self) -> list[ArrayLike]: + def get_group_levels(self) -> list[Index]: # Note: only called from _insert_inaxis_grouper, which # is only called for BaseGrouper, never for BinGrouper + result_index = self.result_index if len(self.groupings) == 1: - return [self.groupings[0].group_arraylike] - - name_list = [] - for ping, codes in zip(self.groupings, self.reconstructed_codes): - codes = ensure_platform_int(codes) - levels = ping.group_arraylike.take(codes) - - name_list.append(levels) - - return name_list + return [result_index] + return [ + result_index.get_level_values(level) + for level in range(result_index.nlevels) + ] # ------------------------------------------------------------ # Aggregation functions @@ -826,14 +879,12 @@ def _cython_operation( cy_op = WrappedCythonOp(kind=kind, how=how, has_dropped_na=self.has_dropped_na) - ids, _, _ = self.group_info - ngroups = self.ngroups return cy_op.cython_operation( values=values, axis=axis, min_count=min_count, - comp_ids=ids, - ngroups=ngroups, + comp_ids=self.ids, + ngroups=self.ngroups, **kwargs, ) @@ -881,7 +932,7 @@ def agg_series( def _aggregate_series_pure_python( self, obj: Series, func: Callable ) -> npt.NDArray[np.object_]: - _, _, ngroups = self.group_info + _, ngroups = self.group_info result = np.empty(ngroups, dtype="O") initialized = False @@ -907,7 +958,7 @@ def apply_groupwise( ) -> tuple[list, bool]: mutated = False splitter = self._get_splitter(data, axis=axis) - group_keys = self.group_keys_seq + group_keys = self.result_index result_values = [] # This calls DataSplitter.__iter__ @@ -943,18 +994,14 @@ def apply_groupwise( # ------------------------------------------------------------ # Methods for sorting subsets of our GroupBy's object - @final - @cache_readonly - def _sort_idx(self) -> npt.NDArray[np.intp]: - # Counting sort indexer - ids, _, ngroups = self.group_info - return get_group_index_sorter(ids, ngroups) - @final @cache_readonly def _sorted_ids(self) -> npt.NDArray[np.intp]: - ids, _, _ = self.group_info - return ids.take(self._sort_idx) + result = self.ids.take(self.result_ilocs) + if getattr(self, "dropna", 
True): + # BinGrouper has no dropna + result = result[result >= 0] + return result class BinGrouper(BaseGrouper): @@ -1025,7 +1072,7 @@ def nkeys(self) -> int: @cache_readonly def codes_info(self) -> npt.NDArray[np.intp]: # return the codes of items in original grouped axis - ids, _, _ = self.group_info + ids, _ = self.group_info if self.indexer is not None: sorter = np.lexsort((ids, self.indexer)) ids = ids[sorter] @@ -1069,9 +1116,8 @@ def indices(self): return indices @cache_readonly - def group_info(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]: + def group_info(self) -> tuple[npt.NDArray[np.intp], int]: ngroups = self.ngroups - obs_group_ids = np.arange(ngroups, dtype=np.intp) rep = np.diff(np.r_[0, self.bins]) rep = ensure_platform_int(rep) @@ -1080,16 +1126,7 @@ def group_info(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]: else: comp_ids = np.repeat(np.r_[-1, np.arange(ngroups)], rep) - return ( - ensure_platform_int(comp_ids), - obs_group_ids, - ngroups, - ) - - @cache_readonly - def reconstructed_codes(self) -> list[np.ndarray]: - # get unique result indices, and prepend 0 as groupby starts from the first - return [np.r_[0, np.flatnonzero(self.bins[1:] != self.bins[:-1]) + 1]] + return (ensure_platform_int(comp_ids), ngroups) @cache_readonly def result_index(self) -> Index: @@ -1098,6 +1135,14 @@ def result_index(self) -> Index: return self.binlabels + @cache_readonly + def codes(self) -> list[npt.NDArray[np.intp]]: + return [self.group_info[0]] + + @cache_readonly + def result_index_and_ids(self): + return self.result_index, self.group_info[0] + @property def levels(self) -> list[Index]: return [self.binlabels] diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index ebf4f2d515956..d8d40e01c2ead 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -6317,7 +6317,7 @@ def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: return True @final - def groupby(self, values) -> PrettyDict[Hashable, np.ndarray]: + def groupby(self, values) -> PrettyDict[Hashable, Index]: """ Group the index labels by a given array of values. 
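# A quick sketch of the mapping whose annotation the hunk above corrects: the
# values of `groups` are the row labels of each group, now typed as Index.
import pandas as pd

df = pd.DataFrame({"key": ["a", "a", "b"], "val": [1, 2, 3]})
# e.g. {'a': [0, 1], 'b': [2]}, each value an Index of row labels
print(df.groupby("key").groups)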
diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 79354fdd12a2d..1793ceea15f2a 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -717,6 +717,7 @@ def crosstab( margins=margins, margins_name=margins_name, dropna=dropna, + observed=dropna, **kwargs, # type: ignore[arg-type] ) diff --git a/pandas/tests/groupby/methods/test_value_counts.py b/pandas/tests/groupby/methods/test_value_counts.py index b82908ef2aa21..3760cbdc0ce1a 100644 --- a/pandas/tests/groupby/methods/test_value_counts.py +++ b/pandas/tests/groupby/methods/test_value_counts.py @@ -385,8 +385,8 @@ def test_against_frame_and_seriesgroupby( "sort, ascending, expected_rows, expected_count, expected_group_size", [ (False, None, [0, 1, 2, 3, 4], [1, 1, 1, 2, 1], [1, 3, 1, 3, 1]), - (True, False, [4, 3, 1, 2, 0], [1, 2, 1, 1, 1], [1, 3, 3, 1, 1]), - (True, True, [4, 1, 3, 2, 0], [1, 1, 2, 1, 1], [1, 3, 3, 1, 1]), + (True, False, [3, 0, 1, 2, 4], [2, 1, 1, 1, 1], [3, 1, 3, 1, 1]), + (True, True, [0, 1, 2, 4, 3], [1, 1, 1, 1, 2], [1, 3, 1, 1, 3]), ], ) def test_compound( @@ -617,7 +617,7 @@ def test_categorical_single_grouper_with_only_observed_categories( ) gp = education_df.astype("category").groupby( - "country", as_index=as_index, observed=observed + "country", as_index=as_index, observed=observed, sort=True ) result = gp.value_counts(normalize=normalize) @@ -811,19 +811,19 @@ def test_categorical_single_grouper_observed_false( ("FR", "female", "high"), ("FR", "male", "medium"), ("FR", "female", "low"), - ("FR", "male", "high"), ("FR", "female", "medium"), + ("FR", "male", "high"), ("US", "female", "high"), ("US", "male", "low"), - ("US", "male", "medium"), - ("US", "male", "high"), - ("US", "female", "medium"), ("US", "female", "low"), - ("ASIA", "male", "low"), - ("ASIA", "male", "high"), - ("ASIA", "female", "medium"), - ("ASIA", "female", "low"), + ("US", "female", "medium"), + ("US", "male", "high"), + ("US", "male", "medium"), ("ASIA", "female", "high"), + ("ASIA", "female", "low"), + ("ASIA", "female", "medium"), + ("ASIA", "male", "high"), + ("ASIA", "male", "low"), ("ASIA", "male", "medium"), ] diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 939dd176ae90e..064364047e03e 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -42,8 +42,8 @@ def f(a): # These expected values can be used across several tests (i.e. they are # the same for SeriesGroupBy and DataFrameGroupBy) but they should only be # hardcoded in one place. 
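# The replacement values just below follow from the reduction identities on
# empty input, which NumPy mirrors; a quick check:
import numpy as np

print(np.all([]))   # True  -> "all" over an unobserved (empty) group
print(np.any([]))   # False -> "any" over an unobserved (empty) group
print(np.prod([]))  # 1.0   -> "prod" identity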
- "all": np.nan, - "any": np.nan, + "all": True, + "any": False, "count": 0, "corrwith": np.nan, "first": np.nan, @@ -56,7 +56,7 @@ def f(a): "min": np.nan, "nth": np.nan, "nunique": 0, - "prod": np.nan, + "prod": 1, "quantile": np.nan, "sem": np.nan, "size": 0, @@ -1275,11 +1275,7 @@ def test_seriesgroupby_observed_false_or_none(df_cat, observed, operation): names=["A", "B"], ).sortlevel() - expected = Series(data=[2, 4, np.nan, 1, np.nan, 3], index=index, name="C") - if operation == "agg": - msg = "The 'downcast' keyword in fillna is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - expected = expected.fillna(0, downcast="infer") + expected = Series(data=[2, 4, 0, 1, 0, 3], index=index, name="C") grouped = df_cat.groupby(["A", "B"], observed=observed)["C"] msg = "using SeriesGroupBy.sum" if operation == "agg" else "using np.sum" with tm.assert_produces_warning(FutureWarning, match=msg): @@ -1468,18 +1464,21 @@ def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans( result = agg(*args) - zero_or_nan = _results_for_groupbys_with_missing_categories[reduction_func] + missing_fillin = _results_for_groupbys_with_missing_categories[reduction_func] for idx in unobserved: val = result.loc[idx] - assert (pd.isna(zero_or_nan) and pd.isna(val)) or (val == zero_or_nan) + assert (pd.isna(missing_fillin) and pd.isna(val)) or (val == missing_fillin) # If we expect unobserved values to be zero, we also expect the dtype to be int. # Except for .sum(). If the observed categories sum to dtype=float (i.e. their # sums have decimals), then the zeros for the missing categories should also be # floats. - if zero_or_nan == 0 and reduction_func != "sum": - assert np.issubdtype(result.dtype, np.integer) + if missing_fillin == 0: + if reduction_func in ["count", "nunique", "size"]: + assert np.issubdtype(result.dtype, np.integer) + else: + assert reduction_func in ["sum", "any"] def test_dataframe_groupby_on_2_categoricals_when_observed_is_true(reduction_func): @@ -2124,15 +2123,6 @@ def test_agg_list(request, as_index, observed, reduction_func, test_series, keys elif reduction_func == "corrwith": msg = "GH#32293: attempts to call SeriesGroupBy.corrwith" request.applymarker(pytest.mark.xfail(reason=msg)) - elif ( - reduction_func == "nunique" - and not test_series - and len(keys) != 1 - and not observed - and not as_index - ): - msg = "GH#52848 - raises a ValueError" - request.applymarker(pytest.mark.xfail(reason=msg)) df = DataFrame({"a1": [0, 0, 1], "a2": [2, 3, 3], "b": [4, 5, 6]}) df = df.astype({"a1": "category", "a2": "category"}) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 4a8a0851d2e42..50dc0589c43c8 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -320,7 +320,7 @@ def test_len(): df = DataFrame({"a": [np.nan] * 3, "b": [1, 2, 3]}) assert len(df.groupby("a")) == 0 assert len(df.groupby("b")) == 3 - assert len(df.groupby(["a", "b"])) == 3 + assert len(df.groupby(["a", "b"])) == 0 def test_basic_regression(): diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 8c2b95ba631ee..ac4d57eeb28b8 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -725,7 +725,7 @@ def test_level_preserve_order(self, sort, labels, mframe): # GH 17537 grouped = mframe.groupby(level=0, sort=sort) exp_labels = np.array(labels, np.intp) - tm.assert_almost_equal(grouped.grouper.codes[0], exp_labels) + 
diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py
index 8c2b95ba631ee..ac4d57eeb28b8 100644
--- a/pandas/tests/groupby/test_grouping.py
+++ b/pandas/tests/groupby/test_grouping.py
@@ -725,7 +725,7 @@ def test_level_preserve_order(self, sort, labels, mframe):
         # GH 17537
         grouped = mframe.groupby(level=0, sort=sort)
         exp_labels = np.array(labels, np.intp)
-        tm.assert_almost_equal(grouped.grouper.codes[0], exp_labels)
+        tm.assert_almost_equal(grouped.grouper.ids, exp_labels)

     def test_grouping_labels(self, mframe):
         grouped = mframe.groupby(mframe.index.get_level_values(0))
@@ -794,11 +794,7 @@ def test_groupby_empty(self):
             gr.grouper.group_info[0], np.array([], dtype=np.dtype(np.intp))
         )

-        tm.assert_numpy_array_equal(
-            gr.grouper.group_info[1], np.array([], dtype=np.dtype(np.intp))
-        )
-
-        assert gr.grouper.group_info[2] == 0
+        assert gr.grouper.group_info[1] == 0

         # check name
         assert s.groupby(s).grouper.names == ["name"]
diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py
index 0f4a73e4e2a38..45a527cf7813d 100644
--- a/pandas/tests/groupby/test_raises.py
+++ b/pandas/tests/groupby/test_raises.py
@@ -569,16 +569,6 @@ def test_groupby_raises_category_on_category(
         return

     empty_groups = not observed and any(group.empty for group in gb.groups.values())
-    if (
-        not observed
-        and how != "transform"
-        and isinstance(by, list)
-        and isinstance(by[0], str)
-        and by == ["a", "b"]
-    ):
-        assert not empty_groups
-        # TODO: empty_groups should be true due to unobserved categorical combinations
-        empty_groups = True
     if how == "transform":
         # empty groups will be ignored
         empty_groups = False
diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py
index 48c51cdfab4e4..f392253ac310f 100644
--- a/pandas/tests/groupby/test_timegrouper.py
+++ b/pandas/tests/groupby/test_timegrouper.py
@@ -67,7 +67,7 @@ def groupby_with_truncated_bingrouper(frame_for_truncated_bingrouper):
     gb = df.groupby(tdg)

     # check we're testing the case we're interested in
-    assert len(gb.grouper.result_index) != len(gb.grouper.group_keys_seq)
+    assert len(gb.grouper.result_index) != len(gb.grouper.codes)

     return gb

diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py
index 1f97d7cb605cf..c546b260a8dc8 100644
--- a/pandas/tests/reshape/test_pivot.py
+++ b/pandas/tests/reshape/test_pivot.py
@@ -261,9 +261,7 @@ def test_pivot_with_non_observable_dropna(self, dropna):
         expected = DataFrame(
             {"B": values},
             index=Index(
-                Categorical.from_codes(
-                    codes, categories=["low", "high"], ordered=dropna
-                ),
+                Categorical.from_codes(codes, categories=["low", "high"], ordered=True),
                 name="A",
             ),
        )
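
The next commit adds a regression test for GH#52848. The invariant it encodes, sketched directly (data as in the test itself; this assumes a build containing these changes): with multiple categorical groupings, an as_index=False reduction should match the as_index=True result after reset_index:

    import pandas as pd

    df = pd.DataFrame({"a1": [0, 0, 1], "a2": [2, 3, 3], "b": [4, 5, 6]})
    df = df.astype({"a1": "category", "a2": "category"})
    left = df.groupby(["a1", "a2"], as_index=False, observed=False).sum()
    right = (
        df.groupby(["a1", "a2"], as_index=True, observed=False).sum().reset_index()
    )
    pd.testing.assert_frame_equal(left, right)
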
From 700f40fe791ad8074c4ac73865bdf02b7cab7be4 Mon Sep 17 00:00:00 2001
From: richard
Date: Fri, 17 Nov 2023 18:36:45 -0500
Subject: [PATCH 18/31] Add test

---
 pandas/tests/groupby/test_categorical.py | 27 ++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py
index ab0442130e793..714a07e3cb188 100644
--- a/pandas/tests/groupby/test_categorical.py
+++ b/pandas/tests/groupby/test_categorical.py
@@ -2156,3 +2156,30 @@ def test_agg_list(request, as_index, observed, reduction_func, test_series, keys
         expected.columns = keys + [reduction_func]

     tm.assert_equal(result, expected)
+
+
+def test_reduction_as_index_false_multiple_groupings(observed, reduction_func):
+    # GH#52848
+    if reduction_func in ["corrwith"]:
+        pytest.skip("corrwith doesn't fit the format of this test")
+    df = DataFrame({"a1": [0, 0, 1], "a2": [2, 3, 3], "b": [4, 5, 6]})
+    df = df.astype({"a1": "category", "a2": "category"})
+    gb = df.groupby(by=["a1", "a2"], as_index=False, observed=observed)
+    args = get_groupby_method_args(reduction_func, df)
+
+    if reduction_func in ["idxmin", "idxmax"] and not observed:
+        msg = (
+            f"Can't get {reduction_func} of an empty group due to unobserved categories"
+        )
+        with pytest.raises(ValueError, match=msg):
+            getattr(gb, reduction_func)(*args)
+        return
+
+    result = getattr(gb, reduction_func)(*args)
+
+    gb_as_index = df.groupby(by=["a1", "a2"], as_index=True, observed=observed)
+    expected = getattr(gb_as_index, reduction_func)(*args).reset_index()
+    if reduction_func == "size":
+        expected = expected.rename(columns={0: "size"})
+
+    tm.assert_frame_equal(result, expected)

From efd20c722745e1c6c9546ae45f5c0d5ea6593e65 Mon Sep 17 00:00:00 2001
From: richard
Date: Fri, 17 Nov 2023 19:50:26 -0500
Subject: [PATCH 19/31] Remove test

---
 pandas/tests/groupby/test_categorical.py | 27 ------------------------
 1 file changed, 27 deletions(-)

diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py
index 714a07e3cb188..ab0442130e793 100644
--- a/pandas/tests/groupby/test_categorical.py
+++ b/pandas/tests/groupby/test_categorical.py
@@ -2156,30 +2156,3 @@ def test_agg_list(request, as_index, observed, reduction_func, test_series, keys
         expected.columns = keys + [reduction_func]

     tm.assert_equal(result, expected)
-
-
-def test_reduction_as_index_false_multiple_groupings(observed, reduction_func):
-    # GH#52848
-    if reduction_func in ["corrwith"]:
-        pytest.skip("corrwith doesn't fit the format of this test")
-    df = DataFrame({"a1": [0, 0, 1], "a2": [2, 3, 3], "b": [4, 5, 6]})
-    df = df.astype({"a1": "category", "a2": "category"})
-    gb = df.groupby(by=["a1", "a2"], as_index=False, observed=observed)
-    args = get_groupby_method_args(reduction_func, df)
-
-    if reduction_func in ["idxmin", "idxmax"] and not observed:
-        msg = (
-            f"Can't get {reduction_func} of an empty group due to unobserved categories"
-        )
-        with pytest.raises(ValueError, match=msg):
-            getattr(gb, reduction_func)(*args)
-        return
-
-    result = getattr(gb, reduction_func)(*args)
-
-    gb_as_index = df.groupby(by=["a1", "a2"], as_index=True, observed=observed)
-    expected = getattr(gb_as_index, reduction_func)(*args).reset_index()
-    if reduction_func == "size":
-        expected = expected.rename(columns={0: "size"})
-
-    tm.assert_frame_equal(result, expected)

From 9dac2977095595d463391eda7273842a92eaf9da Mon Sep 17 00:00:00 2001
From: richard
Date: Fri, 17 Nov 2023 20:37:26 -0500
Subject: [PATCH 20/31] Move unobserved to the end

---
 pandas/tests/groupby/methods/test_value_counts.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/tests/groupby/methods/test_value_counts.py b/pandas/tests/groupby/methods/test_value_counts.py
index b628358c1af29..d118065ae164e 100644
--- a/pandas/tests/groupby/methods/test_value_counts.py
+++ b/pandas/tests/groupby/methods/test_value_counts.py
@@ -1235,7 +1235,7 @@ def test_value_counts_sort_categorical(sort, vc_sort, normalize):
     elif not sort and vc_sort:
         taker = [0, 2, 1, 3]
     else:
-        taker = [2, 3, 0, 1]
+        taker = [2, 1, 0, 3]
     expected = expected.take(taker)

     tm.assert_series_equal(result, expected)
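
Before the next cleanup commit: the ngroups simplification it contains leans on the fact that the number of groups equals the length of the uniques array produced by factorization. A sketch with assumed inputs:

    import numpy as np
    import pandas as pd

    codes, uniques = pd.factorize(np.array(["b", "a", "b"]), sort=True)
    print(codes)         # [1 0 1]
    print(len(uniques))  # 2 == ngroups
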
From 26da0b845b82404c1c7b4940352ceb462770d774 Mon Sep 17 00:00:00 2001
From: Richard Shadrach
Date: Sat, 18 Nov 2023 06:42:12 -0500
Subject: [PATCH 21/31] cleanup

---
 pandas/core/groupby/grouper.py                    | 2 +-
 pandas/tests/groupby/methods/test_value_counts.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py
index aa63ce7d9516e..58167c07f4d92 100644
--- a/pandas/core/groupby/grouper.py
+++ b/pandas/core/groupby/grouper.py
@@ -678,7 +678,7 @@ def _ilevel(self) -> int | None:

     @property
     def ngroups(self) -> int:
-        return len(self._codes_and_uniques[1])
+        return len(self.uniques)

     @cache_readonly
     def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]:
diff --git a/pandas/tests/groupby/methods/test_value_counts.py b/pandas/tests/groupby/methods/test_value_counts.py
index d118065ae164e..d545915360696 100644
--- a/pandas/tests/groupby/methods/test_value_counts.py
+++ b/pandas/tests/groupby/methods/test_value_counts.py
@@ -617,7 +617,7 @@ def test_categorical_single_grouper_with_only_observed_categories(
     )

     gp = education_df.astype("category").groupby(
-        "country", as_index=as_index, observed=observed, sort=True
+        "country", as_index=as_index, observed=observed
     )
     result = gp.value_counts(normalize=normalize)

From 0a6b63a8bdc247f031c84df7db2bb7b6eca32aed Mon Sep 17 00:00:00 2001
From: Richard Shadrach
Date: Sat, 18 Nov 2023 06:59:19 -0500
Subject: [PATCH 22/31] cleanup

---
 pandas/core/groupby/grouper.py | 23 +++++++++++------------
 1 file changed, 11 insertions(+), 12 deletions(-)

diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py
index 58167c07f4d92..158f9917559bf 100644
--- a/pandas/core/groupby/grouper.py
+++ b/pandas/core/groupby/grouper.py
@@ -715,17 +715,18 @@ def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
             else:
                 ucodes = np.arange(len(categories))

+            has_dropped_na = False
             if not self._dropna:
                 na_mask = cat.codes < 0
                 if np.any(na_mask):
+                    has_dropped_na = True
                     if self._sort:
-                        # Replace NA codes with `largest code + 1`
+                        # NA goes at the end, gets `largest non-NA code + 1`
                         na_code = len(categories)
                     else:
-                        # Insert NA code into the codes based on first appearance
-                        # A negative code must exist, no need to check codes[na_idx] < 0
+                        # Insert NA in result based on first appearance, need
+                        # the number of unique codes prior
                         na_idx = na_mask.argmax()
-                        # count number of unique codes that comes before the nan value
                         na_code = algorithms.nunique_ints(cat.codes[:na_idx])
                     ucodes = np.insert(ucodes, na_code, -1)

@@ -733,14 +734,12 @@ def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
                 codes=ucodes, categories=categories, ordered=cat.ordered, validate=False
             )
             codes = cat.codes
-            if not self._dropna:
-                na_mask = codes < 0
-                if np.any(na_mask):
-                    if self._sort:
-                        codes = np.where(na_mask, na_code, codes)
-                    else:
-                        codes = np.where(codes >= na_code, codes + 1, codes)
-                        codes = np.where(na_mask, na_code, codes)
+
+            if has_dropped_na:
+                if not self._sort:
+                    # NA code is based on first appearance, increment higher codes
+                    codes = np.where(codes >= na_code, codes + 1, codes)
+                codes = np.where(na_mask, na_code, codes)

             return codes, uniques
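
Aside on the _codes_and_uniques cleanup above, concerning where the NA group's code lands: with sort=True it is simply len(categories); with sort=False it is the number of distinct codes seen before the first NA, and higher codes shift up by one. A NumPy sketch with assumed codes:

    import numpy as np

    cat_codes = np.array([1, -1, 0, 1])            # -1 marks NA
    na_mask = cat_codes < 0
    na_idx = na_mask.argmax()                      # position of first NA
    na_code = len(np.unique(cat_codes[:na_idx]))   # distinct codes before it -> 1
    codes = np.where(cat_codes >= na_code, cat_codes + 1, cat_codes)
    codes = np.where(na_mask, na_code, codes)
    print(codes)                                   # [2 1 0 2]
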
From 001881f3f58ae4d230d8578a4c31153874f4b8f3 Mon Sep 17 00:00:00 2001
From: Richard Shadrach
Date: Sat, 18 Nov 2023 07:19:01 -0500
Subject: [PATCH 23/31] cleanup

---
 pandas/core/groupby/ops.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index 85f694be62b59..e209bb638538e 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -832,12 +832,15 @@ def result_index_and_ids(self) -> tuple[Index, np.ndarray]:
             ids = len(unob_index) * ob_ids + unob_ids

             if self._sort:
+                # Sort result_index and recode ids using the new order
                 sorter = result_index.argsort()
                 result_index = result_index.take(sorter)
                 _, index = np.unique(sorter, return_index=True)
                 ids = ensure_platform_int(ids)
                 ids = index.take(ids)
             else:
+                # Recode ids and reorder result_index with observed groups up front,
+                # unobserved at the end
                 ids, uniques = compress_group_index(ids, sort=False)
                 ids = ensure_platform_int(ids)
                 taker = np.concatenate(

From d5b37a4e8e562e519fa0db5b533de87ab270c484 Mon Sep 17 00:00:00 2001
From: richard
Date: Thu, 7 Dec 2023 22:32:20 -0500
Subject: [PATCH 24/31] Merge fixup

---
 pandas/tests/groupby/test_groupby.py     | 10 ----------
 pandas/tests/groupby/test_grouping.py    | 10 ----------
 pandas/tests/groupby/test_timegrouper.py |  4 +---
 3 files changed, 1 insertion(+), 23 deletions(-)

diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index ecd19b81553cd..770732fbb11b4 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -3290,13 +3290,3 @@ def test_groupby_ffill_with_duplicated_index():
     result = df.groupby(level=0).ffill()
     expected = DataFrame({"a": [1, 2, 3, 4, 2, 3]}, index=[0, 1, 2, 0, 1, 2])
     tm.assert_frame_equal(result, expected, check_dtype=False)
-
-
-@pytest.mark.parametrize("attr", ["group_keys_seq", "reconstructed_codes"])
-def test_depr_grouper_attrs(attr):
-    # GH#56148
-    df = DataFrame({"a": [1, 1, 2], "b": [3, 4, 5]})
-    gb = df.groupby("a")
-    msg = f"{attr} is deprecated"
-    with tm.assert_produces_warning(FutureWarning, match=msg):
-        getattr(gb.grouper, attr)
diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py
index 04fb167da0b72..1ab9ff32e15bf 100644
--- a/pandas/tests/groupby/test_grouping.py
+++ b/pandas/tests/groupby/test_grouping.py
@@ -1206,13 +1206,3 @@ def test_grouper_groups():
     msg = "Grouper.indexer is deprecated"
     with tm.assert_produces_warning(FutureWarning, match=msg):
         grper.indexer
-
-
-@pytest.mark.parametrize("attr", ["group_index", "result_index", "group_arraylike"])
-def test_depr_grouping_attrs(attr):
-    # GH#56148
-    df = DataFrame({"a": [1, 1, 2], "b": [3, 4, 5]})
-    gb = df.groupby("a")
-    msg = f"{attr} is deprecated"
-    with tm.assert_produces_warning(FutureWarning, match=msg):
-        getattr(gb.grouper.groupings[0], attr)
diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py
index a5ac5b09bfd34..dcad283a8d852 100644
--- a/pandas/tests/groupby/test_timegrouper.py
+++ b/pandas/tests/groupby/test_timegrouper.py
@@ -67,9 +67,7 @@ def groupby_with_truncated_bingrouper(frame_for_truncated_bingrouper):
     gb = df.groupby(tdg)

     # check we're testing the case we're interested in
-    msg = "group_keys_seq is deprecated"
-    with tm.assert_produces_warning(FutureWarning, match=msg):
-        assert len(gb.grouper.result_index) != len(gb.grouper.group_keys_seq)
+    assert len(gb.grouper.result_index) != len(gb.grouper.codes)

     return gb
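
Before the next fixup: the _idxmax_idxmin change it contains counts observed groups through size(), since with observed=False the result index also carries unobserved categories. A sketch of that counting, with assumed data:

    import pandas as pd

    df = pd.DataFrame(
        {"a": pd.Categorical(["x"], categories=["x", "y"]), "b": [1]}
    )
    group_sizes = df.groupby("a", observed=False).size()
    result_len = group_sizes[group_sizes > 0].shape[0]
    print(len(group_sizes), result_len)  # 2 1 -> unobserved groups exist
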
From 4f284ce850994d091dfc48007b5f3c561310ebb0 Mon Sep 17 00:00:00 2001
From: richard
Date: Thu, 1 Feb 2024 23:32:49 -0500
Subject: [PATCH 25/31] fixup

---
 pandas/core/groupby/generic.py | 2 +-
 pandas/core/groupby/groupby.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index d07462c7f4723..a204dde4727d3 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -842,7 +842,7 @@ def value_counts(
         #  ndarray[Any, Any]], Index, Series]]
         _, idx = get_join_indexers(
             left,  # type: ignore[arg-type]
-            right,  # type: ignore[arg-type]
+            right,
             sort=False,
             how="left",
         )
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index 6be285d405415..b97e15a7a159e 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -5476,9 +5476,9 @@ def _idxmax_idxmin(
         if not self.observed and any(
             ping._passed_categorical for ping in self._grouper.groupings
         ):
-            expected_len = len(self.grouper.result_index)
+            expected_len = len(self._grouper.result_index)
             # TODO: Better way to find # of observed groups?
-            group_sizes = self.grouper.size()
+            group_sizes = self._grouper.size()
             result_len = group_sizes[group_sizes > 0].shape[0]
             assert result_len <= expected_len
             has_unobserved = result_len < expected_len

From dce05daaf5d3c6e74d1909282d8c6fc1503b6c27 Mon Sep 17 00:00:00 2001
From: richard
Date: Fri, 2 Feb 2024 18:27:19 -0500
Subject: [PATCH 26/31] fixup

---
 pandas/core/groupby/generic.py        | 10 +++++-----
 pandas/tests/groupby/test_grouping.py |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index a204dde4727d3..d925a3a801486 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -788,14 +788,14 @@ def value_counts(
         rep = partial(np.repeat, repeats=np.add.reduceat(inc, idx))

         # multi-index components
-        if isinstance(self.grouper.result_index, MultiIndex):
-            codes = list(self.grouper.result_index.codes)
+        if isinstance(self._grouper.result_index, MultiIndex):
+            codes = list(self._grouper.result_index.codes)
         else:
             codes = [
                 algorithms.factorize(
-                    self.grouper.result_index,
-                    sort=self.grouper._sort,
-                    use_na_sentinel=self.grouper.dropna,
+                    self._grouper.result_index,
+                    sort=self._grouper._sort,
+                    use_na_sentinel=self._grouper.dropna,
                 )[0]
             ]
         codes = [rep(level_codes) for level_codes in codes] + [llab(lab, inc)]
diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py
index b2c306f57ccf3..27b4508feb314 100644
--- a/pandas/tests/groupby/test_grouping.py
+++ b/pandas/tests/groupby/test_grouping.py
@@ -779,7 +779,7 @@ def test_groupby_empty(self):
             gr._grouper.group_info[0], np.array([], dtype=np.dtype(np.intp))
         )

-        assert gr.grouper.group_info[1] == 0
+        assert gr._grouper.group_info[1] == 0

         # check name
         gb = s.groupby(s)

From 72209a86f376c107e1a859087609a82d12c00e7f Mon Sep 17 00:00:00 2001
From: richard
Date: Sat, 3 Feb 2024 06:06:25 -0500
Subject: [PATCH 27/31] Fixup and test

---
 pandas/core/groupby/ops.py           | 20 +++++++-------------
 pandas/tests/groupby/test_groupby.py | 27 +++++++++++++++++++++++++++
 2 files changed, 34 insertions(+), 13 deletions(-)

diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index 521312ee49974..24f69cc014d94 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -702,19 +702,13 @@ def groups(self) -> dict[Hashable, Index]:
         """dict {group name -> group labels}"""
         if len(self.groupings) == 1:
             return self.groupings[0].groups
-        if len(self.result_index) == 0:
-            index = self.result_index
-        else:
-            index = self.result_index.take(self.ids)
-        categories = (
-            self.result_index._values
-            if isinstance(self.result_index, MultiIndex)
-            else self.result_index
-        )
-        values = index._values if isinstance(index, MultiIndex) else index
-        cats = Categorical(values, categories)
-        result = {k: self.axis.take(v) for k, v in cats._reverse_indexer().items()}
-
+        result_index, ids = self.result_index_and_ids
+        values = result_index._values
+        categories = Categorical(ids, categories=np.arange(len(result_index)))
+        result = {
+            values[group]: self.axis.take(axis_ilocs)
+            for group, axis_ilocs in categories._reverse_indexer().items()
+        }
         return result

     @final
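
Aside on the groups refactor above: the new construction buckets row positions by group id with a Categorical over all positions of result_index, then takes from the axis. A standalone sketch of the same idea (_reverse_indexer is a private pandas helper, used here exactly as in the patch; the data is assumed):

    import numpy as np
    import pandas as pd

    axis = pd.Index(["r0", "r1", "r2"])
    result_index = pd.Index(["a", "b"])
    ids = np.array([0, 1, 0])  # assumed row -> group mapping
    cats = pd.Categorical(ids, categories=np.arange(len(result_index)))
    groups = {
        result_index[gid]: axis.take(ilocs)
        for gid, ilocs in cats._reverse_indexer().items()
    }
    print(groups)  # {'a': Index(['r0', 'r2'], ...), 'b': Index(['r1'], ...)}
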
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index 3f1e402b319c1..db45067162bc3 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -1929,6 +1929,33 @@ def test_groupby_groups_in_BaseGrouper():
     assert result.groups == expected.groups


+def test_groups_sort_dropna(sort, dropna):
+    # GH#56966, GH#56851
+    df = DataFrame([[2.0, 1.0], [np.nan, 4.0], [0.0, 3.0]])
+    keys = [(2.0, 1.0), (np.nan, 4.0), (0.0, 3.0)]
+    values = [
+        Index([0], dtype="int64"),
+        Index([1], dtype="int64"),
+        Index([2], dtype="int64"),
+    ]
+    if sort:
+        taker = [2, 0] if dropna else [2, 0, 1]
+    else:
+        taker = [0, 2] if dropna else [0, 1, 2]
+    expected = {keys[idx]: values[idx] for idx in taker}
+
+    gb = df.groupby([0, 1], sort=sort, dropna=dropna)
+    result = gb.groups
+
+    for result_key, expected_key in zip(result.keys(), expected.keys()):
+        # Compare as NumPy arrays to handle np.nan
+        result_key = np.array(result_key)
+        expected_key = np.array(expected_key)
+        tm.assert_numpy_array_equal(result_key, expected_key)
+    for result_value, expected_value in zip(result.values(), expected.values()):
+        tm.assert_index_equal(result_value, expected_value)
+
+
 @pytest.mark.parametrize(
     "op, expected",
     [

From b58b69de28a66948aab41a4f8b91575f685dcb02 Mon Sep 17 00:00:00 2001
From: richard
Date: Sat, 3 Feb 2024 06:17:59 -0500
Subject: [PATCH 28/31] whatsnew

---
 doc/source/whatsnew/v3.0.0.rst | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index 25163a0f678b0..abec2f0b0a036 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -206,6 +206,10 @@ Plotting
 Groupby/resample/rolling
 ^^^^^^^^^^^^^^^^^^^^^^^^
 - Bug in :meth:`.DataFrameGroupBy.quantile` when ``interpolation="nearest"`` is inconsistent with :meth:`DataFrame.quantile` (:issue:`47942`)
+- Bug in :meth:`.DataFrameGroupBy.groups` and :meth:`.SeriesGroupBy.groups` that would not respect groupby arguments ``dropna`` and ``sort`` (:issue:`55919`, :issue:`56966`, :issue:`56851`)
+- Bug in :meth:`.DataFrameGroupBy.nunique` and :meth:`.SeriesGroupBy.nunique` would fail with multiple categorical groupings when ``as_index=False`` (:issue:`52848`)
+- Bug in :meth:`.DataFrameGroupBy.prod`, :meth:`.DataFrameGroupBy.any`, and :meth:`.DataFrameGroupBy.all` would result in NA values on unobserved groups; they now result in ``1``, ``False``, and ``True`` respectively (:issue:`55783`)
+- Bug in :meth:`.DataFrameGroupBy.value_counts` would produce incorrect results when used with some categorical and some non-categorical groupings and ``observed=False`` (:issue:`56016`)
 -

 Reshaping

From fe99dc52bac44f61f0b19254c914ea28e357036f Mon Sep 17 00:00:00 2001
From: richard
Date: Sat, 3 Feb 2024 07:53:29 -0500
Subject: [PATCH 29/31] type ignore

---
 pandas/core/groupby/ops.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index 24f69cc014d94..8cc7c51362d43 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -706,7 +706,8 @@ def groups(self) -> dict[Hashable, Index]:
         values = result_index._values
         categories = Categorical(ids, categories=np.arange(len(result_index)))
         result = {
-            values[group]: self.axis.take(axis_ilocs)
+            # mypy is not aware that group has to be an integer
+            values[group]: self.axis.take(axis_ilocs)  # type: ignore[call-overload]
             for group, axis_ilocs in categories._reverse_indexer().items()
         }
         return result
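
Before the refactor commit that follows: _ob_index_and_ids is built on two pandas-internal helpers, get_group_index (encode several key-code columns into one flat id per row) and compress_group_index (densify the observed flat ids). A sketch of the two calls with assumed codes:

    import numpy as np
    from pandas.core.sorting import compress_group_index, get_group_index

    codes = [np.array([0, 1, 1]), np.array([1, 0, 1])]  # two key columns
    shape = (2, 2)                                      # level sizes
    flat = get_group_index(codes, shape, sort=True, xnull=True)
    print(flat)                                         # [1 2 3]
    obs_ids, obs_group_ids = compress_group_index(flat, sort=True)
    print(obs_ids, obs_group_ids)                       # [0 1 2] [1 2 3]
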
From 766c229dbe1eb8725559c0d4b9105dfd917bda2c Mon Sep 17 00:00:00 2001
From: richard
Date: Sun, 4 Feb 2024 08:32:01 -0500
Subject: [PATCH 30/31] Refactor & type annotations

---
 pandas/core/groupby/grouper.py |  2 +-
 pandas/core/groupby/ops.py     | 99 +++++++++++++++++++---------------
 2 files changed, 57 insertions(+), 44 deletions(-)

diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py
index 99b0fe67d3dd1..9313beb9374b3 100644
--- a/pandas/core/groupby/grouper.py
+++ b/pandas/core/groupby/grouper.py
@@ -680,7 +680,7 @@ def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:

             has_dropped_na = False
             if not self._dropna:
-                na_mask = cat.codes < 0
+                na_mask = cat.isna()
                 if np.any(na_mask):
                     has_dropped_na = True
                     if self._sort:
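
A note on the one-line grouper.py change above: Categorical.isna() and a codes < 0 check are equivalent (NA is encoded as code -1); the rename just states the intent. As a quick check:

    import pandas as pd

    cat = pd.Categorical(["x", None, "y"])
    assert (cat.isna() == (cat.codes < 0)).all()
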
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index 8cc7c51362d43..1fd9446e47bf0 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -748,13 +748,11 @@ def result_index(self) -> Index:
         return self.result_index_and_ids[0]

     @property
-    def ids(self) -> np.ndarray:
+    def ids(self) -> npt.NDArray[np.intp]:
         return self.result_index_and_ids[1]

     @cache_readonly
-    def result_index_and_ids(self) -> tuple[Index, np.ndarray]:
-        names = self.names
-        codes = [ping.codes for ping in self.groupings]
+    def result_index_and_ids(self) -> tuple[Index, npt.NDArray[np.intp]]:
         levels = [Index._with_infer(ping.uniques) for ping in self.groupings]
         obs = [
             ping._observed or not ping._passed_categorical for ping in self.groupings
@@ -766,48 +764,29 @@ def result_index_and_ids(self) -> tuple[Index, np.ndarray]:

         if len(self.groupings) == 1:
             result_index = levels[0]
-            result_index.name = names[0]
-            ids = ensure_platform_int(codes[0])
-            return result_index, ids
-
-        if any(obs):
-            ob_codes = [code for code, ob in zip(codes, obs) if ob]
-            ob_levels = [level for level, ob in zip(levels, obs) if ob]
-            ob_names = [name for name, ob in zip(names, obs) if ob]
-
-            shape = tuple(len(level) for level in ob_levels)
-            group_index = get_group_index(ob_codes, shape, sort=True, xnull=True)
-            ob_ids, obs_group_ids = compress_group_index(group_index, sort=self._sort)
-            ob_ids = ensure_platform_int(ob_ids)
-            ob_index_codes = decons_obs_group_ids(
-                ob_ids, obs_group_ids, shape, ob_codes, xnull=True
-            )
-            ob_index = MultiIndex(
-                levels=ob_levels,
-                codes=ob_index_codes,
-                names=ob_names,
-                verify_integrity=False,
-            )
-
-        if not all(obs):
-            unob_codes = [e for e, o in zip(codes, obs) if not o]
-            unob_levels = [e for e, o in zip(levels, obs) if not o]
-            unob_names = [e for e, o in zip(names, obs) if not o]
-
-            shape = tuple(len(level) for level in unob_levels)
-            unob_ids = get_group_index(unob_codes, shape, sort=True, xnull=True)
-            unob_index = MultiIndex.from_product(unob_levels, names=unob_names)
-
-        if all(obs):
-            result_index = ob_index
-            ids = ensure_platform_int(ob_ids)
+            result_index.name = self.names[0]
+            ids = ensure_platform_int(self.codes[0])
+        elif all(obs):
+            result_index, ids = self._ob_index_and_ids(levels, self.codes, self.names)
         elif not any(obs):
-            result_index = unob_index
-            ids = ensure_platform_int(unob_ids)
+            result_index, ids = self._unob_index_and_ids(levels, self.codes, self.names)
         else:
-            # Combine unobserved and observed parts of result_index
-            unob_indices = [k for k, e in enumerate(obs) if not e]
+            # Combine unobserved and observed parts
+            names = self.names
+            codes = [ping.codes for ping in self.groupings]
             ob_indices = [k for k, e in enumerate(obs) if e]
+            unob_indices = [k for k, e in enumerate(obs) if not e]
+            ob_index, ob_ids = self._ob_index_and_ids(
+                levels=[levels[idx] for idx in ob_indices],
+                codes=[codes[idx] for idx in ob_indices],
+                names=[names[idx] for idx in ob_indices],
+            )
+            unob_index, unob_ids = self._unob_index_and_ids(
+                levels=[levels[idx] for idx in unob_indices],
+                codes=[codes[idx] for idx in unob_indices],
+                names=[names[idx] for idx in unob_indices],
+            )

             result_index_codes = np.concatenate(
                 [
                     np.tile(unob_index.codes, len(ob_index)),
@@ -842,6 +821,40 @@ def result_index_and_ids(self) -> tuple[Index, np.ndarray]:

         return result_index, ids

+    def _ob_index_and_ids(
+        self,
+        levels: list[Index],
+        codes: list[npt.NDArray[np.intp]],
+        names: list[Hashable],
+    ) -> tuple[MultiIndex, npt.NDArray[np.intp]]:
+        shape = tuple(len(level) for level in levels)
+        group_index = get_group_index(codes, shape, sort=True, xnull=True)
+        ob_ids, obs_group_ids = compress_group_index(group_index, sort=self._sort)
+        ob_ids = ensure_platform_int(ob_ids)
+        ob_index_codes = decons_obs_group_ids(
+            ob_ids, obs_group_ids, shape, codes, xnull=True
+        )
+        ob_index = MultiIndex(
+            levels=levels,
+            codes=ob_index_codes,
+            names=names,
+            verify_integrity=False,
+        )
+        ob_ids = ensure_platform_int(ob_ids)
+        return ob_index, ob_ids
+
+    def _unob_index_and_ids(
+        self,
+        levels: list[Index],
+        codes: list[npt.NDArray[np.intp]],
+        names: list[Hashable],
+    ) -> tuple[MultiIndex, npt.NDArray[np.intp]]:
+        shape = tuple(len(level) for level in levels)
+        unob_ids = get_group_index(codes, shape, sort=True, xnull=True)
+        unob_index = MultiIndex.from_product(levels, names=names)
+        unob_ids = ensure_platform_int(unob_ids)
+        return unob_index, unob_ids
+
     @final
     def get_group_levels(self) -> list[Index]:
         # Note: only called from _insert_inaxis_grouper, which

From a05ff186f73290f67c3bd8618fa01da59c9bb14b Mon Sep 17 00:00:00 2001
From: richard
Date: Sun, 4 Feb 2024 08:38:26 -0500
Subject: [PATCH 31/31] Better bikeshed

---
 pandas/core/groupby/ops.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index 1fd9446e47bf0..46ef0f38706bc 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -774,8 +774,8 @@ def result_index_and_ids(self) -> tuple[Index, npt.NDArray[np.intp]]:
             # Combine unobserved and observed parts
             names = self.names
             codes = [ping.codes for ping in self.groupings]
-            ob_indices = [k for k, e in enumerate(obs) if e]
-            unob_indices = [k for k, e in enumerate(obs) if not e]
+            ob_indices = [idx for idx, ob in enumerate(obs) if ob]
+            unob_indices = [idx for idx, ob in enumerate(obs) if not ob]
             ob_index, ob_ids = self._ob_index_and_ids(
                 levels=[levels[idx] for idx in ob_indices],
                 codes=[codes[idx] for idx in ob_indices],