REF: Compute complete result_index upfront in groupby (#55738)

* REF: Compute correct result_index upfront in groupby * Refinements * Refinements * Refinements * Restore inferring index dtype * Test fixups * Refinements * Refinements * fixup * fixup * fixup * Fix sorting and non-sorting * Cleanup * Call ensure_platform_int last * fixup * fixup * REF: Compute correct result_index upfront in groupby * Add test * Remove test * Move unobserved to the end * cleanup * cleanup * cleanup * Merge fixup * fixup * fixup * Fixup and test * whatsnew * type ignore * Refactor & type annotations * Better bikeshed
pandas-dev · Feb 7, 2024 · 05f75c6 · 05f75c6
1 parent 9b3c301
commit 05f75c6
Show file tree

Hide file tree

Showing 14 changed files with 283 additions and 421 deletions.
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -213,6 +213,11 @@ Groupby/resample/rolling
 ^^^^^^^^^^^^^^^^^^^^^^^^
 - Bug in :meth:`.DataFrameGroupBy.quantile` when ``interpolation="nearest"`` is inconsistent with :meth:`DataFrame.quantile` (:issue:`47942`)
 - Bug in :meth:`DataFrame.ewm` and :meth:`Series.ewm` when passed ``times`` and aggregation functions other than mean (:issue:`51695`)
+- Bug in :meth:`.DataFrameGroupBy.groups` and :meth:`.SeriesGroupBy.groups` that would not respect groupby arguments ``dropna`` and ``sort`` (:issue:`55919`, :issue:`56966`, :issue:`56851`)
+- Bug in :meth:`.DataFrameGroupBy.nunique` and :meth:`.SeriesGroupBy.nunique` that would fail with multiple categorical groupings when ``as_index=False`` (:issue:`52848`)
+- Bug in :meth:`.DataFrameGroupBy.prod`, :meth:`.DataFrameGroupBy.any`, and :meth:`.DataFrameGroupBy.all` that would result in NA values on unobserved groups; they now result in ``1``, ``False``, and ``True`` respectively (:issue:`55783`)
+- Bug in :meth:`.DataFrameGroupBy.value_counts` that would produce incorrect results when used with some categorical and some non-categorical groupings and ``observed=False`` (:issue:`56016`)
+-
 
 Reshaping
 ^^^^^^^^^

diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
@@ -411,7 +411,6 @@ def _wrap_applied_output(
             # GH #823 #24880
             index = self._grouper.result_index
             res_df = self.obj._constructor_expanddim(values, index=index)
-            res_df = self._reindex_output(res_df)
             # if self.observed is False,
             # keep all-NaN rows created while re-indexing
             res_ser = res_df.stack(future_stack=True)
@@ -437,7 +436,7 @@ def _wrap_applied_output(
             if not self.as_index:
                 result = self._insert_inaxis_grouper(result)
                 result.index = default_index(len(result))
-            return self._reindex_output(result)
+            return result
 
     def _aggregate_named(self, func, *args, **kwargs):
         # Note: this is very similar to _aggregate_series_pure_python,
@@ -658,7 +657,7 @@ def nunique(self, dropna: bool = True) -> Series | DataFrame:
         2023-02-01    1
         Freq: MS, dtype: int64
         """
-        ids, _, ngroups = self._grouper.group_info
+        ids, ngroups = self._grouper.group_info
         val = self.obj._values
         codes, uniques = algorithms.factorize(val, use_na_sentinel=dropna, sort=False)
 
@@ -691,7 +690,7 @@ def nunique(self, dropna: bool = True) -> Series | DataFrame:
         if not self.as_index:
             result = self._insert_inaxis_grouper(result)
             result.index = default_index(len(result))
-        return self._reindex_output(result, fill_value=0)
+        return result
 
     @doc(Series.describe)
     def describe(self, percentiles=None, include=None, exclude=None) -> Series:
@@ -719,7 +718,7 @@ def value_counts(
         from pandas.core.reshape.merge import get_join_indexers
         from pandas.core.reshape.tile import cut
 
-        ids, _, _ = self._grouper.group_info
+        ids, _ = self._grouper.group_info
         val = self.obj._values
 
         index_names = self._grouper.names + [self.obj.name]
@@ -789,9 +788,18 @@ def value_counts(
         rep = partial(np.repeat, repeats=np.add.reduceat(inc, idx))
 
         # multi-index components
-        codes = self._grouper.reconstructed_codes
+        if isinstance(self._grouper.result_index, MultiIndex):
+            codes = list(self._grouper.result_index.codes)
+        else:
+            codes = [
+                algorithms.factorize(
+                    self._grouper.result_index,
+                    sort=self._grouper._sort,
+                    use_na_sentinel=self._grouper.dropna,
+                )[0]
+            ]
         codes = [rep(level_codes) for level_codes in codes] + [llab(lab, inc)]
-        levels = [ping._group_index for ping in self._grouper.groupings] + [lev]
+        levels = self._grouper.levels + [lev]
 
         if dropna:
             mask = codes[-1] != -1
@@ -834,7 +842,7 @@ def value_counts(
             # ndarray[Any, Any]], Index, Series]]
             _, idx = get_join_indexers(
                 left,  # type: ignore[arg-type]
-                right,  # type: ignore[arg-type]
+                right,
                 sort=False,
                 how="left",
             )
@@ -1605,7 +1613,7 @@ def _wrap_applied_output_series(
         if not self.as_index:
             result = self._insert_inaxis_grouper(result)
 
-        return self._reindex_output(result)
+        return result
 
     def _cython_transform(
         self,