diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index fae7edba057ec..a757ffb2d9665 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -190,6 +190,7 @@ Other Deprecations Removal of prior version deprecations/changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - :func:`read_excel`, :func:`read_json`, :func:`read_html`, and :func:`read_xml` no longer accept raw string or byte representation of the data. That type of data must be wrapped in a :py:class:`StringIO` or :py:class:`BytesIO` (:issue:`53767`) +- :meth:`DataFrame.groupby` with ``as_index=False`` and aggregation methods will no longer exclude from the result the groupings that do not arise from the input (:issue:`49519`) - :meth:`Series.dt.to_pydatetime` now returns a :class:`Series` of :py:class:`datetime.datetime` objects (:issue:`52459`) - :meth:`SeriesGroupBy.agg` no longer pins the name of the group to the input passed to the provided ``func`` (:issue:`51703`) - All arguments except ``name`` in :meth:`Index.rename` are now keyword only (:issue:`56493`) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 61168f71f4924..72f03bbe66263 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1287,34 +1287,43 @@ def _set_result_index_ordered( return result @final - def _insert_inaxis_grouper(self, result: Series | DataFrame) -> DataFrame: + def _insert_inaxis_grouper( + self, result: Series | DataFrame, qs: npt.NDArray[np.float64] | None = None + ) -> DataFrame: if isinstance(result, Series): result = result.to_frame() + n_groupings = len(self._grouper.groupings) + + if qs is not None: + result.insert( + 0, f"level_{n_groupings}", np.tile(qs, len(result) // len(qs)) + ) + # zip in reverse so we can always insert at loc 0 - columns = result.columns - for name, lev, in_axis in zip( - reversed(self._grouper.names), - reversed(self._grouper.get_group_levels()), - reversed([grp.in_axis for grp in self._grouper.groupings]), + for level, (name, lev, in_axis) in enumerate( + zip( + reversed(self._grouper.names), + reversed(self._grouper.get_group_levels()), + reversed([grp.in_axis for grp in self._grouper.groupings]), + ) ): + if name is None: + # Behave the same as .reset_index() when a level is unnamed + name = ( + "index" + if n_groupings == 1 and qs is None + else f"level_{n_groupings - level - 1}" + ) + # GH #28549 # When using .apply(-), name will be in columns already - if name not in columns: - if in_axis: + if name not in result.columns: + # if in_axis: + if qs is None: result.insert(0, name, lev) else: - msg = ( - "A grouping was used that is not in the columns of the " - "DataFrame and so was excluded from the result. This grouping " - "will be included in a future version of pandas. Add the " - "grouping as a column of the DataFrame to silence this warning." - ) - warnings.warn( - message=msg, - category=FutureWarning, - stacklevel=find_stack_level(), - ) + result.insert(0, name, Index(np.repeat(lev, len(qs)))) return result @@ -1341,18 +1350,17 @@ def _wrap_aggregated_output( if not self.as_index: # `not self.as_index` is only relevant for DataFrameGroupBy, # enforced in __init__ - result = self._insert_inaxis_grouper(result) + result = self._insert_inaxis_grouper(result, qs=qs) result = result._consolidate() - index = Index(range(self._grouper.ngroups)) + result.index = RangeIndex(len(result)) else: index = self._grouper.result_index - - if qs is not None: - # We get here with len(qs) != 1 and not self.as_index - # in test_pass_args_kwargs - index = _insert_quantile_level(index, qs) - result.index = index + if qs is not None: + # We get here with len(qs) != 1 and not self.as_index + # in test_pass_args_kwargs + index = _insert_quantile_level(index, qs) + result.index = index return result diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 255784e8bf24d..45a9540a1a0b9 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -1250,10 +1250,7 @@ def test_pass_args_kwargs_duplicate_columns(tsframe, as_index): tsframe.columns = ["A", "B", "A", "C"] gb = tsframe.groupby(lambda x: x.month, as_index=as_index) - warn = None if as_index else FutureWarning - msg = "A grouping .* was excluded from the result" - with tm.assert_produces_warning(warn, match=msg): - res = gb.agg(np.percentile, 80, axis=0) + res = gb.agg(np.percentile, 80, axis=0) ex_data = { 1: tsframe[tsframe.index.month == 1].quantile(0.8), @@ -1261,7 +1258,7 @@ def test_pass_args_kwargs_duplicate_columns(tsframe, as_index): } expected = DataFrame(ex_data).T if not as_index: - # TODO: try to get this more consistent? + expected.insert(0, "index", [1, 2]) expected.index = Index(range(2)) tm.assert_frame_equal(res, expected) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 76c8a6fdb9570..467d932f1c45f 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -779,24 +779,27 @@ def test_as_index(): # function grouper f = lambda r: df.loc[r, "A"] - msg = "A grouping .* was excluded from the result" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.groupby(["cat", f], as_index=False, observed=True).sum() + result = df.groupby(["cat", f], as_index=False, observed=True).sum() expected = DataFrame( { "cat": Categorical([1, 2], categories=df.cat.cat.categories), + "level_1": [10, 11], "A": [10, 22], "B": [101, 205], }, - columns=["cat", "A", "B"], ) tm.assert_frame_equal(result, expected) # another not in-axis grouper (conflicting names in index) s = Series(["a", "b", "b"], name="cat") - msg = "A grouping .* was excluded from the result" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.groupby(["cat", s], as_index=False, observed=True).sum() + result = df.groupby(["cat", s], as_index=False, observed=True).sum() + expected = DataFrame( + { + "cat": ["a", "b"], + "A": [10, 22], + "B": [101, 205], + }, + ) tm.assert_frame_equal(result, expected) # is original index dropped? @@ -1852,7 +1855,7 @@ def test_category_order_reducer( request, as_index, sort, observed, reduction_func, index_kind, ordered ): # GH#48749 - if reduction_func == "corrwith" and not as_index: + if reduction_func == "corrwith" and not as_index and index_kind != "single": msg = "GH#49950 - corrwith with as_index=False may not have grouping column" request.applymarker(pytest.mark.xfail(reason=msg)) elif index_kind != "range" and not as_index: diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 686279f25939a..52c93d566bc73 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -103,26 +103,22 @@ def f(x, q=None, axis=0): # DataFrame for as_index in [True, False]: df_grouped = tsframe.groupby(lambda x: x.month, as_index=as_index) - warn = None if as_index else FutureWarning - msg = "A grouping .* was excluded from the result" - with tm.assert_produces_warning(warn, match=msg): - agg_result = df_grouped.agg(np.percentile, 80, axis=0) - with tm.assert_produces_warning(warn, match=msg): - apply_result = df_grouped.apply(DataFrame.quantile, 0.8) - with tm.assert_produces_warning(warn, match=msg): - expected = df_grouped.quantile(0.8) + agg_result = df_grouped.agg(np.percentile, 80, axis=0) + apply_result = df_grouped.apply(DataFrame.quantile, 0.8) + expected = df_grouped.quantile(0.8) tm.assert_frame_equal(apply_result, expected, check_names=False) tm.assert_frame_equal(agg_result, expected) apply_result = df_grouped.apply(DataFrame.quantile, [0.4, 0.8]) - with tm.assert_produces_warning(warn, match=msg): - expected_seq = df_grouped.quantile([0.4, 0.8]) + expected_seq = df_grouped.quantile([0.4, 0.8]) + if not as_index: + # apply treats the op as a transform; .quantile knows it's a reduction + apply_result = apply_result.reset_index() + apply_result["level_0"] = [1, 1, 2, 2] tm.assert_frame_equal(apply_result, expected_seq, check_names=False) - with tm.assert_produces_warning(warn, match=msg): - agg_result = df_grouped.agg(f, q=80) - with tm.assert_produces_warning(warn, match=msg): - apply_result = df_grouped.apply(DataFrame.quantile, q=0.8) + agg_result = df_grouped.agg(f, q=80) + apply_result = df_grouped.apply(DataFrame.quantile, q=0.8) tm.assert_frame_equal(agg_result, expected) tm.assert_frame_equal(apply_result, expected, check_names=False) diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 54efe163f077e..9e2f9285d06ec 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -552,11 +552,6 @@ def test_categorical_reducers(reduction_func, observed, sort, as_index, index_ki expected = expected.set_index(["x", "x2"]) else: expected = expected.set_index("x") - elif index_kind != "range" and reduction_func != "size": - # size, unlike other methods, has the desired behavior in GH#49519 - expected = expected.drop(columns="x") - if index_kind == "multi": - expected = expected.drop(columns="x2") if reduction_func in ("idxmax", "idxmin") and index_kind != "range": # expected was computed with a RangeIndex; need to translate to index values values = expected["y"].values.tolist() @@ -572,13 +567,7 @@ def test_categorical_reducers(reduction_func, observed, sort, as_index, index_ki if as_index: expected = expected["size"].rename(None) - if as_index or index_kind == "range" or reduction_func == "size": - warn = None - else: - warn = FutureWarning - msg = "A grouping .* was excluded from the result" - with tm.assert_produces_warning(warn, match=msg): - result = getattr(gb_keepna, reduction_func)(*args) + result = getattr(gb_keepna, reduction_func)(*args) # size will return a Series, others are DataFrame tm.assert_equal(result, expected) diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 8474d4c1d2d1c..2961369936717 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -1125,12 +1125,8 @@ def test_grouping_by_key_is_in_axis(): assert not gb._grouper.groupings[0].in_axis assert gb._grouper.groupings[1].in_axis - # Currently only in-axis groupings are including in the result when as_index=False; - # This is likely to change in the future. - msg = "A grouping .* was excluded from the result" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = gb.sum() - expected = DataFrame({"b": [1, 2], "c": [7, 5]}) + result = gb.sum() + expected = DataFrame({"a": [1, 2], "b": [1, 2], "c": [7, 5]}) tm.assert_frame_equal(result, expected)