From c0084262735c92e062c4d4f9babe8ffc8d88d9b0 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 30 May 2019 13:57:45 -0500 Subject: [PATCH 1/3] ENH: Named aggregation in SeriesGroupBy.agg ```python In [4]: animals = pd.DataFrame({'kind': ['cat', 'dog', 'cat', 'dog'], ...: 'height': [9.1, 6.0, 9.5, 34.0], ...: 'weight': [7.9, 7.5, 9.9, 198.0]}) ...: animals.groupby("kind").height.agg(max_height='max') Out[4]: max_height kind cat 9.5 dog 34.0 ``` Closes #26512 --- doc/source/user_guide/groupby.rst | 14 ++++++- doc/source/whatsnew/v0.25.0.rst | 20 +++++++-- pandas/core/groupby/generic.py | 42 +++++++++++++++---- .../tests/groupby/aggregate/test_aggregate.py | 29 +++++++++++++ pandas/tests/groupby/aggregate/test_other.py | 1 + 5 files changed, 94 insertions(+), 12 deletions(-) diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index 2014dbd9865f3..9895fc606f70d 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -595,7 +595,7 @@ accepts the special syntax in :meth:`GroupBy.agg`, known as "named aggregation", animals.groupby("kind").agg( min_height=pd.NamedAgg(column='height', aggfunc='min'), max_height=pd.NamedAgg(column='height', aggfunc='max'), - average_weight=pd.NamedAgg(column='height', aggfunc=np.mean), + average_weight=pd.NamedAgg(column='weight', aggfunc=np.mean), ) @@ -606,7 +606,7 @@ accepts the special syntax in :meth:`GroupBy.agg`, known as "named aggregation", animals.groupby("kind").agg( min_height=('height', 'min'), max_height=('height', 'max'), - average_weight=('height', np.mean), + average_weight=('weight', np.mean), ) @@ -630,6 +630,16 @@ requires additional arguments, partially apply them with :meth:`functools.partia consistent. To ensure consistent ordering, the keys (and so output columns) will always be sorted for Python 3.5. +Named aggregation is also valid for Series groupby aggregations. In this case there's +no column selection, so the values are just the functions. 
+ +.. ipython:: python + + animals.groupby("kind").height.agg( + min_height='min', + max_height='max', + ) + Applying different functions to DataFrame columns ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 89a9da4a73b35..068b3985404e1 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -28,7 +28,7 @@ Groupby Aggregation with Relabeling ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Pandas has added special groupby behavior, known as "named aggregation", for naming the -output columns when applying multiple aggregation functions to specific columns (:issue:`18366`). +output columns when applying multiple aggregation functions to specific columns (:issue:`18366`, :issue:`26512`). .. ipython:: python @@ -39,7 +39,7 @@ output columns when applying multiple aggregation functions to specific columns animals.groupby("kind").agg( min_height=pd.NamedAgg(column='height', aggfunc='min'), max_height=pd.NamedAgg(column='height', aggfunc='max'), - average_weight=pd.NamedAgg(column='height', aggfunc=np.mean), + average_weight=pd.NamedAgg(column='weight', aggfunc=np.mean), ) Pass the desired columns names as the ``**kwargs`` to ``.agg``. The values of ``**kwargs`` @@ -52,12 +52,26 @@ what the arguments to the function are, but plain tuples are accepted as well. animals.groupby("kind").agg( min_height=('height', 'min'), max_height=('height', 'max'), - average_weight=('height', np.mean), + average_weight=('weight', np.mean), ) Named aggregation is the recommended replacement for the deprecated "dict-of-dicts" approach to naming the output of column-specific aggregations (:ref:`whatsnew_0200.api_breaking.deprecate_group_agg_dict`). +A similar approach is now available for Series groupby objects as well. Because there's no need for +column selection, the values can just be the functions to apply. + +.. 
ipython:: python + + animals.groupby("kind").height.agg( + min_height="min", + max_height="max", + ) + + +This type of aggregation is the recommended alternative to the deprecated behavior when passing +a dict to a Series groupby aggregation (:ref:`whatsnew_0200.api_breaking.deprecate_group_agg_dict`). + See :ref:`_groupby.aggregate.named` for more. .. _whatsnew_0250.enhancements.other: diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 121244cde368a..8ca1109a63000 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -735,6 +735,17 @@ def _selection_name(self): min max 1 1 2 2 3 4 + + The output column names can be controlled by passing + the desired column names and aggregations as keyword arguments. + + >>> s.groupby([1, 1, 2, 2]).agg( + ... minimum='min', + ... maximum='max', + ... ) + minimum maximum + 1 1 2 + 2 3 4 """) @Appender(_apply_docs['template'] @@ -749,8 +760,20 @@ def apply(self, func, *args, **kwargs): klass='Series', axis='') @Appender(_shared_docs['aggregate']) - def aggregate(self, func_or_funcs, *args, **kwargs): + def aggregate(self, func_or_funcs=None, *args, **kwargs): _level = kwargs.pop('_level', None) + + relabeling = func_or_funcs is None + columns = None + no_arg_message = ("Must provide 'func_or_funcs' or named " + "aggregation **kwargs.") + if relabeling: + columns = list(kwargs) + func_or_funcs = list(kwargs.values()) + kwargs = {} + if not columns: + raise TypeError(no_arg_message) + if isinstance(func_or_funcs, str): return getattr(self, func_or_funcs)(*args, **kwargs) @@ -759,6 +782,8 @@ def aggregate(self, func_or_funcs, *args, **kwargs): # but not the class list / tuple itself. 
ret = self._aggregate_multiple_funcs(func_or_funcs, (_level or 0) + 1) + if relabeling: + ret.columns = columns else: cyfunc = self._is_cython_func(func_or_funcs) if cyfunc and not args and not kwargs: @@ -793,11 +818,14 @@ def _aggregate_multiple_funcs(self, arg, _level): # have not shown a higher level one # GH 15931 if isinstance(self._selected_obj, Series) and _level <= 1: - warnings.warn( - ("using a dict on a Series for aggregation\n" - "is deprecated and will be removed in a future " - "version"), - FutureWarning, stacklevel=3) + msg = dedent("""\ + using a dict on a Series for aggregation + is deprecated and will be removed in a future version. Use \ + named aggregation instead. + + >>> grouper.agg(name_1=func_1, name_2=func_2) + """) + warnings.warn(msg, FutureWarning, stacklevel=3) columns = list(arg.keys()) arg = arg.items() @@ -1562,7 +1590,7 @@ def groupby_series(obj, col=None): def _is_multi_agg_with_relabel(**kwargs): """ - Check whether the kwargs pass to .agg look like multi-agg with relabling. + Check whether kwargs passed to .agg look like multi-agg with relabeling. 
Parameters ---------- diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 9e714a1086037..3761d678e4548 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -331,6 +331,35 @@ def test_uint64_type_handling(dtype, how): class TestNamedAggregation: + def test_series_named_agg(self): + df = pd.Series([1, 2, 3, 4]) + gr = df.groupby([0, 0, 1, 1]) + result = gr.agg(a='sum', b='min') + expected = pd.DataFrame({'a': [3, 7], 'b': [1, 3]}, + columns=['a', 'b'], index=[0, 1]) + tm.assert_frame_equal(result, expected) + + result = gr.agg(b='min', a='sum') + expected = expected[['b', 'a']] + tm.assert_frame_equal(result, expected) + + def test_no_args_raises(self): + gr = pd.Series([1, 2]).groupby([0, 1]) + with pytest.raises(TypeError, match='Must provide'): + gr.agg() + + # but we do allow this + result = gr.agg([]) + expected = pd.DataFrame() + tm.assert_frame_equal(result, expected) + + def test_series_named_agg_duplicates_raises(self): + # This is a limitation of the named agg implementation reusing + # aggregate_multiple_funcs. It could maybe be lifted in the future. + gr = pd.Series([1, 2, 3]).groupby([0, 0, 1]) + with pytest.raises(SpecificationError): + gr.agg(a='sum', b='sum') + def test_agg_relabel(self): df = pd.DataFrame({"group": ['a', 'a', 'b', 'b'], "A": [0, 1, 2, 3], diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index 8168cf06ffdb1..a061eaa1a2c6f 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -225,6 +225,7 @@ def test_agg_dict_renaming_deprecation(): with tm.assert_produces_warning(FutureWarning) as w: df.groupby('A').B.agg({'foo': 'count'}) assert "using a dict on a Series for aggregation" in str(w[0].message) + assert "named aggregation instead." 
in str(w[0].message) def test_agg_compat(): From 4599b9198083e0b982be9584118f117f32a621e3 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 31 May 2019 15:45:21 -0500 Subject: [PATCH 2/3] py35 compat --- pandas/core/groupby/generic.py | 6 +++++- pandas/tests/groupby/aggregate/test_aggregate.py | 4 +++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 8ca1109a63000..60a1071f6aa4c 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -769,7 +769,11 @@ def aggregate(self, func_or_funcs=None, *args, **kwargs): "aggregation **kwargs.") if relabeling: columns = list(kwargs) - func_or_funcs = list(kwargs.values()) + if not PY36: + # sort for 3.5 and earlier + columns = list(sorted(columns)) + + func_or_funcs = [kwargs[col] for col in columns] kwargs = {} if not columns: raise TypeError(no_arg_message) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 3761d678e4548..67c7c2d29c6dd 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -340,7 +340,9 @@ def test_series_named_agg(self): tm.assert_frame_equal(result, expected) result = gr.agg(b='min', a='sum') - expected = expected[['b', 'a']] + # sort for 35 and earlier + if compat.PY36: + expected = expected[['b', 'a']] tm.assert_frame_equal(result, expected) def test_no_args_raises(self): From 90b2f08af1de67ced649e8de14b1d54c9f418f40 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 7 Jun 2019 14:56:55 -0500 Subject: [PATCH 3/3] split --- pandas/tests/groupby/aggregate/test_aggregate.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 67c7c2d29c6dd..801b99fed5ce6 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ 
b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -329,7 +329,7 @@ def test_uint64_type_handling(dtype, how): tm.assert_frame_equal(result, expected, check_exact=True) -class TestNamedAggregation: +class TestNamedAggregationSeries: def test_series_named_agg(self): df = pd.Series([1, 2, 3, 4]) @@ -362,6 +362,8 @@ def test_series_named_agg_duplicates_raises(self): with pytest.raises(SpecificationError): gr.agg(a='sum', b='sum') + +class TestNamedAggregationDataFrame: def test_agg_relabel(self): df = pd.DataFrame({"group": ['a', 'a', 'b', 'b'], "A": [0, 1, 2, 3],