From 1f75a25f1a031e5d05181d06dd924e823f8d425e Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 10 Aug 2022 23:03:24 +0200 Subject: [PATCH 01/11] ENH: Support masks in groupby prod --- doc/source/whatsnew/v1.5.0.rst | 1 + pandas/_libs/groupby.pyx | 32 ++++++++++++++++++++++------ pandas/core/groupby/ops.py | 7 ++++-- pandas/tests/groupby/test_groupby.py | 21 ++++++++++++++++++ 4 files changed, 52 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index a0d33cb513722..60b53d54cc8a7 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -1070,6 +1070,7 @@ Groupby/resample/rolling - Bug in :meth:`.GroupBy.cummin` and :meth:`.GroupBy.cummax` with nullable dtypes incorrectly altering the original data in place (:issue:`46220`) - Bug in :meth:`DataFrame.groupby` raising error when ``None`` is in first level of :class:`MultiIndex` (:issue:`47348`) - Bug in :meth:`.GroupBy.cummax` with ``int64`` dtype with leading value being the smallest possible int64 (:issue:`46382`) +- Bug in :meth:`GroupBy.sum` with integer dtypes losing precision (:issue:`37493`) - Bug in :meth:`.GroupBy.max` with empty groups and ``uint64`` dtype incorrectly raising ``RuntimeError`` (:issue:`46408`) - Bug in :meth:`.GroupBy.apply` would fail when ``func`` was a string and args or kwargs were supplied (:issue:`46479`) - Bug in :meth:`SeriesGroupBy.apply` would incorrectly name its result when there was a unique group (:issue:`46369`) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 06830a1d84c6e..dd4210e673b4f 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -612,10 +612,12 @@ def group_sum( @cython.wraparound(False) @cython.boundscheck(False) def group_prod( - floating[:, ::1] out, + numeric_t[:, ::1] out, int64_t[::1] counts, - ndarray[floating, ndim=2] values, + ndarray[numeric_t, ndim=2] values, const intp_t[::1] labels, + const uint8_t[:, ::1] mask, + uint8_t[:, ::1] result_mask=None, Py_ssize_t min_count=0, ) -> None: """ @@ -623,10 +625,11 @@ def group_prod( """ cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - floating val, count - floating[:, ::1] prodx + numeric_t val, count + numeric_t[:, ::1] prodx int64_t[:, ::1] nobs Py_ssize_t len_values = len(values), len_labels = len(labels) + bint isna_entry, uses_mask = mask is not None if len_values != len_labels: raise ValueError("len(index) != len(labels)") @@ -646,15 +649,30 @@ def group_prod( for j in range(K): val = values[i, j] - # not nan - if val == val: + if uses_mask: + isna_entry = mask[i, j] + elif numeric_t is float32_t or numeric_t is float64_t: + isna_entry = not val == val + else: + isna_entry = False + + if not isna_entry: nobs[lab, j] += 1 prodx[lab, j] *= val for i in range(ncounts): for j in range(K): if nobs[i, j] < min_count: - out[i, j] = NAN + + # else case is not possible + if uses_mask: + result_mask[i, j] = True + elif numeric_t is float32_t or numeric_t is float64_t: + out[i, j] = NAN + else: + # we only get here when < mincount which gets handled later + pass + else: out[i, j] = prodx[i, j] diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 5feed98cbc75b..e38b3e6886acb 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -155,6 +155,7 @@ def __init__(self, kind: str, how: str, has_dropped_na: bool) -> None: "last", "first", "rank", + "prod", } _cython_arity = {"ohlc": 4} # OHLC @@ -217,7 +218,7 @@ def _get_cython_vals(self, values: np.ndarray) -> np.ndarray: values = ensure_float64(values) elif values.dtype.kind in ["i", "u"]: - if how in ["sum", "var", "prod", "mean", "ohlc"] or ( + if how in ["sum", "var", "mean", "ohlc"] or ( self.kind == "transform" and self.has_dropped_na ): # result may still include NaN, so we have to cast @@ -581,6 +582,8 @@ def _call_cython_op( min_count=min_count, is_datetimelike=is_datetimelike, ) + elif self.how in ["prod"]: + func(result, counts, values, comp_ids, mask, result_mask, min_count) else: func(result, counts, values, comp_ids, min_count) else: @@ -613,7 +616,7 @@ def _call_cython_op( # need to have the result set to np.nan, which may require casting, # see GH#40767 if is_integer_dtype(result.dtype) and not is_datetimelike: - cutoff = max(1, min_count) + cutoff = max(0 if self.how == "prod" else 1, min_count) empty_groups = counts < cutoff if empty_groups.any(): if result_mask is not None and self.uses_mask(): diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index a6ab13270c4dc..75c6cabb09714 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2808,3 +2808,24 @@ def test_single_element_list_grouping(): ) with tm.assert_produces_warning(FutureWarning, match=msg): values, _ = next(iter(df.groupby(["a"]))) + + +def test_groupby_prod_avoid_casting_to_float(): + # GH#37493 + val = 922337203685477580 + df = DataFrame({"a": 1, "b": [val]}) + result = df.groupby("a").prod() - val + expected = DataFrame({"b": [0]}, index=Index([1], name="a")) + tm.assert_frame_equal(result, expected) + + +def test_groupby_prod_support_mask(any_numeric_ea_dtype): + # GH#37493 + df = DataFrame({"a": 1, "b": [1, 2, pd.NA]}, dtype=any_numeric_ea_dtype) + result = df.groupby("a").prod() + expected = DataFrame( + {"b": [2]}, + index=Index([1], name="a", dtype=any_numeric_ea_dtype), + dtype=any_numeric_ea_dtype, + ) + tm.assert_frame_equal(result, expected) From 27aedf310e0127dae285a695e65db117f1ae9d8d Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 10 Aug 2022 23:42:17 +0200 Subject: [PATCH 02/11] Fix pyi --- pandas/_libs/groupby.pyi | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi index dfae1bff91ac8..671e821b8dc4c 100644 --- a/pandas/_libs/groupby.pyi +++ b/pandas/_libs/groupby.pyi @@ -59,10 +59,12 @@ def group_sum( is_datetimelike: bool = ..., ) -> None: ... def group_prod( - out: np.ndarray, # floating[:, ::1] + out: np.ndarray, # numeric_t[:, ::1] counts: np.ndarray, # int64_t[::1] - values: np.ndarray, # ndarray[floating, ndim=2] + values: np.ndarray, # ndarray[numeric_t, ndim=2] labels: np.ndarray, # const intp_t[:] + mask: np.ndarray | None, + result_mask: np.ndarray | None = ..., min_count: int = ..., ) -> None: ... def group_var( From 4d7f1d1ee3717aaf9b391fad25be2ade13eabb14 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 12 Aug 2022 21:33:17 +0200 Subject: [PATCH 03/11] Refactor for overflow --- pandas/_libs/groupby.pyx | 21 +++++++++++++++------ pandas/core/groupby/ops.py | 2 +- pandas/tests/groupby/test_groupby.py | 20 +++++++++++++++----- 3 files changed, 31 insertions(+), 12 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index db7c7649a422a..9c2b86f68cdc2 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -639,12 +639,21 @@ def group_sum( out[i, j] = sumx[i, j] + +ctypedef fused prod_t: + int64_t + uint64_t + + float32_t + float64_t + + @cython.wraparound(False) @cython.boundscheck(False) def group_prod( - numeric_t[:, ::1] out, + prod_t[:, ::1] out, int64_t[::1] counts, - ndarray[numeric_t, ndim=2] values, + ndarray[prod_t, ndim=2] values, const intp_t[::1] labels, const uint8_t[:, ::1] mask, uint8_t[:, ::1] result_mask=None, @@ -655,8 +664,8 @@ def group_prod( """ cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - numeric_t val, count - numeric_t[:, ::1] prodx + prod_t val, count + prod_t[:, ::1] prodx int64_t[:, ::1] nobs Py_ssize_t len_values = len(values), len_labels = len(labels) bint isna_entry, uses_mask = mask is not None @@ -681,7 +690,7 @@ def group_prod( if uses_mask: isna_entry = mask[i, j] - elif numeric_t is float32_t or numeric_t is float64_t: + elif prod_t is float32_t or prod_t is float64_t: isna_entry = not val == val else: isna_entry = False @@ -697,7 +706,7 @@ def group_prod( # else case is not possible if uses_mask: result_mask[i, j] = True - elif numeric_t is float32_t or numeric_t is float64_t: + elif prod_t is float32_t or prod_t is float64_t: out[i, j] = NAN else: # we only get here when < mincount which gets handled later diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 7a79e7c5fc57d..fa0304bd10b6e 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -226,7 +226,7 @@ def _get_cython_vals(self, values: np.ndarray) -> np.ndarray: # result may still include NaN, so we have to cast values = ensure_float64(values) - elif how == "sum": + elif how in ["sum", "prod"]: # Avoid overflow during group op if values.dtype.kind == "i": values = ensure_int64(values) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index d290aada18293..d0078f8754e79 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2810,21 +2810,23 @@ def test_single_element_list_grouping(): values, _ = next(iter(df.groupby(["a"]))) -def test_groupby_sum_avoid_casting_to_float(): +@pytest.mark.parametrize("func", ["prod", "sum"]) +def test_groupby_avoid_casting_to_float(func): # GH#37493 val = 922337203685477580 df = DataFrame({"a": 1, "b": [val]}) - result = df.groupby("a").sum() - val + result = getattr(df.groupby("a"), func)() - val expected = DataFrame({"b": [0]}, index=Index([1], name="a")) tm.assert_frame_equal(result, expected) -def test_groupby_sum_support_mask(any_numeric_ea_dtype): +@pytest.mark.parametrize("func, val", [("sum", 3), ("prod", 2)]) +def test_groupby_sum_support_mask(any_numeric_ea_dtype, func, val): # GH#37493 df = DataFrame({"a": 1, "b": [1, 2, pd.NA]}, dtype=any_numeric_ea_dtype) - result = df.groupby("a").sum() + result = getattr(df.groupby("a"), func)() expected = DataFrame( - {"b": [3]}, + {"b": [val]}, index=Index([1], name="a", dtype=any_numeric_ea_dtype), dtype=any_numeric_ea_dtype, ) @@ -2842,3 +2844,11 @@ def test_groupby_sum_overflow(val, dtype): dtype=f"{dtype}64", ) tm.assert_frame_equal(result, expected) + + result = df.groupby("a").prod() + expected = DataFrame( + {"b": [val * val]}, + index=Index([1], name="a", dtype=f"{dtype}64"), + dtype=f"{dtype}64", + ) + tm.assert_frame_equal(result, expected) From e96d99b589896c4d2f94cde8baf67b95ffe1c450 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 12 Aug 2022 21:35:58 +0200 Subject: [PATCH 04/11] Fix issues --- doc/source/whatsnew/v1.5.0.rst | 3 +-- pandas/_libs/groupby.pyi | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 8bd1304d9104f..fda768c8a79f1 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -1070,12 +1070,11 @@ Groupby/resample/rolling - Bug when using ``engine="numba"`` would return the same jitted function when modifying ``engine_kwargs`` (:issue:`46086`) - Bug in :meth:`.DataFrameGroupBy.transform` fails when ``axis=1`` and ``func`` is ``"first"`` or ``"last"`` (:issue:`45986`) - Bug in :meth:`DataFrameGroupBy.cumsum` with ``skipna=False`` giving incorrect results (:issue:`46216`) -- Bug in :meth:`GroupBy.sum` with integer dtypes losing precision (:issue:`37493`) +- Bug in :meth:`GroupBy.sum` and :meth:`GroupBy.prod` with integer dtypes losing precision (:issue:`37493`) - Bug in :meth:`.GroupBy.cumsum` with ``timedelta64[ns]`` dtype failing to recognize ``NaT`` as a null value (:issue:`46216`) - Bug in :meth:`.GroupBy.cummin` and :meth:`.GroupBy.cummax` with nullable dtypes incorrectly altering the original data in place (:issue:`46220`) - Bug in :meth:`DataFrame.groupby` raising error when ``None`` is in first level of :class:`MultiIndex` (:issue:`47348`) - Bug in :meth:`.GroupBy.cummax` with ``int64`` dtype with leading value being the smallest possible int64 (:issue:`46382`) -- Bug in :meth:`GroupBy.sum` with integer dtypes losing precision (:issue:`37493`) - Bug in :meth:`.GroupBy.max` with empty groups and ``uint64`` dtype incorrectly raising ``RuntimeError`` (:issue:`46408`) - Bug in :meth:`.GroupBy.apply` would fail when ``func`` was a string and args or kwargs were supplied (:issue:`46479`) - Bug in :meth:`SeriesGroupBy.apply` would incorrectly name its result when there was a unique group (:issue:`46369`) diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi index fce07557a6e51..c40bfa301746e 100644 --- a/pandas/_libs/groupby.pyi +++ b/pandas/_libs/groupby.pyi @@ -61,9 +61,9 @@ def group_sum( is_datetimelike: bool = ..., ) -> None: ... def group_prod( - out: np.ndarray, # numeric_t[:, ::1] + out: np.ndarray, # prod_t[:, ::1] counts: np.ndarray, # int64_t[::1] - values: np.ndarray, # ndarray[numeric_t, ndim=2] + values: np.ndarray, # ndarray[prod_t, ndim=2] labels: np.ndarray, # const intp_t[:] mask: np.ndarray | None, result_mask: np.ndarray | None = ..., From 591d78c39a205b8ee48edbb3d05f64d622c5ac74 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 14 Aug 2022 22:11:22 +0200 Subject: [PATCH 05/11] Rename fused type --- pandas/_libs/groupby.pyx | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 9c2b86f68cdc2..0e7956865fd9c 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -640,7 +640,7 @@ def group_sum( -ctypedef fused prod_t: +ctypedef fused int64float_t: int64_t uint64_t @@ -651,9 +651,9 @@ ctypedef fused prod_t: @cython.wraparound(False) @cython.boundscheck(False) def group_prod( - prod_t[:, ::1] out, + int64float_t[:, ::1] out, int64_t[::1] counts, - ndarray[prod_t, ndim=2] values, + ndarray[int64float_t, ndim=2] values, const intp_t[::1] labels, const uint8_t[:, ::1] mask, uint8_t[:, ::1] result_mask=None, @@ -664,8 +664,8 @@ def group_prod( """ cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - prod_t val, count - prod_t[:, ::1] prodx + int64float_t val, count + int64float_t[:, ::1] prodx int64_t[:, ::1] nobs Py_ssize_t len_values = len(values), len_labels = len(labels) bint isna_entry, uses_mask = mask is not None @@ -690,7 +690,7 @@ def group_prod( if uses_mask: isna_entry = mask[i, j] - elif prod_t is float32_t or prod_t is float64_t: + elif int64float_t is float32_t or int64float_t is float64_t: isna_entry = not val == val else: isna_entry = False From 6249fb67e2f0ce10881188292010e0a0fde7b98c Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 15 Aug 2022 19:29:07 +0200 Subject: [PATCH 06/11] Fix type name --- pandas/_libs/groupby.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 0e7956865fd9c..5bcbdf4c73b34 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -706,7 +706,7 @@ def group_prod( # else case is not possible if uses_mask: result_mask[i, j] = True - elif prod_t is float32_t or prod_t is float64_t: + elif int64float_t is float32_t or int64float_t is float64_t: out[i, j] = NAN else: # we only get here when < mincount which gets handled later From 33aea785fda633997874de9319b1adbe2d2a9afc Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 15 Aug 2022 19:53:57 +0200 Subject: [PATCH 07/11] Remove duplicated type --- pandas/_libs/groupby.pyx | 7 ------- 1 file changed, 7 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 41f47d5342181..9ec5d289499b1 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -861,13 +861,6 @@ def group_mean( out[i, j] = sumx[i, j] / count -ctypedef fused int64float_t: - float32_t - float64_t - int64_t - uint64_t - - @cython.wraparound(False) @cython.boundscheck(False) def group_ohlc( From 1a57f3f0a062d67535749773e09ec94740b6d110 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 15 Aug 2022 19:54:53 +0200 Subject: [PATCH 08/11] Fix annotation --- pandas/_libs/groupby.pyi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi index 5421a016adc47..6ca7d378fc69e 100644 --- a/pandas/_libs/groupby.pyi +++ b/pandas/_libs/groupby.pyi @@ -61,9 +61,9 @@ def group_sum( is_datetimelike: bool = ..., ) -> None: ... def group_prod( - out: np.ndarray, # prod_t[:, ::1] + out: np.ndarray, # int64float_t[:, ::1] counts: np.ndarray, # int64_t[::1] - values: np.ndarray, # ndarray[prod_t, ndim=2] + values: np.ndarray, # ndarray[int64float_t, ndim=2] labels: np.ndarray, # const intp_t[:] mask: np.ndarray | None, result_mask: np.ndarray | None = ..., From 75a1e7fb0fe5e0be52eb45796b6e21002dfecb5f Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 17 Aug 2022 23:20:13 +0200 Subject: [PATCH 09/11] Add initialize --- pandas/_libs/groupby.pyx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 9ec5d289499b1..3b6f51f6a8c0e 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -705,6 +705,8 @@ def group_prod( # else case is not possible if uses_mask: result_mask[i, j] = True + # Be deterministic, out was initialized as empty + out[i, j] = 0 elif int64float_t is float32_t or int64float_t is float64_t: out[i, j] = NAN else: From de8a92d300fdec54b60ec8eecf9b15af262a4050 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 18 Aug 2022 09:42:12 +0200 Subject: [PATCH 10/11] Add . before groupby --- doc/source/whatsnew/v1.5.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index f3a667da1a5c3..ed4457c7aa5f7 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -1078,7 +1078,7 @@ Groupby/resample/rolling - Bug when using ``engine="numba"`` would return the same jitted function when modifying ``engine_kwargs`` (:issue:`46086`) - Bug in :meth:`.DataFrameGroupBy.transform` fails when ``axis=1`` and ``func`` is ``"first"`` or ``"last"`` (:issue:`45986`) - Bug in :meth:`DataFrameGroupBy.cumsum` with ``skipna=False`` giving incorrect results (:issue:`46216`) -- Bug in :meth:`GroupBy.sum` and :meth:`GroupBy.prod` with integer dtypes losing precision (:issue:`37493`) +- Bug in :meth:`GroupBy.sum` and :meth:`.GroupBy.prod` with integer dtypes losing precision (:issue:`37493`) - Bug in :meth:`.GroupBy.cumsum` with ``timedelta64[ns]`` dtype failing to recognize ``NaT`` as a null value (:issue:`46216`) - Bug in :meth:`.GroupBy.cummin` and :meth:`.GroupBy.cummax` with nullable dtypes incorrectly altering the original data in place (:issue:`46220`) - Bug in :meth:`DataFrame.groupby` raising error when ``None`` is in first level of :class:`MultiIndex` (:issue:`47348`) From 87615af662cd12d1255650ccd506a8e7ed16c0a7 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 18 Aug 2022 22:34:01 +0200 Subject: [PATCH 11/11] Remove dup type --- pandas/_libs/groupby.pyx | 9 --------- 1 file changed, 9 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 4a9a5c0e900d9..299dfdf177d91 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -679,15 +679,6 @@ def group_sum( out[i, j] = sumx[i, j] - -ctypedef fused int64float_t: - int64_t - uint64_t - - float32_t - float64_t - - @cython.wraparound(False) @cython.boundscheck(False) def group_prod(