From b8ae9bbab3c3384ad90e6145fda13a338767ea21 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Wed, 7 Sep 2022 19:39:41 +0200 Subject: [PATCH] BUG: masked mean unnecessarily overflowing (#48378) --- doc/source/whatsnew/v1.6.0.rst | 2 +- pandas/core/array_algos/masked_reductions.py | 22 +++++++++++--------- pandas/core/arrays/masked.py | 19 +++++++++++------ pandas/tests/extension/base/dim2.py | 19 +++++++++++++++-- pandas/tests/reductions/test_reductions.py | 12 +++++++++++ 5 files changed, 55 insertions(+), 19 deletions(-) diff --git a/doc/source/whatsnew/v1.6.0.rst b/doc/source/whatsnew/v1.6.0.rst index c393b8a57f805..42d3ce8069322 100644 --- a/doc/source/whatsnew/v1.6.0.rst +++ b/doc/source/whatsnew/v1.6.0.rst @@ -200,7 +200,7 @@ Sparse ExtensionArray ^^^^^^^^^^^^^^ -- +- Bug in :meth:`Series.mean` overflowing unnecessarily with nullable integers (:issue:`48378`) - Styler diff --git a/pandas/core/array_algos/masked_reductions.py b/pandas/core/array_algos/masked_reductions.py index 3e59a267f7191..041905c993b0d 100644 --- a/pandas/core/array_algos/masked_reductions.py +++ b/pandas/core/array_algos/masked_reductions.py @@ -14,7 +14,7 @@ from pandas.core.nanops import check_below_min_count -def _sumprod( +def _reductions( func: Callable, values: np.ndarray, mask: npt.NDArray[np.bool_], @@ -24,7 +24,7 @@ def _sumprod( axis: int | None = None, ): """ - Sum or product for 1D masked array. + Sum, mean or product for 1D masked array. Parameters ---------- @@ -63,7 +63,7 @@ def sum( min_count: int = 0, axis: int | None = None, ): - return _sumprod( + return _reductions( np.sum, values=values, mask=mask, skipna=skipna, min_count=min_count, axis=axis ) @@ -76,7 +76,7 @@ def prod( min_count: int = 0, axis: int | None = None, ): - return _sumprod( + return _reductions( np.prod, values=values, mask=mask, skipna=skipna, min_count=min_count, axis=axis ) @@ -139,11 +139,13 @@ def max( return _minmax(np.max, values=values, mask=mask, skipna=skipna, axis=axis) -# TODO: axis kwarg -def mean(values: np.ndarray, mask: npt.NDArray[np.bool_], skipna: bool = True): +def mean( + values: np.ndarray, + mask: npt.NDArray[np.bool_], + *, + skipna: bool = True, + axis: int | None = None, +): if not values.size or mask.all(): return libmissing.NA - _sum = _sumprod(np.sum, values=values, mask=mask, skipna=skipna) - count = np.count_nonzero(~mask) - mean_value = _sum / count - return mean_value + return _reductions(np.mean, values=values, mask=mask, skipna=skipna, axis=axis) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index c5f6dea7157ab..5c77a50f4a805 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -1036,17 +1036,12 @@ def _quantile( # Reductions def _reduce(self, name: str, *, skipna: bool = True, **kwargs): - if name in {"any", "all", "min", "max", "sum", "prod"}: + if name in {"any", "all", "min", "max", "sum", "prod", "mean"}: return getattr(self, name)(skipna=skipna, **kwargs) data = self._data mask = self._mask - if name in {"mean"}: - op = getattr(masked_reductions, name) - result = op(data, mask, skipna=skipna, **kwargs) - return result - # coerce to a nan-aware float if needed # (we explicitly use NaN within reductions) if self._hasna: @@ -1107,6 +1102,18 @@ def prod(self, *, skipna=True, min_count=0, axis: int | None = 0, **kwargs): "prod", result, skipna=skipna, axis=axis, **kwargs ) + def mean(self, *, skipna=True, axis: int | None = 0, **kwargs): + nv.validate_mean((), kwargs) + result = masked_reductions.mean( + self._data, + self._mask, + skipna=skipna, + axis=axis, + ) + return self._wrap_reduction_result( + "mean", result, skipna=skipna, axis=axis, **kwargs + ) + def min(self, *, skipna=True, axis: int | None = 0, **kwargs): nv.validate_min((), kwargs) return masked_reductions.min( diff --git a/pandas/tests/extension/base/dim2.py b/pandas/tests/extension/base/dim2.py index f71f3cf164bfc..d2c1e6971c56e 100644 --- a/pandas/tests/extension/base/dim2.py +++ b/pandas/tests/extension/base/dim2.py @@ -6,7 +6,13 @@ from pandas._libs.missing import is_matching_na +from pandas.core.dtypes.common import ( + is_bool_dtype, + is_integer_dtype, +) + import pandas as pd +import pandas._testing as tm from pandas.core.arrays.integer import INT_STR_TO_DTYPE from pandas.tests.extension.base.base import BaseExtensionTests @@ -191,7 +197,12 @@ def test_reductions_2d_axis0(self, data, method): kwargs["ddof"] = 0 try: - result = getattr(arr2d, method)(axis=0, **kwargs) + if method == "mean" and hasattr(data, "_mask"): + # Empty slices produced by the mask cause RuntimeWarnings by numpy + with tm.assert_produces_warning(RuntimeWarning, check_stacklevel=False): + result = getattr(arr2d, method)(axis=0, **kwargs) + else: + result = getattr(arr2d, method)(axis=0, **kwargs) except Exception as err: try: getattr(data, method)() @@ -212,7 +223,7 @@ def get_reduction_result_dtype(dtype): # i.e. dtype.kind == "u" return INT_STR_TO_DTYPE[np.dtype(np.uint).name] - if method in ["mean", "median", "sum", "prod"]: + if method in ["median", "sum", "prod"]: # std and var are not dtype-preserving expected = data if method in ["sum", "prod"] and data.dtype.kind in "iub": @@ -229,6 +240,10 @@ def get_reduction_result_dtype(dtype): self.assert_extension_array_equal(result, expected) elif method == "std": self.assert_extension_array_equal(result, data - data) + elif method == "mean": + if is_integer_dtype(data) or is_bool_dtype(data): + data = data.astype("Float64") + self.assert_extension_array_equal(result, data) # punt on method == "var" @pytest.mark.parametrize("method", ["mean", "median", "var", "std", "sum", "prod"]) diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index fa53ed47dbdba..ef94a18016719 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -775,6 +775,18 @@ def test_sum_overflow_float(self, use_bottleneck, dtype): result = s.max(skipna=False) assert np.allclose(float(result), v[-1]) + def test_mean_masked_overflow(self): + # GH#48378 + val = 100_000_000_000_000_000 + n_elements = 100 + na = np.array([val] * n_elements) + ser = Series([val] * n_elements, dtype="Int64") + + result_numpy = np.mean(na) + result_masked = ser.mean() + assert result_masked - result_numpy == 0 + assert result_masked == 1e17 + @pytest.mark.parametrize("dtype", ("m8[ns]", "m8[ns]", "M8[ns]", "M8[ns, UTC]")) @pytest.mark.parametrize("skipna", [True, False]) def test_empty_timeseries_reductions_return_nat(self, dtype, skipna):