Skip to content

Commit

Permalink
BUG: masked mean unnecessarily overflowing (#48378)
Browse files Browse the repository at this point in the history
  • Loading branch information
phofl authored Sep 7, 2022
1 parent ce143a2 commit b8ae9bb
Show file tree
Hide file tree
Showing 5 changed files with 55 additions and 19 deletions.
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v1.6.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,7 @@ Sparse

ExtensionArray
^^^^^^^^^^^^^^
-
- Bug in :meth:`Series.mean` overflowing unnecessarily with nullable integers (:issue:`48378`)
-

Styler
Expand Down
22 changes: 12 additions & 10 deletions pandas/core/array_algos/masked_reductions.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from pandas.core.nanops import check_below_min_count


def _sumprod(
def _reductions(
func: Callable,
values: np.ndarray,
mask: npt.NDArray[np.bool_],
Expand All @@ -24,7 +24,7 @@ def _sumprod(
axis: int | None = None,
):
"""
Sum or product for 1D masked array.
Sum, mean or product for 1D masked array.
Parameters
----------
Expand Down Expand Up @@ -63,7 +63,7 @@ def sum(
min_count: int = 0,
axis: int | None = None,
):
return _sumprod(
return _reductions(
np.sum, values=values, mask=mask, skipna=skipna, min_count=min_count, axis=axis
)

Expand All @@ -76,7 +76,7 @@ def prod(
min_count: int = 0,
axis: int | None = None,
):
return _sumprod(
return _reductions(
np.prod, values=values, mask=mask, skipna=skipna, min_count=min_count, axis=axis
)

Expand Down Expand Up @@ -139,11 +139,13 @@ def max(
return _minmax(np.max, values=values, mask=mask, skipna=skipna, axis=axis)


# TODO: axis kwarg
def mean(values: np.ndarray, mask: npt.NDArray[np.bool_], skipna: bool = True):
def mean(
values: np.ndarray,
mask: npt.NDArray[np.bool_],
*,
skipna: bool = True,
axis: int | None = None,
):
if not values.size or mask.all():
return libmissing.NA
_sum = _sumprod(np.sum, values=values, mask=mask, skipna=skipna)
count = np.count_nonzero(~mask)
mean_value = _sum / count
return mean_value
return _reductions(np.mean, values=values, mask=mask, skipna=skipna, axis=axis)
19 changes: 13 additions & 6 deletions pandas/core/arrays/masked.py
Original file line number Diff line number Diff line change
Expand Up @@ -1036,17 +1036,12 @@ def _quantile(
# Reductions

def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
if name in {"any", "all", "min", "max", "sum", "prod"}:
if name in {"any", "all", "min", "max", "sum", "prod", "mean"}:
return getattr(self, name)(skipna=skipna, **kwargs)

data = self._data
mask = self._mask

if name in {"mean"}:
op = getattr(masked_reductions, name)
result = op(data, mask, skipna=skipna, **kwargs)
return result

# coerce to a nan-aware float if needed
# (we explicitly use NaN within reductions)
if self._hasna:
Expand Down Expand Up @@ -1107,6 +1102,18 @@ def prod(self, *, skipna=True, min_count=0, axis: int | None = 0, **kwargs):
"prod", result, skipna=skipna, axis=axis, **kwargs
)

def mean(self, *, skipna=True, axis: int | None = 0, **kwargs):
nv.validate_mean((), kwargs)
result = masked_reductions.mean(
self._data,
self._mask,
skipna=skipna,
axis=axis,
)
return self._wrap_reduction_result(
"mean", result, skipna=skipna, axis=axis, **kwargs
)

def min(self, *, skipna=True, axis: int | None = 0, **kwargs):
nv.validate_min((), kwargs)
return masked_reductions.min(
Expand Down
19 changes: 17 additions & 2 deletions pandas/tests/extension/base/dim2.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,13 @@

from pandas._libs.missing import is_matching_na

from pandas.core.dtypes.common import (
is_bool_dtype,
is_integer_dtype,
)

import pandas as pd
import pandas._testing as tm
from pandas.core.arrays.integer import INT_STR_TO_DTYPE
from pandas.tests.extension.base.base import BaseExtensionTests

Expand Down Expand Up @@ -191,7 +197,12 @@ def test_reductions_2d_axis0(self, data, method):
kwargs["ddof"] = 0

try:
result = getattr(arr2d, method)(axis=0, **kwargs)
if method == "mean" and hasattr(data, "_mask"):
# Empty slices produced by the mask cause RuntimeWarnings by numpy
with tm.assert_produces_warning(RuntimeWarning, check_stacklevel=False):
result = getattr(arr2d, method)(axis=0, **kwargs)
else:
result = getattr(arr2d, method)(axis=0, **kwargs)
except Exception as err:
try:
getattr(data, method)()
Expand All @@ -212,7 +223,7 @@ def get_reduction_result_dtype(dtype):
# i.e. dtype.kind == "u"
return INT_STR_TO_DTYPE[np.dtype(np.uint).name]

if method in ["mean", "median", "sum", "prod"]:
if method in ["median", "sum", "prod"]:
# std and var are not dtype-preserving
expected = data
if method in ["sum", "prod"] and data.dtype.kind in "iub":
Expand All @@ -229,6 +240,10 @@ def get_reduction_result_dtype(dtype):
self.assert_extension_array_equal(result, expected)
elif method == "std":
self.assert_extension_array_equal(result, data - data)
elif method == "mean":
if is_integer_dtype(data) or is_bool_dtype(data):
data = data.astype("Float64")
self.assert_extension_array_equal(result, data)
# punt on method == "var"

@pytest.mark.parametrize("method", ["mean", "median", "var", "std", "sum", "prod"])
Expand Down
12 changes: 12 additions & 0 deletions pandas/tests/reductions/test_reductions.py
Original file line number Diff line number Diff line change
Expand Up @@ -775,6 +775,18 @@ def test_sum_overflow_float(self, use_bottleneck, dtype):
result = s.max(skipna=False)
assert np.allclose(float(result), v[-1])

def test_mean_masked_overflow(self):
# GH#48378
val = 100_000_000_000_000_000
n_elements = 100
na = np.array([val] * n_elements)
ser = Series([val] * n_elements, dtype="Int64")

result_numpy = np.mean(na)
result_masked = ser.mean()
assert result_masked - result_numpy == 0
assert result_masked == 1e17

@pytest.mark.parametrize("dtype", ("m8[ns]", "m8[ns]", "M8[ns]", "M8[ns, UTC]"))
@pytest.mark.parametrize("skipna", [True, False])
def test_empty_timeseries_reductions_return_nat(self, dtype, skipna):
Expand Down

0 comments on commit b8ae9bb

Please sign in to comment.