Skip to content

PERF: Add var to masked arrays #48379

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Sep 13, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.6.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ Performance improvements
- Performance improvement for :meth:`Series.value_counts` with nullable dtype (:issue:`48338`)
- Performance improvement for :class:`Series` constructor passing integer numpy array with nullable dtype (:issue:`48338`)
- Performance improvement for :meth:`MultiIndex.unique` (:issue:`48335`)
- Performance improvement in ``var`` for nullable dtypes (:issue:`48379`)
- Performance improvement to :func:`read_sas` with ``blank_missing=True`` (:issue:`48502`)
-

Expand Down
21 changes: 19 additions & 2 deletions pandas/core/array_algos/masked_reductions.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ def _reductions(
skipna: bool = True,
min_count: int = 0,
axis: int | None = None,
**kwargs,
):
"""
Sum, mean or product for 1D masked array.
Expand All @@ -45,14 +46,14 @@ def _reductions(
if mask.any(axis=axis) or check_below_min_count(values.shape, None, min_count):
return libmissing.NA
else:
return func(values, axis=axis)
return func(values, axis=axis, **kwargs)
else:
if check_below_min_count(values.shape, mask, min_count) and (
axis is None or values.ndim == 1
):
return libmissing.NA

return func(values, where=~mask, axis=axis)
return func(values, where=~mask, axis=axis, **kwargs)


def sum(
Expand Down Expand Up @@ -149,3 +150,19 @@ def mean(
if not values.size or mask.all():
return libmissing.NA
return _reductions(np.mean, values=values, mask=mask, skipna=skipna, axis=axis)


def var(
    values: np.ndarray,
    mask: npt.NDArray[np.bool_],
    *,
    skipna: bool = True,
    axis: int | None = None,
    ddof: int = 1,
):
    """
    Variance for a masked array.

    Parameters
    ----------
    values : np.ndarray
        Numpy array holding the data.
    mask : np.ndarray[bool]
        Boolean numpy array; True entries mark missing values and are
        excluded from the computation when ``skipna`` is True.
    skipna : bool, default True
        Whether to skip NA.
    axis : int, optional, default None
        Axis forwarded to the underlying reduction.
    ddof : int, default 1
        Delta degrees of freedom, forwarded to ``np.var``.

    Returns
    -------
    ``libmissing.NA`` if ``values`` is empty or every entry is masked,
    otherwise the result of reducing with ``np.var`` via ``_reductions``.
    """
    # Local import keeps this block self-contained; ``warnings`` is stdlib.
    import warnings

    # Nothing to reduce over: empty input or everything is missing.
    if not values.size or mask.all():
        return libmissing.NA

    # np.var emits a RuntimeWarning ("Degrees of freedom <= 0 for slice")
    # when the number of unmasked values does not exceed ``ddof`` (e.g. a
    # single valid value with ddof=1). The result is already NaN in that
    # case, so the warning is only noise for callers of the masked reduction.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", RuntimeWarning)
        return _reductions(
            np.var, values=values, mask=mask, skipna=skipna, axis=axis, ddof=ddof
        )
15 changes: 14 additions & 1 deletion pandas/core/arrays/masked.py
Original file line number Diff line number Diff line change
Expand Up @@ -1028,7 +1028,7 @@ def _quantile(
# Reductions

def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
if name in {"any", "all", "min", "max", "sum", "prod", "mean"}:
if name in {"any", "all", "min", "max", "sum", "prod", "mean", "var"}:
return getattr(self, name)(skipna=skipna, **kwargs)

data = self._data
Expand Down Expand Up @@ -1106,6 +1106,19 @@ def mean(self, *, skipna=True, axis: int | None = 0, **kwargs):
"mean", result, skipna=skipna, axis=axis, **kwargs
)

def var(self, *, skipna=True, axis: int | None = 0, ddof: int = 1, **kwargs):
    """
    Compute the variance of the masked array.

    Extra keyword arguments are validated with
    ``nv.validate_stat_ddof_func`` and then forwarded to
    ``_wrap_reduction_result`` together with the raw reduction result.

    Parameters
    ----------
    skipna : bool, default True
        Whether to skip missing values.
    axis : int or None, default 0
        Axis passed through to the masked reduction.
    ddof : int, default 1
        Delta degrees of freedom passed through to the masked reduction.
    **kwargs
        Additional arguments accepted for numpy compatibility.
    """
    nv.validate_stat_ddof_func((), kwargs, fname="var")

    # Delegate the actual computation to the mask-aware reduction helper.
    variance = masked_reductions.var(
        values=self._data,
        mask=self._mask,
        skipna=skipna,
        axis=axis,
        ddof=ddof,
    )

    return self._wrap_reduction_result(
        "var", variance, skipna=skipna, axis=axis, **kwargs
    )

def min(self, *, skipna=True, axis: int | None = 0, **kwargs):
nv.validate_min((), kwargs)
return masked_reductions.min(
Expand Down
12 changes: 11 additions & 1 deletion pandas/tests/reductions/test_reductions.py
Original file line number Diff line number Diff line change
Expand Up @@ -696,7 +696,7 @@ def test_empty_multi(self, method, unit):
expected = Series([1, np.nan], index=["a", "b"])
tm.assert_series_equal(result, expected)

@pytest.mark.parametrize("method", ["mean"])
@pytest.mark.parametrize("method", ["mean", "var"])
@pytest.mark.parametrize("dtype", ["Float64", "Int64", "boolean"])
def test_ops_consistency_on_empty_nullable(self, method, dtype):

Expand Down Expand Up @@ -787,6 +787,16 @@ def test_mean_masked_overflow(self):
assert result_masked - result_numpy == 0
assert result_masked == 1e17

@pytest.mark.parametrize("ddof, exp", [(1, 2.5), (0, 2.0)])
def test_var_masked_array(self, ddof, exp):
    """Variance of an ``Int64`` Series matches the ``int64`` result and ``exp``."""
    # GH#48379
    data = [1, 2, 3, 4, 5]
    masked_ser = Series(data, dtype="Int64")
    numpy_ser = Series(data, dtype="int64")

    masked_var = masked_ser.var(ddof=ddof)
    numpy_var = numpy_ser.var(ddof=ddof)

    # The nullable path must agree with the numpy-dtype path ...
    assert masked_var == numpy_var
    # ... and with the hand-computed expectation for this ddof.
    assert masked_var == exp

@pytest.mark.parametrize("dtype", ("m8[ns]", "m8[ns]", "M8[ns]", "M8[ns, UTC]"))
@pytest.mark.parametrize("skipna", [True, False])
def test_empty_timeseries_reductions_return_nat(self, dtype, skipna):
Expand Down