diff --git a/doc/source/whatsnew/v1.6.0.rst b/doc/source/whatsnew/v1.6.0.rst index aff950c6933dd..9260ac0e63771 100644 --- a/doc/source/whatsnew/v1.6.0.rst +++ b/doc/source/whatsnew/v1.6.0.rst @@ -108,6 +108,7 @@ Performance improvements - Performance improvement for :meth:`Series.value_counts` with nullable dtype (:issue:`48338`) - Performance improvement for :class:`Series` constructor passing integer numpy array with nullable dtype (:issue:`48338`) - Performance improvement for :meth:`MultiIndex.unique` (:issue:`48335`) +- Performance improvement in :meth:`Series.var` for nullable dtypes (:issue:`48379`) - Performance improvement to :func:`read_sas` with ``blank_missing=True`` (:issue:`48502`) - diff --git a/pandas/core/array_algos/masked_reductions.py b/pandas/core/array_algos/masked_reductions.py index 041905c993b0d..979d3ddac63c2 100644 --- a/pandas/core/array_algos/masked_reductions.py +++ b/pandas/core/array_algos/masked_reductions.py @@ -22,6 +22,7 @@ def _reductions( skipna: bool = True, min_count: int = 0, axis: int | None = None, + **kwargs, ): """ Sum, mean or product for 1D masked array. 
@@ -45,14 +46,14 @@ def _reductions( if mask.any(axis=axis) or check_below_min_count(values.shape, None, min_count): return libmissing.NA else: - return func(values, axis=axis) + return func(values, axis=axis, **kwargs) else: if check_below_min_count(values.shape, mask, min_count) and ( axis is None or values.ndim == 1 ): return libmissing.NA - return func(values, where=~mask, axis=axis) + return func(values, where=~mask, axis=axis, **kwargs) def sum( @@ -149,3 +150,19 @@ def mean( if not values.size or mask.all(): return libmissing.NA return _reductions(np.mean, values=values, mask=mask, skipna=skipna, axis=axis) + + +def var( + values: np.ndarray, + mask: npt.NDArray[np.bool_], + *, + skipna: bool = True, + axis: int | None = None, + ddof: int = 1, +): + if not values.size or mask.all(): + return libmissing.NA + + return _reductions( + np.var, values=values, mask=mask, skipna=skipna, axis=axis, ddof=ddof + ) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 7fe2c8fdd62f0..d67a5e215886b 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -1028,7 +1028,7 @@ def _quantile( # Reductions def _reduce(self, name: str, *, skipna: bool = True, **kwargs): - if name in {"any", "all", "min", "max", "sum", "prod", "mean"}: + if name in {"any", "all", "min", "max", "sum", "prod", "mean", "var"}: return getattr(self, name)(skipna=skipna, **kwargs) data = self._data @@ -1106,6 +1106,19 @@ def mean(self, *, skipna=True, axis: int | None = 0, **kwargs): "mean", result, skipna=skipna, axis=axis, **kwargs ) + def var(self, *, skipna=True, axis: int | None = 0, ddof: int = 1, **kwargs): + nv.validate_stat_ddof_func((), kwargs, fname="var") + result = masked_reductions.var( + self._data, + self._mask, + skipna=skipna, + axis=axis, + ddof=ddof, + ) + return self._wrap_reduction_result( + "var", result, skipna=skipna, axis=axis, **kwargs + ) + def min(self, *, skipna=True, axis: int | None = 0, **kwargs): nv.validate_min((), 
kwargs) return masked_reductions.min( diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index ef94a18016719..66f263b84de4d 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -696,7 +696,7 @@ def test_empty_multi(self, method, unit): expected = Series([1, np.nan], index=["a", "b"]) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize("method", ["mean"]) + @pytest.mark.parametrize("method", ["mean", "var"]) @pytest.mark.parametrize("dtype", ["Float64", "Int64", "boolean"]) def test_ops_consistency_on_empty_nullable(self, method, dtype): @@ -787,6 +787,16 @@ def test_mean_masked_overflow(self): assert result_masked - result_numpy == 0 assert result_masked == 1e17 + @pytest.mark.parametrize("ddof, exp", [(1, 2.5), (0, 2.0)]) + def test_var_masked_array(self, ddof, exp): + # GH#48379 + ser = Series([1, 2, 3, 4, 5], dtype="Int64") + ser_numpy_dtype = Series([1, 2, 3, 4, 5], dtype="int64") + result = ser.var(ddof=ddof) + result_numpy_dtype = ser_numpy_dtype.var(ddof=ddof) + assert result == result_numpy_dtype + assert result == exp + @pytest.mark.parametrize("dtype", ("m8[ns]", "m8[ns]", "M8[ns]", "M8[ns, UTC]")) @pytest.mark.parametrize("skipna", [True, False]) def test_empty_timeseries_reductions_return_nat(self, dtype, skipna):