From 7ff6a4e94484f6f5c41c223e08e4bc53890c021d Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 3 Apr 2020 16:23:23 +0200 Subject: [PATCH 1/7] PERF: masked ops for reductions (min/max) --- pandas/core/array_algos/masked_reductions.py | 39 +++++++++++++ pandas/core/arrays/integer.py | 7 ++- pandas/tests/reductions/test_reductions.py | 61 ++++++++++++++------ 3 files changed, 87 insertions(+), 20 deletions(-) diff --git a/pandas/core/array_algos/masked_reductions.py b/pandas/core/array_algos/masked_reductions.py index 0fb2605b554c2..32a61dd7c39ad 100644 --- a/pandas/core/array_algos/masked_reductions.py +++ b/pandas/core/array_algos/masked_reductions.py @@ -45,3 +45,42 @@ def sum( return np.sum(values[~mask]) else: return np.sum(values, where=~mask) + + +def _minmax(func): + def reduction(values: np.ndarray, mask: np.ndarray, skipna: bool = True): + """ + Reduction for 1D masked array. + + Parameters + ---------- + values : np.ndarray + Numpy array with the values (can be of any dtype that support the + operation). + mask : np.ndarray + Boolean numpy array (True values indicate missing values). + skipna : bool, default True + Whether to skip NA. + """ + if not skipna: + if mask.any(): + return libmissing.NA + else: + if values.size: + return func(values) + else: + # min/max with empty array raise in numpy, pandas returns NA + return libmissing.NA + else: + subset = values[~mask] + if subset.size: + return func(values[~mask]) + else: + # min/max with empty array raise in numpy, pandas returns NA + return libmissing.NA + + return reduction + + +min = _minmax(np.min) +max = _minmax(np.max) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 4f3c68aa03b16..8966d7ef48844 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -561,8 +561,9 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs): data = self._data mask = self._mask - if name == "sum": - return masked_reductions.sum(data, mask, skipna=skipna, **kwargs) + if name in {"sum", "min", "max"}: + op = getattr(masked_reductions, name) + return op(data, mask, skipna=skipna, **kwargs) # coerce to a nan-aware float if needed # (we explicitly use NaN within reductions) @@ -581,7 +582,7 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs): # if we have a preservable numeric op, # provide coercion back to an integer type if possible - elif name in ["min", "max", "prod"]: + elif name in ["prod"]: # GH#31409 more performant than casting-then-checking result = com.cast_scalar_indexer(result) diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index 962b105d1e8fc..1c0eee2226a6d 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -65,27 +65,57 @@ def test_ops(self, opname, obj): assert result.value == expected @pytest.mark.parametrize("opname", ["max", "min"]) - def test_nanops(self, opname, index_or_series): + @pytest.mark.parametrize( + "dtype, val", + [ + ("object", 2.0), + ("float64", 2.0), + ("Int64", 2), + ("datetime64[ns]", datetime(2011, 11, 1)), + ], + ) + def test_nanminmax(self, opname, dtype, val, index_or_series): # GH#7261 klass = index_or_series - arg_op = "arg" + opname if klass is Index else "idx" + opname - obj = klass([np.nan, 2.0]) - assert getattr(obj, opname)() == 2.0 + if dtype == "Int64" and klass == pd.Index: + pytest.skip("EAs can't yet be stored in an index") - obj = klass([np.nan]) - assert pd.isna(getattr(obj, opname)()) - assert pd.isna(getattr(obj, opname)(skipna=False)) + def check_missing(res): + if dtype == "datetime64[ns]": + return res is pd.NaT + elif dtype == "Int64": + return res is pd.NA + else: + return pd.isna(res) - obj = klass([], dtype=object) - assert pd.isna(getattr(obj, opname)()) - assert pd.isna(getattr(obj, opname)(skipna=False)) + obj = klass([None], dtype=dtype) + assert check_missing(getattr(obj, opname)()) + assert check_missing(getattr(obj, opname)(skipna=False)) - obj = klass([pd.NaT, datetime(2011, 11, 1)]) - # check DatetimeIndex monotonic path - assert getattr(obj, opname)() == datetime(2011, 11, 1) - assert getattr(obj, opname)(skipna=False) is pd.NaT + obj = klass([], dtype=dtype) + assert check_missing(getattr(obj, opname)()) + assert check_missing(getattr(obj, opname)(skipna=False)) + + if dtype == "object": + # generic test with object only works for empty / all NaN + return + + obj = klass([None, val], dtype=dtype) + assert getattr(obj, opname)() == val + assert check_missing(getattr(obj, opname)(skipna=False)) + obj = klass([None, val, None], dtype=dtype) + assert getattr(obj, opname)() == val + assert check_missing(getattr(obj, opname)(skipna=False)) + + @pytest.mark.parametrize("opname", ["max", "min"]) + def test_nanargminmax(self, opname, index_or_series): + # GH#7261 + klass = index_or_series + arg_op = "arg" + opname if klass is Index else "idx" + opname + + obj = klass([pd.NaT, datetime(2011, 11, 1)]) assert getattr(obj, arg_op)() == 1 result = getattr(obj, arg_op)(skipna=False) if klass is Series: @@ -95,9 +125,6 @@ def test_nanops(self, opname, index_or_series): obj = klass([pd.NaT, datetime(2011, 11, 1), pd.NaT]) # check DatetimeIndex non-monotonic path - assert getattr(obj, opname)(), datetime(2011, 11, 1) - assert getattr(obj, opname)(skipna=False) is pd.NaT - assert getattr(obj, arg_op)() == 1 result = getattr(obj, arg_op)(skipna=False) if klass is Series: From d93e811d25edee51cf703f05d5b079e31250f41f Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 3 Apr 2020 17:27:04 +0200 Subject: [PATCH 2/7] fix test --- pandas/tests/arrays/integer/test_dtypes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/arrays/integer/test_dtypes.py b/pandas/tests/arrays/integer/test_dtypes.py index ee1ec86745246..515013e95c717 100644 --- a/pandas/tests/arrays/integer/test_dtypes.py +++ b/pandas/tests/arrays/integer/test_dtypes.py @@ -34,7 +34,7 @@ def test_preserve_dtypes(op): # op result = getattr(df.C, op)() - if op == "sum": + if op in {"sum", "min", "max"}: assert isinstance(result, np.int64) else: assert isinstance(result, int) From 899913d253a0f9f3f1e98fe1111b732f8b7eb802 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 3 Apr 2020 17:28:00 +0200 Subject: [PATCH 3/7] add whatsnew --- doc/source/whatsnew/v1.1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 19e8acdaa7384..d232b68e3c450 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -274,7 +274,7 @@ Performance improvements sparse values from ``scipy.sparse`` matrices using the :meth:`DataFrame.sparse.from_spmatrix` constructor (:issue:`32821`, :issue:`32825`, :issue:`32826`, :issue:`32856`, :issue:`32858`). -- Performance improvement in :meth:`Series.sum` for nullable (integer and boolean) dtypes (:issue:`30982`). +- Performance improvement in reductions (sum, min, max) for nullable (integer and boolean) dtypes (:issue:`30982`, :issue:`33261`). .. --------------------------------------------------------------------------- From 9d496ebe004565b7c208a4e2fa1c8dd65e9f5553 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 3 Apr 2020 21:29:32 +0200 Subject: [PATCH 4/7] also add for boolean --- pandas/core/arrays/boolean.py | 8 +++----- pandas/tests/reductions/test_reductions.py | 5 +++-- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 442d4ca8cef6d..e85534def6b97 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -696,8 +696,9 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs): data = self._data mask = self._mask - if name == "sum": - return masked_reductions.sum(data, mask, skipna=skipna, **kwargs) + if name in {"sum", "min", "max"}: + op = getattr(masked_reductions, name) + return op(data, mask, skipna=skipna, **kwargs) # coerce to a nan-aware float if needed if self._hasna: @@ -715,9 +716,6 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs): if int_result == result: result = int_result - elif name in ["min", "max"] and notna(result): - result = np.bool_(result) - return result def _maybe_mask_result(self, result, mask, other, op_name: str): diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index 1c0eee2226a6d..8fb035e085d40 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -70,15 +70,16 @@ def test_ops(self, opname, obj): [ ("object", 2.0), ("float64", 2.0), - ("Int64", 2), ("datetime64[ns]", datetime(2011, 11, 1)), + ("Int64", 2), + ("boolean", True), ], ) def test_nanminmax(self, opname, dtype, val, index_or_series): # GH#7261 klass = index_or_series - if dtype == "Int64" and klass == pd.Index: + if dtype in ["Int64", "boolean"] and klass == pd.Index: pytest.skip("EAs can't yet be stored in an index") def check_missing(res): From cd1e2913d58428f8316627c286bf85df3623364a Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 3 Apr 2020 21:29:56 +0200 Subject: [PATCH 5/7] Update pandas/core/arrays/integer.py Co-Authored-By: MomIsBestFriend <50263213+MomIsBestFriend@users.noreply.github.com> --- pandas/core/arrays/integer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 8966d7ef48844..541ecdeb45830 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -582,7 +582,7 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs): # if we have a preservable numeric op, # provide coercion back to an integer type if possible - elif name in ["prod"]: + elif name == "prod": # GH#31409 more performant than casting-then-checking result = com.cast_scalar_indexer(result) From cb992a22982d2dcdff1b33b283030d638338c764 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 3 Apr 2020 21:36:30 +0200 Subject: [PATCH 6/7] different nesting to be mypy friendlier --- pandas/core/array_algos/masked_reductions.py | 59 ++++++++++---------- 1 file changed, 30 insertions(+), 29 deletions(-) diff --git a/pandas/core/array_algos/masked_reductions.py b/pandas/core/array_algos/masked_reductions.py index 32a61dd7c39ad..58325aac9ab80 100644 --- a/pandas/core/array_algos/masked_reductions.py +++ b/pandas/core/array_algos/masked_reductions.py @@ -47,40 +47,41 @@ def sum( return np.sum(values, where=~mask) -def _minmax(func): - def reduction(values: np.ndarray, mask: np.ndarray, skipna: bool = True): - """ - Reduction for 1D masked array. +def _minmax(func, values: np.ndarray, mask: np.ndarray, skipna: bool = True): + """ + Reduction for 1D masked array. - Parameters - ---------- - values : np.ndarray - Numpy array with the values (can be of any dtype that support the - operation). - mask : np.ndarray - Boolean numpy array (True values indicate missing values). - skipna : bool, default True - Whether to skip NA. - """ - if not skipna: - if mask.any(): - return libmissing.NA - else: - if values.size: - return func(values) - else: - # min/max with empty array raise in numpy, pandas returns NA - return libmissing.NA + Parameters + ---------- + values : np.ndarray + Numpy array with the values (can be of any dtype that support the + operation). + mask : np.ndarray + Boolean numpy array (True values indicate missing values). + skipna : bool, default True + Whether to skip NA. + """ + if not skipna: + if mask.any(): + return libmissing.NA else: - subset = values[~mask] - if subset.size: - return func(values[~mask]) + if values.size: + return func(values) else: # min/max with empty array raise in numpy, pandas returns NA return libmissing.NA + else: + subset = values[~mask] + if subset.size: + return func(values[~mask]) + else: + # min/max with empty array raise in numpy, pandas returns NA + return libmissing.NA + - return reduction +def min(values: np.ndarray, mask: np.ndarray, skipna: bool = True): + return _minmax(np.min, values=values, mask=mask, skipna=skipna) -min = _minmax(np.min) -max = _minmax(np.max) +def max(values: np.ndarray, mask: np.ndarray, skipna: bool = True): + return _minmax(np.max, values=values, mask=mask, skipna=skipna) From eb00ad00f69d6848b38070c3edc62e86198caed5 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 3 Apr 2020 21:37:58 +0200 Subject: [PATCH 7/7] update docstring --- pandas/core/array_algos/masked_reductions.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/array_algos/masked_reductions.py b/pandas/core/array_algos/masked_reductions.py index 58325aac9ab80..b3723340cefd6 100644 --- a/pandas/core/array_algos/masked_reductions.py +++ b/pandas/core/array_algos/masked_reductions.py @@ -53,6 +53,7 @@ def _minmax(func, values: np.ndarray, mask: np.ndarray, skipna: bool = True): Parameters ---------- + func : np.min or np.max values : np.ndarray Numpy array with the values (can be of any dtype that support the operation).