From 6f4c382e17e14751005aefd6aeb44e97e5e5a4a6 Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Sat, 4 Sep 2021 18:29:53 -0700
Subject: [PATCH] BUG: Groupby min/max with nullable dtypes (#42567)

---
 doc/source/whatsnew/v1.4.0.rst       |  1 +
 pandas/_libs/groupby.pyi             |  4 +++
 pandas/_libs/groupby.pyx             | 36 ++++++++++++++++++---
 pandas/core/arrays/masked.py         |  2 ++
 pandas/core/groupby/ops.py           | 22 +++++++++++--
 pandas/tests/groupby/test_min_max.py | 48 ++++++++++++++++++++++++++++
 6 files changed, 105 insertions(+), 8 deletions(-)

diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
index beabcf8f32d29..70ebd7d02d8c0 100644
--- a/doc/source/whatsnew/v1.4.0.rst
+++ b/doc/source/whatsnew/v1.4.0.rst
@@ -394,6 +394,7 @@ Groupby/resample/rolling
 ^^^^^^^^^^^^^^^^^^^^^^^^
 - Fixed bug in :meth:`SeriesGroupBy.apply` where passing an unrecognized string argument failed to raise ``TypeError`` when the underlying ``Series`` is empty (:issue:`42021`)
 - Bug in :meth:`Series.rolling.apply`, :meth:`DataFrame.rolling.apply`, :meth:`Series.expanding.apply` and :meth:`DataFrame.expanding.apply` with ``engine="numba"`` where ``*args`` were being cached with the user passed function (:issue:`42287`)
+- Bug in :meth:`GroupBy.max` and :meth:`GroupBy.min` with nullable integer dtypes losing precision (:issue:`41743`)
 - Bug in :meth:`DataFrame.groupby.rolling.var` would calculate the rolling variance only on the first group (:issue:`42442`)
 - Bug in :meth:`GroupBy.shift` that would return the grouping columns if ``fill_value`` was not None (:issue:`41556`)
 - Bug in :meth:`SeriesGroupBy.nlargest` and :meth:`SeriesGroupBy.nsmallest` would have an inconsistent index when the input Series was sorted and ``n`` was greater than or equal to all group sizes (:issue:`15272`, :issue:`16345`, :issue:`29129`)
diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi
index 7b1dcbe562123..b363524e4e592 100644
--- a/pandas/_libs/groupby.pyi
+++ b/pandas/_libs/groupby.pyi
@@ -123,6 +123,8 @@ def group_max(
     values: np.ndarray,  # ndarray[groupby_t, ndim=2]
     labels: np.ndarray,  # const int64_t[:]
     min_count: int = ...,
+    mask: np.ndarray | None = ...,
+    result_mask: np.ndarray | None = ...,
 ) -> None: ...
 def group_min(
     out: np.ndarray,  # groupby_t[:, ::1]
@@ -130,6 +132,8 @@
     values: np.ndarray,  # ndarray[groupby_t, ndim=2]
     labels: np.ndarray,  # const int64_t[:]
     min_count: int = ...,
+    mask: np.ndarray | None = ...,
+    result_mask: np.ndarray | None = ...,
 ) -> None: ...
 def group_cummin(
     out: np.ndarray,  # groupby_t[:, ::1]
diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index 7123d0d543090..40e1049c39588 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -1182,7 +1182,9 @@ cdef group_min_max(groupby_t[:, ::1] out,
                    const intp_t[::1] labels,
                    Py_ssize_t min_count=-1,
                    bint is_datetimelike=False,
-                   bint compute_max=True):
+                   bint compute_max=True,
+                   const uint8_t[:, ::1] mask=None,
+                   uint8_t[:, ::1] result_mask=None):
     """
     Compute minimum/maximum of columns of `values`, in row groups `labels`.
 
@@ -1203,6 +1205,12 @@ cdef group_min_max(groupby_t[:, ::1] out,
         True if `values` contains datetime-like entries.
     compute_max : bint, default True
         True to compute group-wise max, False to compute min
+    mask : ndarray[bool, ndim=2], optional
+        If not None, True entries mark the corresponding values as missing;
+        otherwise the mask is not used.
+    result_mask : ndarray[bool, ndim=2], optional
+        If not None, these specify locations in the output that are NA.
+        Modified in-place.
 
     Notes
     -----
@@ -1215,6 +1223,8 @@ cdef group_min_max(groupby_t[:, ::1] out,
         ndarray[groupby_t, ndim=2] group_min_or_max
         bint runtime_error = False
         int64_t[:, ::1] nobs
+        bint uses_mask = mask is not None
+        bint isna_entry
 
     # TODO(cython 3.0):
     # Instead of `labels.shape[0]` use `len(labels)`
@@ -1249,7 +1259,12 @@ cdef group_min_max(groupby_t[:, ::1] out,
             for j in range(K):
                 val = values[i, j]
 
-                if not _treat_as_na(val, is_datetimelike):
+                if uses_mask:
+                    isna_entry = mask[i, j]
+                else:
+                    isna_entry = _treat_as_na(val, is_datetimelike)
+
+                if not isna_entry:
                     nobs[lab, j] += 1
                     if compute_max:
                         if val > group_min_or_max[lab, j]:
@@ -1265,7 +1280,10 @@ cdef group_min_max(groupby_t[:, ::1] out,
                         runtime_error = True
                         break
                     else:
-                        out[i, j] = nan_val
+                        if uses_mask:
+                            result_mask[i, j] = True
+                        else:
+                            out[i, j] = nan_val
                 else:
                     out[i, j] = group_min_or_max[i, j]
 
@@ -1282,7 +1300,9 @@ def group_max(groupby_t[:, ::1] out,
               ndarray[groupby_t, ndim=2] values,
               const intp_t[::1] labels,
               Py_ssize_t min_count=-1,
-              bint is_datetimelike=False) -> None:
+              bint is_datetimelike=False,
+              const uint8_t[:, ::1] mask=None,
+              uint8_t[:, ::1] result_mask=None) -> None:
     """See group_min_max.__doc__"""
     group_min_max(
         out,
@@ -1292,6 +1312,8 @@ def group_max(groupby_t[:, ::1] out,
         min_count=min_count,
         is_datetimelike=is_datetimelike,
         compute_max=True,
+        mask=mask,
+        result_mask=result_mask,
     )
 
 
@@ -1302,7 +1324,9 @@ def group_min(groupby_t[:, ::1] out,
               ndarray[groupby_t, ndim=2] values,
               const intp_t[::1] labels,
               Py_ssize_t min_count=-1,
-              bint is_datetimelike=False) -> None:
+              bint is_datetimelike=False,
+              const uint8_t[:, ::1] mask=None,
+              uint8_t[:, ::1] result_mask=None) -> None:
     """See group_min_max.__doc__"""
     group_min_max(
         out,
@@ -1312,6 +1336,8 @@ def group_min(groupby_t[:, ::1] out,
         min_count=min_count,
         is_datetimelike=is_datetimelike,
         compute_max=False,
+        mask=mask,
+        result_mask=result_mask,
     )
 
 
diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py
index 1d78a74db98f0..718e135464385 100644
--- a/pandas/core/arrays/masked.py
+++ b/pandas/core/arrays/masked.py
@@ -123,6 +123,8 @@ def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False):
             raise ValueError("values must be a 1D array")
         if mask.ndim != 1:
             raise ValueError("mask must be a 1D array")
+        if values.shape != mask.shape:
+            raise ValueError("values and mask must have same shape")
 
         if copy:
             values = values.copy()
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index 37488a8e4993e..e35f5331195fa 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -138,7 +138,7 @@ def __init__(self, kind: str, how: str):
         },
     }
 
-    _MASKED_CYTHON_FUNCTIONS = {"cummin", "cummax"}
+    _MASKED_CYTHON_FUNCTIONS = {"cummin", "cummax", "min", "max"}
 
     _cython_arity = {"ohlc": 4}  # OHLC
 
@@ -404,6 +404,7 @@ def _masked_ea_wrap_cython_operation(
 
         # Copy to ensure input and result masks don't end up shared
        mask = values._mask.copy()
+        result_mask = np.zeros(ngroups, dtype=bool)
         arr = values._data
 
         res_values = self._cython_op_ndim_compat(
@@ -412,13 +413,18 @@ def _masked_ea_wrap_cython_operation(
             ngroups=ngroups,
             comp_ids=comp_ids,
             mask=mask,
+            result_mask=result_mask,
             **kwargs,
         )
+
         dtype = self._get_result_dtype(orig_values.dtype)
         assert isinstance(dtype, BaseMaskedDtype)
         cls = dtype.construct_array_type()
 
-        return cls(res_values.astype(dtype.type, copy=False), mask)
+        if self.kind != "aggregate":
+            return cls(res_values.astype(dtype.type, copy=False), mask)
+        else:
+            return cls(res_values.astype(dtype.type, copy=False), result_mask)
 
     @final
     def _cython_op_ndim_compat(
@@ -428,7 +434,8 @@ def _cython_op_ndim_compat(
         self,
         values: np.ndarray,
         min_count: int,
         ngroups: int,
         comp_ids: np.ndarray,
-        mask: np.ndarray | None,
+        mask: np.ndarray | None = None,
+        result_mask: np.ndarray | None = None,
         **kwargs,
     ) -> np.ndarray:
         if values.ndim == 1:
             values2d = values[None, :]
             if mask is not None:
                 mask = mask[None, :]
+            if result_mask is not None:
+                result_mask = result_mask[None, :]
             res = self._call_cython_op(
                 values2d,
                 min_count=min_count,
                 ngroups=ngroups,
                 comp_ids=comp_ids,
                 mask=mask,
+                result_mask=result_mask,
                 **kwargs,
             )
             if res.shape[0] == 1:
@@ -456,6 +466,7 @@
             ngroups=ngroups,
             comp_ids=comp_ids,
             mask=mask,
+            result_mask=result_mask,
             **kwargs,
         )
 
@@ -468,6 +479,7 @@ def _call_cython_op(
         ngroups: int,
         comp_ids: np.ndarray,
         mask: np.ndarray | None,
+        result_mask: np.ndarray | None,
         **kwargs,
     ) -> np.ndarray:  # np.ndarray[ndim=2]
         orig_values = values
@@ -493,6 +505,8 @@ def _call_cython_op(
             values = values.T
             if mask is not None:
                 mask = mask.T
+            if result_mask is not None:
+                result_mask = result_mask.T
 
         out_shape = self._get_output_shape(ngroups, values)
         func, values = self.get_cython_func_and_vals(values, is_numeric)
@@ -508,6 +522,8 @@ def _call_cython_op(
                     values,
                     comp_ids,
                     min_count,
+                    mask=mask,
+                    result_mask=result_mask,
                     is_datetimelike=is_datetimelike,
                 )
             else:
diff --git a/pandas/tests/groupby/test_min_max.py b/pandas/tests/groupby/test_min_max.py
index 41e6959b00dc3..767ef2915d66e 100644
--- a/pandas/tests/groupby/test_min_max.py
+++ b/pandas/tests/groupby/test_min_max.py
@@ -177,3 +177,51 @@ def test_aggregate_categorical_lost_index(func: str):
     expected["B"] = expected["B"].astype(ds.dtype)
 
     tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("dtype", ["Int64", "Int32", "Float64", "Float32", "boolean"])
+def test_groupby_min_max_nullable(dtype):
+    if dtype == "Int64":
+        # GH#41743 avoid precision loss
+        ts = 1618556707013635762
+    elif dtype == "boolean":
+        ts = 0
+    else:
+        ts = 4.0
+
+    df = DataFrame({"id": [2, 2], "ts": [ts, ts + 1]})
+    df["ts"] = df["ts"].astype(dtype)
+
+    gb = df.groupby("id")
+
+    result = gb.min()
+    expected = df.iloc[:1].set_index("id")
+    tm.assert_frame_equal(result, expected)
+
+    res_max = gb.max()
+    expected_max = df.iloc[1:].set_index("id")
+    tm.assert_frame_equal(res_max, expected_max)
+
+    result2 = gb.min(min_count=3)
+    expected2 = DataFrame({"ts": [pd.NA]}, index=expected.index, dtype=dtype)
+    tm.assert_frame_equal(result2, expected2)
+
+    res_max2 = gb.max(min_count=3)
+    tm.assert_frame_equal(res_max2, expected2)
+
+    # Case with NA values
+    df2 = DataFrame({"id": [2, 2, 2], "ts": [ts, pd.NA, ts + 1]})
+    df2["ts"] = df2["ts"].astype(dtype)
+    gb2 = df2.groupby("id")
+
+    result3 = gb2.min()
+    tm.assert_frame_equal(result3, expected)
+
+    res_max3 = gb2.max()
+    tm.assert_frame_equal(res_max3, expected_max)
+
+    result4 = gb2.min(min_count=100)
+    tm.assert_frame_equal(result4, expected2)
+
+    res_max4 = gb2.max(min_count=100)
+    tm.assert_frame_equal(res_max4, expected2)
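
For context, a minimal sketch of the user-visible behavior this patch targets, using the same timestamp as the new test (GH#41743). It assumes a pandas build that includes this change; without it, masked nullable values were routed through float64, which cannot represent every integer above 2**53, so groupby min/max on large Int64 values came back rounded.

    import pandas as pd

    ts = 1618556707013635762  # > 2**53, so a round-trip through float64 alters it

    df = pd.DataFrame({"id": [2, 2], "ts": [ts, ts + 1]})
    df["ts"] = df["ts"].astype("Int64")  # nullable integer dtype, backed by a mask

    gb = df.groupby("id")
    print(gb.min())  # exactly ts with this patch; a rounded value without it
    print(gb.max())  # exactly ts + 1

    # min_count larger than the group size: the new result_mask marks the
    # output row as missing, so the result is <NA> rather than a sentinel
    print(gb.min(min_count=3))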