BUG: Groupby min/max with nullable dtypes (#42567)
jbrockmendel authored Sep 5, 2021
1 parent b1f7e58 commit 6f4c382
Showing 6 changed files with 105 additions and 8 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.4.0.rst
@@ -394,6 +394,7 @@ Groupby/resample/rolling
^^^^^^^^^^^^^^^^^^^^^^^^
- Fixed bug in :meth:`SeriesGroupBy.apply` where passing an unrecognized string argument failed to raise ``TypeError`` when the underlying ``Series`` is empty (:issue:`42021`)
- Bug in :meth:`Series.rolling.apply`, :meth:`DataFrame.rolling.apply`, :meth:`Series.expanding.apply` and :meth:`DataFrame.expanding.apply` with ``engine="numba"`` where ``*args`` were being cached with the user-passed function (:issue:`42287`)
- Bug in :meth:`GroupBy.max` and :meth:`GroupBy.min` with nullable integer dtypes losing precision (:issue:`41743`)
- Bug in :meth:`DataFrame.groupby.rolling.var` would calculate the rolling variance only on the first group (:issue:`42442`)
- Bug in :meth:`GroupBy.shift` that would return the grouping columns if ``fill_value`` was not None (:issue:`41556`)
- Bug in :meth:`SeriesGroupBy.nlargest` and :meth:`SeriesGroupBy.nsmallest` would have an inconsistent index when the input Series was sorted and ``n`` was greater than or equal to all group sizes (:issue:`15272`, :issue:`16345`, :issue:`29129`)
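
A minimal reproducer for the nullable-dtype entry above (GH#41743), assuming a pandas build that includes this commit. Before the fix, nullable integer values could take a float64 detour inside groupby min/max, rounding 64-bit integers above 2**53:

import pandas as pd

df = pd.DataFrame({"id": [2, 2], "ts": [1618556707013635762, 1618556707013635763]})
df["ts"] = df["ts"].astype("Int64")
print(df.groupby("id")["ts"].min().iloc[0])  # 1618556707013635762, bit-exact
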
4 changes: 4 additions & 0 deletions pandas/_libs/groupby.pyi
@@ -123,13 +123,17 @@ def group_max(
values: np.ndarray, # ndarray[groupby_t, ndim=2]
labels: np.ndarray, # const int64_t[:]
min_count: int = ...,
mask: np.ndarray | None = ...,
result_mask: np.ndarray | None = ...,
) -> None: ...
def group_min(
out: np.ndarray, # groupby_t[:, ::1]
counts: np.ndarray, # int64_t[::1]
values: np.ndarray, # ndarray[groupby_t, ndim=2]
labels: np.ndarray, # const int64_t[:]
min_count: int = ...,
mask: np.ndarray | None = ...,
result_mask: np.ndarray | None = ...,
) -> None: ...
def group_cummin(
out: np.ndarray, # groupby_t[:, ::1]
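
For orientation, a pure-NumPy sketch of the mask/result_mask contract these stubs describe (not the Cython kernel itself; the helper name is illustrative): mask flags NA cells of values, while result_mask is written in place for output cells whose group never saw a valid entry.

import numpy as np

def group_max_sketch(values, labels, mask, ngroups):
    # values: (N, K) integer data; labels: (N,) group ids; mask: (N, K), True marks NA
    N, K = values.shape
    out = np.full((ngroups, K), np.iinfo(values.dtype).min, dtype=values.dtype)
    result_mask = np.ones((ngroups, K), dtype=bool)  # True until a valid value arrives
    for i in range(N):
        for j in range(K):
            if not mask[i, j]:
                out[labels[i], j] = max(out[labels[i], j], values[i, j])
                result_mask[labels[i], j] = False
    return out, result_mask

vals = np.array([[1], [3], [2]], dtype=np.int64)
out, rmask = group_max_sketch(vals, np.array([0, 0, 1]), np.zeros((3, 1), dtype=bool), 2)
# out -> [[3], [2]]; rmask stays all False because both groups have valid entries
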
36 changes: 31 additions & 5 deletions pandas/_libs/groupby.pyx
@@ -1182,7 +1182,9 @@ cdef group_min_max(groupby_t[:, ::1] out,
const intp_t[::1] labels,
Py_ssize_t min_count=-1,
bint is_datetimelike=False,
bint compute_max=True):
bint compute_max=True,
const uint8_t[:, ::1] mask=None,
uint8_t[:, ::1] result_mask=None):
"""
Compute minimum/maximum of columns of `values`, in row groups `labels`.
@@ -1203,6 +1205,12 @@
True if `values` contains datetime-like entries.
compute_max : bint, default True
True to compute group-wise max, False to compute min
mask : ndarray[bool, ndim=2], optional
If not None, True entries mark missing values;
otherwise the mask is not used.
result_mask : ndarray[bool, ndim=2], optional
If not None, these specify locations in the output that are NA.
Modified in-place.
Notes
-----
@@ -1215,6 +1223,8 @@ cdef group_min_max(groupby_t[:, ::1] out,
ndarray[groupby_t, ndim=2] group_min_or_max
bint runtime_error = False
int64_t[:, ::1] nobs
bint uses_mask = mask is not None
bint isna_entry

# TODO(cython 3.0):
# Instead of `labels.shape[0]` use `len(labels)`
@@ -1249,7 +1259,12 @@ cdef group_min_max(groupby_t[:, ::1] out,
for j in range(K):
val = values[i, j]

if not _treat_as_na(val, is_datetimelike):
if uses_mask:
isna_entry = mask[i, j]
else:
isna_entry = _treat_as_na(val, is_datetimelike)

if not isna_entry:
nobs[lab, j] += 1
if compute_max:
if val > group_min_or_max[lab, j]:
@@ -1265,7 +1280,10 @@ cdef group_min_max(groupby_t[:, ::1] out,
runtime_error = True
break
else:
out[i, j] = nan_val
if uses_mask:
result_mask[i, j] = True
else:
out[i, j] = nan_val
else:
out[i, j] = group_min_or_max[i, j]

@@ -1282,7 +1300,9 @@ def group_max(groupby_t[:, ::1] out,
ndarray[groupby_t, ndim=2] values,
const intp_t[::1] labels,
Py_ssize_t min_count=-1,
bint is_datetimelike=False) -> None:
bint is_datetimelike=False,
const uint8_t[:, ::1] mask=None,
uint8_t[:, ::1] result_mask=None) -> None:
"""See group_min_max.__doc__"""
group_min_max(
out,
@@ -1292,6 +1312,8 @@ def group_max(groupby_t[:, ::1] out,
min_count=min_count,
is_datetimelike=is_datetimelike,
compute_max=True,
mask=mask,
result_mask=result_mask,
)


@@ -1302,7 +1324,9 @@ def group_min(groupby_t[:, ::1] out,
ndarray[groupby_t, ndim=2] values,
const intp_t[::1] labels,
Py_ssize_t min_count=-1,
bint is_datetimelike=False) -> None:
bint is_datetimelike=False,
const uint8_t[:, ::1] mask=None,
uint8_t[:, ::1] result_mask=None) -> None:
"""See group_min_max.__doc__"""
group_min_max(
out,
@@ -1312,6 +1336,8 @@ def group_min(groupby_t[:, ::1] out,
min_count=min_count,
is_datetimelike=is_datetimelike,
compute_max=False,
mask=mask,
result_mask=result_mask,
)


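
At the pandas level, the result_mask branch above means an all-NA group now comes back as <NA> in the original nullable dtype rather than through a NaN sentinel. A small illustration, assuming this commit:

import pandas as pd

df = pd.DataFrame({"id": [1, 1, 2], "x": pd.array([5, pd.NA, pd.NA], dtype="Int64")})
print(df.groupby("id")["x"].max())
# id 1 -> 5, id 2 -> <NA>; the computation stays on the integer data plus mask
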
2 changes: 2 additions & 0 deletions pandas/core/arrays/masked.py
@@ -123,6 +123,8 @@ def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False):
raise ValueError("values must be a 1D array")
if mask.ndim != 1:
raise ValueError("mask must be a 1D array")
if values.shape != mask.shape:
raise ValueError("values and mask must have same shape")

if copy:
values = values.copy()
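
The new shape check fails fast at construction time. A quick illustration through the public IntegerArray constructor, with a deliberately mismatched mask:

import numpy as np
from pandas.arrays import IntegerArray

values = np.array([1, 2, 3], dtype=np.int64)
mask = np.zeros(2, dtype=bool)  # wrong length on purpose
try:
    IntegerArray(values, mask)
except ValueError as err:
    print(err)  # values and mask must have same shape
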
22 changes: 19 additions & 3 deletions pandas/core/groupby/ops.py
@@ -138,7 +138,7 @@ def __init__(self, kind: str, how: str):
},
}

_MASKED_CYTHON_FUNCTIONS = {"cummin", "cummax"}
_MASKED_CYTHON_FUNCTIONS = {"cummin", "cummax", "min", "max"}

_cython_arity = {"ohlc": 4} # OHLC

@@ -404,6 +404,7 @@ def _masked_ea_wrap_cython_operation(

# Copy to ensure input and result masks don't end up shared
mask = values._mask.copy()
result_mask = np.zeros(ngroups, dtype=bool)
arr = values._data

res_values = self._cython_op_ndim_compat(
@@ -412,13 +413,18 @@
ngroups=ngroups,
comp_ids=comp_ids,
mask=mask,
result_mask=result_mask,
**kwargs,
)

dtype = self._get_result_dtype(orig_values.dtype)
assert isinstance(dtype, BaseMaskedDtype)
cls = dtype.construct_array_type()

return cls(res_values.astype(dtype.type, copy=False), mask)
if self.kind != "aggregate":
return cls(res_values.astype(dtype.type, copy=False), mask)
else:
return cls(res_values.astype(dtype.type, copy=False), result_mask)

@final
def _cython_op_ndim_compat(
@@ -428,20 +434,24 @@
min_count: int,
ngroups: int,
comp_ids: np.ndarray,
mask: np.ndarray | None,
mask: np.ndarray | None = None,
result_mask: np.ndarray | None = None,
**kwargs,
) -> np.ndarray:
if values.ndim == 1:
# expand to 2d, dispatch, then squeeze if appropriate
values2d = values[None, :]
if mask is not None:
mask = mask[None, :]
if result_mask is not None:
result_mask = result_mask[None, :]
res = self._call_cython_op(
values2d,
min_count=min_count,
ngroups=ngroups,
comp_ids=comp_ids,
mask=mask,
result_mask=result_mask,
**kwargs,
)
if res.shape[0] == 1:
@@ -456,6 +466,7 @@
ngroups=ngroups,
comp_ids=comp_ids,
mask=mask,
result_mask=result_mask,
**kwargs,
)

@@ -468,6 +479,7 @@
ngroups: int,
comp_ids: np.ndarray,
mask: np.ndarray | None,
result_mask: np.ndarray | None,
**kwargs,
) -> np.ndarray: # np.ndarray[ndim=2]
orig_values = values
@@ -493,6 +505,8 @@
values = values.T
if mask is not None:
mask = mask.T
if result_mask is not None:
result_mask = result_mask.T

out_shape = self._get_output_shape(ngroups, values)
func, values = self.get_cython_func_and_vals(values, is_numeric)
@@ -508,6 +522,8 @@
values,
comp_ids,
min_count,
mask=mask,
result_mask=result_mask,
is_datetimelike=is_datetimelike,
)
else:
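
Why the kind != "aggregate" split above is needed: a transform's output is row-aligned with its input, so the copied input mask is the right NA indicator, while an aggregation emits one row per group and must use the freshly allocated result_mask of length ngroups. A short sketch with made-up data:

import pandas as pd

s = pd.Series(pd.array([1, pd.NA, 3], dtype="Int64"))
gb = s.groupby([0, 0, 1])
print(gb.cummax())  # transform: 3 rows, NA exactly where the input mask was True
print(gb.max())     # aggregate: 2 rows (ngroups), NA-ness tracked by result_mask
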
48 changes: 48 additions & 0 deletions pandas/tests/groupby/test_min_max.py
@@ -177,3 +177,51 @@ def test_aggregate_categorical_lost_index(func: str):
expected["B"] = expected["B"].astype(ds.dtype)

tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("dtype", ["Int64", "Int32", "Float64", "Float32", "boolean"])
def test_groupby_min_max_nullable(dtype):
if dtype == "Int64":
# GH#41743 avoid precision loss
ts = 1618556707013635762
elif dtype == "boolean":
ts = 0
else:
ts = 4.0

df = DataFrame({"id": [2, 2], "ts": [ts, ts + 1]})
df["ts"] = df["ts"].astype(dtype)

gb = df.groupby("id")

result = gb.min()
expected = df.iloc[:1].set_index("id")
tm.assert_frame_equal(result, expected)

res_max = gb.max()
expected_max = df.iloc[1:].set_index("id")
tm.assert_frame_equal(res_max, expected_max)

result2 = gb.min(min_count=3)
expected2 = DataFrame({"ts": [pd.NA]}, index=expected.index, dtype=dtype)
tm.assert_frame_equal(result2, expected2)

res_max2 = gb.max(min_count=3)
tm.assert_frame_equal(res_max2, expected2)

# Case with NA values
df2 = DataFrame({"id": [2, 2, 2], "ts": [ts, pd.NA, ts + 1]})
df2["ts"] = df2["ts"].astype(dtype)
gb2 = df2.groupby("id")

result3 = gb2.min()
tm.assert_frame_equal(result3, expected)

res_max3 = gb2.max()
tm.assert_frame_equal(res_max3, expected_max)

result4 = gb2.min(min_count=100)
tm.assert_frame_equal(result4, expected2)

res_max4 = gb2.max(min_count=100)
tm.assert_frame_equal(res_max4, expected2)
