From c839b003e0887cce457e93e4d8f2ad7465f83ae0 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 14 Aug 2022 22:08:51 +0200 Subject: [PATCH 1/2] ENH: Add support for groupby.ohlc for ea dtypes --- doc/source/whatsnew/v1.5.0.rst | 1 + pandas/_libs/groupby.pyi | 6 ++- pandas/_libs/groupby.pyx | 42 ++++++++++++++++--- pandas/core/groupby/ops.py | 10 ++++- .../tests/groupby/aggregate/test_aggregate.py | 16 +++++++ 5 files changed, 65 insertions(+), 10 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index b71d294b97f9a..b042f37e71e38 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -287,6 +287,7 @@ Other enhancements - ``times`` argument in :class:`.ExponentialMovingWindow` now accepts ``np.timedelta64`` (:issue:`47003`) - :class:`.DataError`, :class:`.SpecificationError`, :class:`.SettingWithCopyError`, :class:`.SettingWithCopyWarning`, :class:`.NumExprClobberingError`, :class:`.UndefinedVariableError`, :class:`.IndexingError`, :class:`.PyperclipException`, :class:`.PyperclipWindowsException`, :class:`.CSSWarning`, :class:`.PossibleDataLossError`, :class:`.ClosedFileError`, :class:`.IncompatibilityWarning`, :class:`.AttributeConflictWarning`, :class:`.DatabaseError, :class:`.PossiblePrecisionLoss, :class:`.ValueLabelTypeMismatch, :class:`.InvalidColumnName, and :class:`.CategoricalConversionWarning` are now exposed in ``pandas.errors`` (:issue:`27656`) - Added ``check_like`` argument to :func:`testing.assert_series_equal` (:issue:`47247`) +- Add support for :meth:`.GroupBy.ohlc` for extension array dtypes (:issue:`37493`) - Allow reading compressed SAS files with :func:`read_sas` (e.g., ``.sas7bdat.gz`` files) - :meth:`DatetimeIndex.astype` now supports casting timezone-naive indexes to ``datetime64[s]``, ``datetime64[ms]``, and ``datetime64[us]``, and timezone-aware indexes to the corresponding ``datetime64[unit, tzname]`` dtypes (:issue:`47579`) - :class:`Series` reducers (e.g. 
``min``, ``max``, ``sum``, ``mean``) will now successfully operate when the dtype is numeric and ``numeric_only=True`` is provided; previously this would raise a ``NotImplementedError`` (:issue:`47500`) diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi index 3ec37718eb652..55662ff6c7494 100644 --- a/pandas/_libs/groupby.pyi +++ b/pandas/_libs/groupby.pyi @@ -86,11 +86,13 @@ def group_mean( result_mask: np.ndarray | None = ..., ) -> None: ... def group_ohlc( - out: np.ndarray, # floating[:, ::1] + out: np.ndarray, # floatingintuint_t[:, ::1] counts: np.ndarray, # int64_t[::1] - values: np.ndarray, # ndarray[floating, ndim=2] + values: np.ndarray, # ndarray[floatingintuint_t, ndim=2] labels: np.ndarray, # const intp_t[:] min_count: int = ..., + mask: np.ndarray | None = ..., + result_mask: np.ndarray | None = ..., ) -> None: ... def group_quantile( out: npt.NDArray[np.float64], diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 6e2b79a320dd7..443ce9d3b5f46 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -835,21 +835,32 @@ def group_mean( out[i, j] = sumx[i, j] / count +ctypedef fused int64float_t: + float32_t + float64_t + int64_t + uint64_t + + @cython.wraparound(False) @cython.boundscheck(False) def group_ohlc( - floating[:, ::1] out, + int64float_t[:, ::1] out, int64_t[::1] counts, - ndarray[floating, ndim=2] values, + ndarray[int64float_t, ndim=2] values, const intp_t[::1] labels, Py_ssize_t min_count=-1, + const uint8_t[:, ::1] mask=None, + uint8_t[:, ::1] result_mask=None, ) -> None: """ Only aggregates on axis=0 """ cdef: Py_ssize_t i, j, N, K, lab - floating val + int64float_t val + uint8_t[::1] first_element_set + bint isna_entry, uses_mask = not mask is None assert min_count == -1, "'min_count' only used in sum and prod" @@ -863,7 +874,15 @@ def group_ohlc( if K > 1: raise NotImplementedError("Argument 'values' must have only one dimension") - out[:] = np.nan + + if int64float_t is float32_t or 
int64float_t is float64_t: + out[:] = np.nan + else: + out[:] = 0 + + first_element_set = np.zeros((counts).shape, dtype=np.uint8) + if uses_mask: + result_mask[:] = True with nogil: for i in range(N): @@ -873,11 +892,22 @@ def group_ohlc( counts[lab] += 1 val = values[i, 0] - if val != val: + + if uses_mask: + isna_entry = mask[i, 0] + elif ohlc_t is float32_t or ohlc_t is float64_t: + isna_entry = val != val + else: + isna_entry = False + + if isna_entry: continue - if out[lab, 0] != out[lab, 0]: + if not first_element_set[lab]: out[lab, 0] = out[lab, 1] = out[lab, 2] = out[lab, 3] = val + first_element_set[lab] = True + if uses_mask: + result_mask[lab] = False else: out[lab, 1] = max(out[lab, 1], val) out[lab, 2] = min(out[lab, 2], val) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index caea70e03b6f3..038e4afdbd767 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -157,6 +157,7 @@ def __init__(self, kind: str, how: str, has_dropped_na: bool) -> None: "first", "rank", "sum", + "ohlc", } _cython_arity = {"ohlc": 4} # OHLC @@ -219,13 +220,13 @@ def _get_cython_vals(self, values: np.ndarray) -> np.ndarray: values = ensure_float64(values) elif values.dtype.kind in ["i", "u"]: - if how in ["var", "prod", "mean", "ohlc"] or ( + if how in ["var", "prod", "mean"] or ( self.kind == "transform" and self.has_dropped_na ): # result may still include NaN, so we have to cast values = ensure_float64(values) - elif how == "sum": + elif how in ["sum", "ohlc"]: # Avoid overflow during group op if values.dtype.kind == "i": values = ensure_int64(values) @@ -480,6 +481,9 @@ def _masked_ea_wrap_cython_operation( **kwargs, ) + if self.how == "ohlc": + result_mask = np.tile(result_mask, (4, 1)).T + # res_values should already have the correct dtype, we just need to # wrap in a MaskedArray return orig_values._maybe_mask_result(res_values, result_mask) @@ -592,6 +596,8 @@ def _call_cython_op( min_count=min_count, 
is_datetimelike=is_datetimelike, ) + elif self.how == "ohlc": + func(result, counts, values, comp_ids, min_count, mask, result_mask) else: func(result, counts, values, comp_ids, min_count) else: diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 54ee32502bbc9..bda4d0da9f6ce 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -564,6 +564,22 @@ def test_order_aggregate_multiple_funcs(): tm.assert_index_equal(result, expected) +def test_ohlc_ea_dtypes(any_numeric_ea_dtype): + # GH#37493 + df = DataFrame( + {"a": [1, 1, 2, 3, 4, 4], "b": [22, 11, pd.NA, 10, 20, pd.NA]}, + dtype=any_numeric_ea_dtype, + ) + result = df.groupby("a").ohlc() + expected = DataFrame( + [[22, 22, 11, 11], [pd.NA] * 4, [10] * 4, [20] * 4], + columns=MultiIndex.from_product([["b"], ["open", "high", "low", "close"]]), + index=Index([1, 2, 3, 4], dtype=any_numeric_ea_dtype, name="a"), + dtype=any_numeric_ea_dtype, + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("dtype", [np.int64, np.uint64]) @pytest.mark.parametrize("how", ["first", "last", "min", "max", "mean", "median"]) def test_uint64_type_handling(dtype, how): From d7ec5c5fcba5d6707f5108a04bf7ebbcbc2e2e5b Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 14 Aug 2022 22:53:24 +0200 Subject: [PATCH 2/2] Fix type --- pandas/_libs/groupby.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 443ce9d3b5f46..e8206df5b47f1 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -895,7 +895,7 @@ def group_ohlc( if uses_mask: isna_entry = mask[i, 0] - elif ohlc_t is float32_t or ohlc_t is float64_t: + elif int64float_t is float32_t or int64float_t is float64_t: isna_entry = val != val else: isna_entry = False