diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index 8957ea493e9ad..ab0e262caa6a9 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -628,6 +628,69 @@ def _concat_same_type(
         arr = pa.chunked_array(chunks)
         return cls(arr)
 
+    def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
+        """
+        Return a scalar result of performing the reduction operation.
+
+        Parameters
+        ----------
+        name : str
+            Name of the function, supported values are:
+            { any, all, min, max, sum, mean, median, prod,
+            std, var, sem, kurt, skew }.
+        skipna : bool, default True
+            If True, skip NaN values.
+        **kwargs
+            Additional keyword arguments passed to the reduction function.
+            Currently, `ddof` is the only supported kwarg.
+
+        Returns
+        -------
+        scalar
+
+        Raises
+        ------
+        TypeError : subclass does not define reductions
+        """
+        if name == "sem":
+
+            def pyarrow_meth(data, skip_nulls, **kwargs):
+                numerator = pc.stddev(data, skip_nulls=skip_nulls, **kwargs)
+                denominator = pc.sqrt_checked(
+                    pc.subtract_checked(
+                        pc.count(self._data, skip_nulls=skip_nulls), kwargs["ddof"]
+                    )
+                )
+                return pc.divide_checked(numerator, denominator)
+
+        else:
+            pyarrow_name = {
+                "median": "approximate_median",
+                "prod": "product",
+                "std": "stddev",
+                "var": "variance",
+            }.get(name, name)
+            # error: Incompatible types in assignment
+            # (expression has type "Optional[Any]", variable has type
+            # "Callable[[Any, Any, KwArg(Any)], Any]")
+            pyarrow_meth = getattr(pc, pyarrow_name, None)  # type: ignore[assignment]
+            if pyarrow_meth is None:
+                # Let ExtensionArray._reduce raise the TypeError
+                return super()._reduce(name, skipna=skipna, **kwargs)
+        try:
+            result = pyarrow_meth(self._data, skip_nulls=skipna, **kwargs)
+        except (AttributeError, NotImplementedError, TypeError) as err:
+            msg = (
+                f"'{type(self).__name__}' with dtype {self.dtype} "
+                f"does not support reduction '{name}' with pyarrow "
+                f"version {pa.__version__}. '{name}' may be supported by "
+                f"upgrading pyarrow."
+            )
+            raise TypeError(msg) from err
+        if pc.is_null(result).as_py():
+            return self.dtype.na_value
+        return result.as_py()
+
     def __setitem__(self, key: int | slice | np.ndarray, value: Any) -> None:
         """Set one or more values inplace.
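
A minimal usage sketch of what the new `_reduce` enables (not part of the patch; the `pd.ArrowDtype` construction and the pyarrow-version notes are assumptions on my part). Reductions called on an ArrowDtype-backed Series dispatch to `pyarrow.compute`, which is what the extension tests below exercise via `getattr(ser, op_name)(skipna=skipna)`:

import pandas as pd
import pyarrow as pa

ser = pd.Series([1, 2, None, 4], dtype=pd.ArrowDtype(pa.int64()))

ser.sum()              # dispatches to pc.sum -> 7
ser.mean()             # dispatches to pc.mean
ser.median()           # mapped to pc.approximate_median (pyarrow >= 6.0 per the tests)
ser.std(ddof=1)        # mapped to pc.stddev; ddof is forwarded through **kwargs
ser.sum(skipna=False)  # with a recent pyarrow, the null makes pc.is_null(result)
                       # true, so dtype.na_value (pd.NA) is returned
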
diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
index 4376a0de37a8c..6a17a56a47cbc 100644
--- a/pandas/tests/arrays/string_/test_string.py
+++ b/pandas/tests/arrays/string_/test_string.py
@@ -5,7 +5,10 @@
 import numpy as np
 import pytest
 
-from pandas.compat import pa_version_under2p0
+from pandas.compat import (
+    pa_version_under2p0,
+    pa_version_under6p0,
+)
 from pandas.errors import PerformanceWarning
 
 import pandas.util._test_decorators as td
@@ -375,7 +378,7 @@ def test_reduce_missing(skipna, dtype):
 @pytest.mark.parametrize("method", ["min", "max"])
 @pytest.mark.parametrize("skipna", [True, False])
 def test_min_max(method, skipna, dtype, request):
-    if dtype.storage == "pyarrow":
+    if dtype.storage == "pyarrow" and pa_version_under6p0:
         reason = "'ArrowStringArray' object has no attribute 'max'"
         mark = pytest.mark.xfail(raises=TypeError, reason=reason)
         request.node.add_marker(mark)
@@ -392,7 +395,7 @@ def test_min_max(method, skipna, dtype, request):
 @pytest.mark.parametrize("method", ["min", "max"])
 @pytest.mark.parametrize("box", [pd.Series, pd.array])
 def test_min_max_numpy(method, box, dtype, request):
-    if dtype.storage == "pyarrow":
+    if dtype.storage == "pyarrow" and (pa_version_under6p0 or box is pd.array):
         if box is pd.array:
             reason = "'<=' not supported between instances of 'str' and 'NoneType'"
         else:
diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py
index ef576692c83b6..62f8a855ce263 100644
--- a/pandas/tests/extension/test_arrow.py
+++ b/pandas/tests/extension/test_arrow.py
@@ -24,6 +24,7 @@
 from pandas.compat import (
     pa_version_under2p0,
     pa_version_under3p0,
+    pa_version_under6p0,
     pa_version_under8p0,
 )
@@ -303,6 +304,95 @@ def test_loc_iloc_frame_single_dtype(self, request, using_array_manager, data):
         super().test_loc_iloc_frame_single_dtype(data)
 
 
+class TestBaseNumericReduce(base.BaseNumericReduceTests):
+    def check_reduce(self, ser, op_name, skipna):
+        pa_dtype = ser.dtype.pyarrow_dtype
+        result = getattr(ser, op_name)(skipna=skipna)
+        if pa.types.is_boolean(pa_dtype):
+            # Can't convert if ser contains NA
+            pytest.skip(
+                "pandas boolean data with NA does not fully support all reductions"
+            )
+        elif pa.types.is_integer(pa_dtype) or pa.types.is_floating(pa_dtype):
+            ser = ser.astype("Float64")
+        expected = getattr(ser, op_name)(skipna=skipna)
+        tm.assert_almost_equal(result, expected)
+
+    @pytest.mark.parametrize("skipna", [True, False])
+    def test_reduce_series(self, data, all_numeric_reductions, skipna, request):
+        pa_dtype = data.dtype.pyarrow_dtype
+        xfail_mark = pytest.mark.xfail(
+            raises=TypeError,
+            reason=(
+                f"{all_numeric_reductions} is not implemented in "
+                f"pyarrow={pa.__version__} for {pa_dtype}"
+            ),
+        )
+        if all_numeric_reductions in {"skew", "kurt"}:
+            request.node.add_marker(xfail_mark)
+        elif (
+            all_numeric_reductions in {"median", "var", "std", "prod", "max", "min"}
+            and pa_version_under6p0
+        ):
+            request.node.add_marker(xfail_mark)
+        elif all_numeric_reductions in {"sum", "mean"} and pa_version_under2p0:
+            request.node.add_marker(xfail_mark)
+        elif (
+            all_numeric_reductions in {"sum", "mean"}
+            and skipna is False
+            and pa_version_under6p0
+            and (pa.types.is_integer(pa_dtype) or pa.types.is_floating(pa_dtype))
+        ):
+            request.node.add_marker(
+                pytest.mark.xfail(
+                    raises=AssertionError,
+                    reason=(
+                        f"{all_numeric_reductions} with skip_nulls={skipna} did not "
+                        f"return NA for {pa_dtype} with pyarrow={pa.__version__}"
+                    ),
+                )
+            )
+        elif not (
+            pa.types.is_integer(pa_dtype)
+            or pa.types.is_floating(pa_dtype)
+            or pa.types.is_boolean(pa_dtype)
+        ) and not (
+            all_numeric_reductions in {"min", "max"}
+            and (pa.types.is_temporal(pa_dtype) and not pa.types.is_duration(pa_dtype))
+        ):
+            request.node.add_marker(xfail_mark)
+        elif pa.types.is_boolean(pa_dtype) and all_numeric_reductions in {
+            "std",
+            "var",
+            "median",
+        }:
+            request.node.add_marker(xfail_mark)
+        super().test_reduce_series(data, all_numeric_reductions, skipna)
+
+
+class TestBaseBooleanReduce(base.BaseBooleanReduceTests):
+    @pytest.mark.parametrize("skipna", [True, False])
+    def test_reduce_series(
+        self, data, all_boolean_reductions, skipna, na_value, request
+    ):
+        pa_dtype = data.dtype.pyarrow_dtype
+        xfail_mark = pytest.mark.xfail(
+            raises=TypeError,
+            reason=(
+                f"{all_boolean_reductions} is not implemented in "
+                f"pyarrow={pa.__version__} for {pa_dtype}"
+            ),
+        )
+        if not pa.types.is_boolean(pa_dtype):
+            request.node.add_marker(xfail_mark)
+        elif pa_version_under3p0:
+            request.node.add_marker(xfail_mark)
+        op_name = all_boolean_reductions
+        s = pd.Series(data)
+        result = getattr(s, op_name)(skipna=skipna)
+        assert result is (op_name == "any")
+
+
 class TestBaseGroupby(base.BaseGroupbyTests):
     def test_groupby_agg_extension(self, data_for_grouping, request):
         tz = getattr(data_for_grouping.dtype.pyarrow_dtype, "tz", None)
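
For reference, pyarrow has no dedicated standard-error reduction, so the `sem` branch of `_reduce` chains checked compute kernels by hand. A standalone sketch mirroring that chain (the data and `ddof=1` are illustrative; availability of `pc.sqrt_checked` and of the `stddev` options depends on the installed pyarrow version, which is why the tests gate on `pa_version_under6p0` and friends):

import pyarrow as pa
import pyarrow.compute as pc

arr = pa.chunked_array([[1.0, 2.0, None, 4.0]])
ddof = 1

# numerator: standard deviation of the valid values
numerator = pc.stddev(arr, skip_nulls=True, ddof=ddof)
# denominator: sqrt(number of valid values - ddof), using the checked kernels
denominator = pc.sqrt_checked(pc.subtract_checked(pc.count(arr), ddof))
sem = pc.divide_checked(numerator, denominator)
print(sem.as_py())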