From b1ee91fd9be6551c28ff4d7321cb9459c8ab88ae Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Wed, 14 Aug 2024 12:44:56 -0700
Subject: [PATCH] REF (string): Move StringArrayNumpySemantics methods to base class (#59514)

* REF (string): Move StringArrayNumpySemantics methods to base class

* mypy fixup
---
NOTE(review): in the value_counts hunk below, the pre-existing
`value_counts(...).astype("Int64")` assignment is an unchanged context line
and the new `sort=False` assignment is added directly after it, so the first
result is overwritten before it is used. Confirm whether the old line was
meant to be removed.

 pandas/core/arrays/string_.py | 55 +++++++++++++++--------------------
 1 file changed, 23 insertions(+), 32 deletions(-)

diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index c881437ba25af..f3e5e6fe5f3da 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -738,11 +738,23 @@ def astype(self, dtype, copy: bool = True):
     def _reduce(
         self, name: str, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs
     ):
+        if self.dtype.na_value is np.nan and name in ["any", "all"]:
+            if name == "any":
+                return nanops.nanany(self._ndarray, skipna=skipna)
+            else:
+                return nanops.nanall(self._ndarray, skipna=skipna)
+
         if name in ["min", "max"]:
             return getattr(self, name)(skipna=skipna, axis=axis)
 
         raise TypeError(f"Cannot perform reduction '{name}' with string dtype")
 
+    def _wrap_reduction_result(self, axis: AxisInt | None, result) -> Any:
+        if self.dtype.na_value is np.nan and result is libmissing.NA:
+            # the masked_reductions use pd.NA -> convert to np.nan
+            return np.nan
+        return super()._wrap_reduction_result(axis, result)
+
     def min(self, axis=None, skipna: bool = True, **kwargs) -> Scalar:
         nv.validate_min((), kwargs)
         result = masked_reductions.min(
@@ -761,7 +773,11 @@ def value_counts(self, dropna: bool = True) -> Series:
         from pandas.core.algorithms import value_counts_internal as value_counts
 
         result = value_counts(self._ndarray, dropna=dropna).astype("Int64")
+        result = value_counts(self._ndarray, sort=False, dropna=dropna)
         result.index = result.index.astype(self.dtype)
+
+        if self.dtype.na_value is libmissing.NA:
+            result = result.astype("Int64")
         return result
 
     def memory_usage(self, deep: bool = False) -> int:
@@ -812,7 +828,13 @@ def _cmp_method(self, other, op):
             # logical
             result = np.zeros(len(self._ndarray), dtype="bool")
             result[valid] = op(self._ndarray[valid], other)
-            return BooleanArray(result, mask)
+            res_arr = BooleanArray(result, mask)
+            if self.dtype.na_value is np.nan:
+                if op == operator.ne:
+                    return res_arr.to_numpy(np.bool_, na_value=True)
+                else:
+                    return res_arr.to_numpy(np.bool_, na_value=False)
+            return res_arr
 
     _arith_method = _cmp_method
 
@@ -853,37 +875,6 @@ def _from_backing_data(self, arr: np.ndarray) -> StringArrayNumpySemantics:
         # we always preserve the dtype
         return NDArrayBacked._from_backing_data(self, arr)
 
-    def _reduce(
-        self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs
-    ):
-        if name in ["any", "all"]:
-            if name == "any":
-                return nanops.nanany(self._ndarray, skipna=skipna)
-            else:
-                return nanops.nanall(self._ndarray, skipna=skipna)
-        else:
-            return super()._reduce(name, skipna=skipna, keepdims=keepdims, **kwargs)
-
-    def _wrap_reduction_result(self, axis: AxisInt | None, result) -> Any:
-        # the masked_reductions use pd.NA
-        if result is libmissing.NA:
-            return np.nan
-        return super()._wrap_reduction_result(axis, result)
-
-    def _cmp_method(self, other, op):
-        result = super()._cmp_method(other, op)
-        if op == operator.ne:
-            return result.to_numpy(np.bool_, na_value=True)
-        else:
-            return result.to_numpy(np.bool_, na_value=False)
-
-    def value_counts(self, dropna: bool = True) -> Series:
-        from pandas.core.algorithms import value_counts_internal as value_counts
-
-        result = value_counts(self._ndarray, sort=False, dropna=dropna)
-        result.index = result.index.astype(self.dtype)
-        return result
-
     # ------------------------------------------------------------------------
     # String methods interface
     _str_na_value = np.nan