diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 9295cf7873d98..75c502b6ad5a8 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -7,9 +7,9 @@ from typing import ( TYPE_CHECKING, Any, + Callable, Literal, cast, - overload, ) import unicodedata @@ -17,6 +17,7 @@ from pandas._libs import lib from pandas._libs.tslibs import ( + NaT, Timedelta, Timestamp, timezones, @@ -27,6 +28,7 @@ pa_version_under13p0, ) from pandas.util._decorators import doc +from pandas.util._validators import validate_fillna_kwargs from pandas.core.dtypes.cast import ( can_hold_element, @@ -41,8 +43,6 @@ is_list_like, is_numeric_dtype, is_scalar, - is_string_dtype, - pandas_dtype, ) from pandas.core.dtypes.dtypes import DatetimeTZDtype from pandas.core.dtypes.missing import isna @@ -69,7 +69,6 @@ unpack_tuple_and_ellipses, validate_indices, ) -from pandas.core.nanops import check_below_min_count from pandas.core.strings.base import BaseStringArrayMethods from pandas.io._util import _arrow_dtype_mapping @@ -176,12 +175,8 @@ def floordiv_compat( } if TYPE_CHECKING: - from collections.abc import ( - Callable, - Sequence, - ) + from collections.abc import Sequence - from pandas._libs.missing import NAType from pandas._typing import ( ArrayLike, AxisInt, @@ -202,14 +197,12 @@ def floordiv_compat( npt, ) - from pandas.core.dtypes.dtypes import ExtensionDtype - from pandas import Series from pandas.core.arrays.datetimes import DatetimeArray from pandas.core.arrays.timedeltas import TimedeltaArray -def get_unit_from_pa_dtype(pa_dtype) -> str: +def get_unit_from_pa_dtype(pa_dtype): # https://github.com/pandas-dev/pandas/pull/50998#discussion_r1100344804 if pa_version_under11p0: unit = str(pa_dtype).split("[", 1)[-1][:-1] @@ -258,7 +251,6 @@ class ArrowExtensionArray( Parameters ---------- values : pyarrow.Array or pyarrow.ChunkedArray - The input data to initialize the ArrowExtensionArray. Attributes ---------- @@ -272,12 +264,6 @@ class ArrowExtensionArray( ------- ArrowExtensionArray - See Also - -------- - array : Create a Pandas array with a specified dtype. - DataFrame.to_feather : Write a DataFrame to the binary Feather format. - read_feather : Load a feather-format object from the file path. - Notes ----- Most methods are implemented using `pyarrow compute functions. `__ @@ -315,9 +301,7 @@ def __init__(self, values: pa.Array | pa.ChunkedArray) -> None: self._dtype = ArrowDtype(self._pa_array.type) @classmethod - def _from_sequence( - cls, scalars, *, dtype: Dtype | None = None, copy: bool = False - ) -> Self: + def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False): """ Construct a new ExtensionArray from a sequence of scalars. """ @@ -328,8 +312,8 @@ def _from_sequence( @classmethod def _from_sequence_of_strings( - cls, strings, *, dtype: ExtensionDtype, copy: bool = False - ) -> Self: + cls, strings, *, dtype: Dtype | None = None, copy: bool = False + ): """ Construct a new ExtensionArray from a sequence of strings. """ @@ -536,20 +520,21 @@ def _box_pa_array( if pa_type is not None and pa_array.type != pa_type: if pa.types.is_dictionary(pa_type): pa_array = pa_array.dictionary_encode() - if pa_array.type != pa_type: - pa_array = pa_array.cast(pa_type) else: try: pa_array = pa_array.cast(pa_type) - except (pa.ArrowNotImplementedError, pa.ArrowTypeError): + except ( + pa.ArrowInvalid, + pa.ArrowTypeError, + pa.ArrowNotImplementedError, + ): if pa.types.is_string(pa_array.type) or pa.types.is_large_string( pa_array.type ): # TODO: Move logic in _from_sequence_of_strings into # _box_pa_array - dtype = ArrowDtype(pa_type) return cls._from_sequence_of_strings( - value, dtype=dtype + value, dtype=pa_type )._pa_array else: raise @@ -585,11 +570,10 @@ def __getitem__(self, item: PositionalIndexer): if isinstance(item, np.ndarray): if not len(item): # Removable once we migrate StringDtype[pyarrow] to ArrowDtype[string] - if ( - isinstance(self._dtype, StringDtype) - and self._dtype.storage == "pyarrow" + if self._dtype.name == "string" and self._dtype.storage in ( + "pyarrow", + "pyarrow_numpy", ): - # TODO(infer_string) should this be large_string? pa_dtype = pa.string() else: pa_dtype = self._dtype.pyarrow_dtype @@ -676,16 +660,7 @@ def __array__( self, dtype: NpDtype | None = None, copy: bool | None = None ) -> np.ndarray: """Correctly construct numpy arrays when passed to `np.asarray()`.""" - if copy is False: - # TODO: By using `zero_copy_only` it may be possible to implement this - raise ValueError( - "Unable to avoid copy while creating an array as requested." - ) - elif copy is None: - # `to_numpy(copy=False)` has the meaning of NumPy `copy=None`. - copy = False - - return self.to_numpy(dtype=dtype, copy=copy) + return self.to_numpy(dtype=dtype) def __invert__(self) -> Self: # This is a bit wise op for integer types @@ -700,12 +675,7 @@ def __invert__(self) -> Self: return type(self)(pc.invert(self._pa_array)) def __neg__(self) -> Self: - try: - return type(self)(pc.negate_checked(self._pa_array)) - except pa.ArrowNotImplementedError as err: - raise TypeError( - f"unary '-' not supported for dtype '{self.dtype}'" - ) from err + return type(self)(pc.negate_checked(self._pa_array)) def __pos__(self) -> Self: return type(self)(self._pa_array) @@ -728,51 +698,59 @@ def __setstate__(self, state) -> None: state["_pa_array"] = pa.chunked_array(data) self.__dict__.update(state) - def _cmp_method(self, other, op) -> ArrowExtensionArray: - pc_func = ARROW_CMP_FUNCS[op.__name__] - if isinstance( - other, (ArrowExtensionArray, np.ndarray, list, BaseMaskedArray) - ) or isinstance(getattr(other, "dtype", None), CategoricalDtype): - try: - result = pc_func(self._pa_array, self._box_pa(other)) - except pa.ArrowNotImplementedError: - # TODO: could this be wrong if other is object dtype? - # in which case we need to operate pointwise? - result = ops.invalid_comparison(self, other, op) - result = pa.array(result, type=pa.bool_()) - elif is_scalar(other): - try: - result = pc_func(self._pa_array, self._box_pa(other)) - except (pa.lib.ArrowNotImplementedError, pa.lib.ArrowInvalid): - mask = isna(self) | isna(other) - valid = ~mask - result = np.zeros(len(self), dtype="bool") + def _cmp_method(self, other, op): + from pandas.core.arrays.datetimes import DatetimeArray + pc_func = ARROW_CMP_FUNCS[op.__name__] + + # Handle comparisons with another ArrowExtensionArray, list, NumPy array, or masked array + if isinstance(other, (ArrowExtensionArray, np.ndarray, list, BaseMaskedArray)) or isinstance(getattr(other, "dtype", None), CategoricalDtype): + result = pc_func(self._pa_array, self._box_pa(other)) + + # Handle scalar comparisons (e.g., comparing a single datetime value) + elif is_scalar(other) or isinstance(other, DatetimeArray): + try: + result = pc_func(self._pa_array, self._box_pa(other)) + except (pa.lib.ArrowNotImplementedError, pa.lib.ArrowInvalid): + np_array = None # Ensure np_array is always defined + + # Convert PyArrow-backed datetimes to Pandas datetime for comparison + if isinstance(self.dtype, pd.ArrowDtype) and pa.types.is_timestamp(self._pa_array.type): + print("Converting PyArrow datetime to Pandas datetime...") + np_array = self.to_numpy(dtype="datetime64[ns]") # Convert to numpy datetime64[ns] + other = pd.Timestamp(other) # Ensure 'other' is in Pandas datetime format + else: + print("Converting to NumPy array...") np_array = np.array(self) - try: - result[valid] = op(np_array[valid], other) - except TypeError: - result = ops.invalid_comparison(self, other, op) - result = pa.array(result, type=pa.bool_()) - result = pc.if_else(valid, result, None) - else: - raise NotImplementedError( - f"{op.__name__} not implemented for {type(other)}" - ) - return ArrowExtensionArray(result) - def _op_method_error_message(self, other, op) -> str: - if hasattr(other, "dtype"): - other_type = f"dtype '{other.dtype}'" - else: - other_type = f"object of type {type(other)}" - return ( - f"operation '{op.__name__}' not supported for " - f"dtype '{self.dtype}' with {other_type}" - ) + # Debug: Check if np_array is assigned correctly + print(f"np_array assigned: {np_array is not None}") + + # Ensure np_array is properly assigned before using it + if np_array is None: + raise ValueError("np_array was not assigned properly") + + # Handle missing values (NaNs) in the comparison + mask = isna(self) | isna(other) + valid = ~mask + result = np.zeros(len(self), dtype="bool") + + # Perform comparison on valid elements + try: + result[valid] = op(np_array[valid], other) + except TypeError: + result = ops.invalid_comparison(np_array, other, op) + + result = pa.array(result, type=pa.bool_()) # Create PyArrow array from result + result = pc.if_else(valid, result, None) # Handle invalid elements as None + + else: + raise NotImplementedError(f"{op.__name__} not implemented for {type(other)}") + + return ArrowExtensionArray(result) - def _evaluate_op_method(self, other, op, arrow_funcs) -> Self: + + def _evaluate_op_method(self, other, op, arrow_funcs): pa_type = self._pa_array.type - other_original = other other = self._box_pa(other) if ( @@ -782,15 +760,10 @@ def _evaluate_op_method(self, other, op, arrow_funcs) -> Self: ): if op in [operator.add, roperator.radd]: sep = pa.scalar("", type=pa_type) - try: - if op is operator.add: - result = pc.binary_join_element_wise(self._pa_array, other, sep) - elif op is roperator.radd: - result = pc.binary_join_element_wise(other, self._pa_array, sep) - except pa.ArrowNotImplementedError as err: - raise TypeError( - self._op_method_error_message(other_original, op) - ) from err + if op is operator.add: + result = pc.binary_join_element_wise(self._pa_array, other, sep) + elif op is roperator.radd: + result = pc.binary_join_element_wise(other, self._pa_array, sep) return type(self)(result) elif op in [operator.mul, roperator.rmul]: binary = self._pa_array @@ -822,17 +795,12 @@ def _evaluate_op_method(self, other, op, arrow_funcs) -> Self: pc_func = arrow_funcs[op.__name__] if pc_func is NotImplemented: - if pa.types.is_string(pa_type) or pa.types.is_large_string(pa_type): - raise TypeError(self._op_method_error_message(other_original, op)) raise NotImplementedError(f"{op.__name__} not implemented.") - try: - result = pc_func(self._pa_array, other) - except pa.ArrowNotImplementedError as err: - raise TypeError(self._op_method_error_message(other_original, op)) from err + result = pc_func(self._pa_array, other) return type(self)(result) - def _logical_method(self, other, op) -> Self: + def _logical_method(self, other, op): # For integer types `^`, `|`, `&` are bitwise operators and return # integer types. Otherwise these are boolean ops. if pa.types.is_integer(self._pa_array.type): @@ -840,7 +808,7 @@ def _logical_method(self, other, op) -> Self: else: return self._evaluate_op_method(other, op, ARROW_LOGICAL_FUNCS) - def _arith_method(self, other, op) -> Self: + def _arith_method(self, other, op): return self._evaluate_op_method(other, op, ARROW_ARITHMETIC_FUNCS) def equals(self, other) -> bool: @@ -905,13 +873,7 @@ def isna(self) -> npt.NDArray[np.bool_]: return self._pa_array.is_null().to_numpy() - @overload - def any(self, *, skipna: Literal[True] = ..., **kwargs) -> bool: ... - - @overload - def any(self, *, skipna: bool, **kwargs) -> bool | NAType: ... - - def any(self, *, skipna: bool = True, **kwargs) -> bool | NAType: + def any(self, *, skipna: bool = True, **kwargs): """ Return whether any element is truthy. @@ -969,13 +931,7 @@ def any(self, *, skipna: bool = True, **kwargs) -> bool | NAType: """ return self._reduce("any", skipna=skipna, **kwargs) - @overload - def all(self, *, skipna: Literal[True] = ..., **kwargs) -> bool: ... - - @overload - def all(self, *, skipna: bool, **kwargs) -> bool | NAType: ... - - def all(self, *, skipna: bool = True, **kwargs) -> bool | NAType: + def all(self, *, skipna: bool = True, **kwargs): """ Return whether all elements are truthy. @@ -1105,7 +1061,8 @@ def _pad_or_backfill( copy: bool = True, ) -> Self: if not self._hasna: - return self + # TODO(CoW): Not necessary anymore when CoW is the default + return self.copy() if limit is None and limit_area is None: method = missing.clean_fill_method(method) @@ -1121,7 +1078,6 @@ def _pad_or_backfill( # a kernel for duration types. pass - # TODO: Why do we no longer need the above cases? # TODO(3.0): after EA.fillna 'method' deprecation is enforced, we can remove # this method entirely. return super()._pad_or_backfill( @@ -1131,15 +1087,22 @@ def _pad_or_backfill( @doc(ExtensionArray.fillna) def fillna( self, - value: object | ArrayLike, + value: object | ArrayLike | None = None, + method: FillnaOptions | None = None, limit: int | None = None, copy: bool = True, ) -> Self: + value, method = validate_fillna_kwargs(value, method) + if not self._hasna: + # TODO(CoW): Not necessary anymore when CoW is the default return self.copy() if limit is not None: - return super().fillna(value=value, limit=limit, copy=copy) + return super().fillna(value=value, method=method, limit=limit, copy=copy) + + if method is not None: + return super().fillna(method=method, limit=limit, copy=copy) if isinstance(value, (np.ndarray, ExtensionArray)): # Similar to check_value_size, but we do not mask here since we may @@ -1153,7 +1116,7 @@ def fillna( try: fill_value = self._box_pa(value, pa_type=self._pa_array.type) except pa.ArrowTypeError as err: - msg = f"Invalid value '{value!s}' for dtype '{self.dtype}'" + msg = f"Invalid value '{str(value)}' for dtype {self.dtype}" raise TypeError(msg) from err try: @@ -1165,7 +1128,7 @@ def fillna( # a kernel for duration types. pass - return super().fillna(value=value, limit=limit, copy=copy) + return super().fillna(value=value, method=method, limit=limit, copy=copy) def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: # short-circuit to return all False array. @@ -1208,12 +1171,7 @@ def factorize( data = data.cast(pa.int64()) if pa.types.is_dictionary(data.type): - if null_encoding == "encode": - # dictionary encode does nothing if an already encoded array is given - data = data.cast(data.type.value_type) - encoded = data.dictionary_encode(null_encoding=null_encoding) - else: - encoded = data + encoded = data else: encoded = data.dictionary_encode(null_encoding=null_encoding) if encoded.length() == 0: @@ -1403,7 +1361,7 @@ def _to_datetimearray(self) -> DatetimeArray: np_dtype = np.dtype(f"M8[{pa_type.unit}]") dtype = tz_to_dtype(pa_type.tz, pa_type.unit) np_array = self._pa_array.to_numpy() - np_array = np_array.astype(np_dtype, copy=False) + np_array = np_array.astype(np_dtype) return DatetimeArray._simple_new(np_array, dtype=dtype) def _to_timedeltaarray(self) -> TimedeltaArray: @@ -1414,7 +1372,7 @@ def _to_timedeltaarray(self) -> TimedeltaArray: assert pa.types.is_duration(pa_type) np_dtype = np.dtype(f"m8[{pa_type.unit}]") np_array = self._pa_array.to_numpy() - np_array = np_array.astype(np_dtype, copy=False) + np_array = np_array.astype(np_dtype) return TimedeltaArray._simple_new(np_array, dtype=np_dtype) def _values_for_json(self) -> np.ndarray: @@ -1459,7 +1417,8 @@ def to_numpy( pa.types.is_floating(pa_type) and ( na_value is np.nan - or (original_na_value is lib.no_default and is_float_dtype(dtype)) + or original_na_value is lib.no_default + and is_float_dtype(dtype) ) ): result = data._pa_array.to_numpy() @@ -1480,7 +1439,7 @@ def to_numpy( result[~mask] = data[~mask]._pa_array.to_numpy() return result - def map(self, mapper, na_action: Literal["ignore"] | None = None): + def map(self, mapper, na_action=None): if is_numeric_dtype(self.dtype): return map_array(self.to_numpy(), mapper, na_action=na_action) else: @@ -1632,9 +1591,6 @@ def _accumulate( ------ NotImplementedError : subclass does not define accumulations """ - if is_string_dtype(self): - return self._str_accumulate(name=name, skipna=skipna, **kwargs) - pyarrow_name = { "cummax": "cumulative_max", "cummin": "cumulative_min", @@ -1659,68 +1615,13 @@ def _accumulate( else: data_to_accum = data_to_accum.cast(pa.int64()) - try: - result = pyarrow_meth(data_to_accum, skip_nulls=skipna, **kwargs) - except pa.ArrowNotImplementedError as err: - msg = f"operation '{name}' not supported for dtype '{self.dtype}'" - raise TypeError(msg) from err + result = pyarrow_meth(data_to_accum, skip_nulls=skipna, **kwargs) if convert_to_int: result = result.cast(pa_dtype) return type(self)(result) - def _str_accumulate( - self, name: str, *, skipna: bool = True, **kwargs - ) -> ArrowExtensionArray | ExtensionArray: - """ - Accumulate implementation for strings, see `_accumulate` docstring for details. - - pyarrow.compute does not implement these methods for strings. - """ - if name == "cumprod": - msg = f"operation '{name}' not supported for dtype '{self.dtype}'" - raise TypeError(msg) - - # We may need to strip out trailing NA values - tail: pa.array | None = None - na_mask: pa.array | None = None - pa_array = self._pa_array - np_func = { - "cumsum": np.cumsum, - "cummin": np.minimum.accumulate, - "cummax": np.maximum.accumulate, - }[name] - - if self._hasna: - na_mask = pc.is_null(pa_array) - if pc.all(na_mask) == pa.scalar(True): - return type(self)(pa_array) - if skipna: - if name == "cumsum": - pa_array = pc.fill_null(pa_array, "") - else: - # We can retain the running min/max by forward/backward filling. - pa_array = pc.fill_null_forward(pa_array) - pa_array = pc.fill_null_backward(pa_array) - else: - # When not skipping NA values, the result should be null from - # the first NA value onward. - idx = pc.index(na_mask, True).as_py() - tail = pa.nulls(len(pa_array) - idx, type=pa_array.type) - pa_array = pa_array[:idx] - - # error: Cannot call function of unknown type - pa_result = pa.array(np_func(pa_array), type=pa_array.type) # type: ignore[operator] - - if tail is not None: - pa_result = pa.concat_arrays([pa_result, tail]) - elif na_mask is not None: - pa_result = pc.if_else(na_mask, None, pa_result) - - result = type(self)(pa_result) - return result - def _reduce_pyarrow(self, name: str, *, skipna: bool = True, **kwargs) -> pa.Scalar: """ Return a pyarrow scalar result of performing the reduction operation. @@ -1785,37 +1686,6 @@ def pyarrow_meth(data, skip_nulls, **kwargs): denominator = pc.sqrt_checked(pc.count(self._pa_array)) return pc.divide_checked(numerator, denominator) - elif name == "sum" and ( - pa.types.is_string(pa_type) or pa.types.is_large_string(pa_type) - ): - - def pyarrow_meth(data, skip_nulls, min_count=0): # type: ignore[misc] - mask = pc.is_null(data) if data.null_count > 0 else None - if skip_nulls: - if min_count > 0 and check_below_min_count( - (len(data),), - None if mask is None else mask.to_numpy(), - min_count, - ): - return pa.scalar(None, type=data.type) - if data.null_count > 0: - # binary_join returns null if there is any null -> - # have to filter out any nulls - data = data.filter(pc.invert(mask)) - else: - if mask is not None or check_below_min_count( - (len(data),), None, min_count - ): - return pa.scalar(None, type=data.type) - - if pa.types.is_large_string(data.type): - # binary_join only supports string, not large_string - data = data.cast(pa.string()) - data_list = pa.ListArray.from_arrays( - [0, len(data)], data.combine_chunks() - )[0] - return pc.binary_join(data_list, "") - else: pyarrow_name = { "median": "quantile", @@ -1843,7 +1713,7 @@ def pyarrow_meth(data, skip_nulls, min_count=0): # type: ignore[misc] except (AttributeError, NotImplementedError, TypeError) as err: msg = ( f"'{type(self).__name__}' with dtype {self.dtype} " - f"does not support operation '{name}' with pyarrow " + f"does not support reduction '{name}' with pyarrow " f"version {pa.__version__}. '{name}' may be supported by " f"upgrading pyarrow." ) @@ -1851,6 +1721,8 @@ def pyarrow_meth(data, skip_nulls, min_count=0): # type: ignore[misc] if name == "median": # GH 52679: Use quantile instead of approximate_median; returns array result = result[0] + if pc.is_null(result).as_py(): + return result if name in ["min", "max", "sum"] and pa.types.is_duration(pa_type): result = result.cast(pa_type) @@ -1938,10 +1810,7 @@ def _explode(self): """ # child class explode method supports only list types; return # default implementation for non list types. - if not ( - pa.types.is_list(self.dtype.pyarrow_dtype) - or pa.types.is_large_list(self.dtype.pyarrow_dtype) - ): + if not pa.types.is_list(self.dtype.pyarrow_dtype): return super()._explode() values = self counts = pa.compute.list_value_length(values._pa_array) @@ -2025,8 +1894,7 @@ def __setitem__(self, key, value) -> None: raise ValueError("Length of indexer and values mismatch") if len(indices) == 0: return - # GH#58530 wrong item assignment by repeated key - _, argsort = np.unique(indices, return_index=True) + argsort = np.argsort(indices) indices = indices[argsort] value = value.take(argsort) mask = np.zeros(len(self), dtype=np.bool_) @@ -2109,11 +1977,11 @@ def _rank( na_option: str = "keep", ascending: bool = True, pct: bool = False, - ) -> Self: + ): """ See Series.rank.__doc__. """ - return self._convert_rank_result( + return type(self)( self._rank_calc( axis=axis, method=method, @@ -2209,7 +2077,7 @@ def _maybe_convert_setitem_value(self, value): try: value = self._box_pa(value, self._pa_array.type) except pa.ArrowTypeError as err: - msg = f"Invalid value '{value!s}' for dtype '{self.dtype}'" + msg = f"Invalid value '{str(value)}' for dtype {self.dtype}" raise TypeError(msg) from err return value @@ -2229,23 +2097,6 @@ def interpolate( See NDFrame.interpolate.__doc__. """ # NB: we return type(self) even if copy=False - if not self.dtype._is_numeric: - raise TypeError(f"Cannot interpolate with {self.dtype} dtype") - - if ( - not pa_version_under13p0 - and method == "linear" - and limit_area is None - and limit is None - and limit_direction == "forward" - ): - values = self._pa_array.combine_chunks() - na_value = pa.array([None], type=values.type) - y_diff_2 = pc.fill_null_backward(pc.pairwise_diff_checked(values, period=2)) - prev_values = pa.concat_arrays([na_value, values[:-2], na_value]) - interps = pc.add_checked(prev_values, pc.divide_checked(y_diff_2, 2)) - return type(self)(pc.coalesce(self._pa_array, interps)) - mask = self.isna() if self.dtype.kind == "f": data = self._pa_array.to_numpy() @@ -2275,7 +2126,7 @@ def _if_else( cond: npt.NDArray[np.bool_] | bool, left: ArrayLike | Scalar, right: ArrayLike | Scalar, - ) -> pa.Array: + ): """ Choose values based on a condition. @@ -2319,7 +2170,7 @@ def _replace_with_mask( values: pa.Array | pa.ChunkedArray, mask: npt.NDArray[np.bool_] | bool, replacements: ArrayLike | Scalar, - ) -> pa.Array | pa.ChunkedArray: + ): """ Replace items selected with a mask. @@ -2385,20 +2236,6 @@ def _groupby_op( **kwargs, ): if isinstance(self.dtype, StringDtype): - if how in [ - "prod", - "mean", - "median", - "cumsum", - "cumprod", - "std", - "sem", - "var", - "skew", - ]: - raise TypeError( - f"dtype '{self.dtype}' does not support operation '{how}'" - ) return super()._groupby_op( how=how, has_dropped_na=has_dropped_na, @@ -2428,13 +2265,7 @@ def _groupby_op( ) if isinstance(result, np.ndarray): return result - elif isinstance(result, BaseMaskedArray): - pa_result = result.__arrow_array__() - return type(self)(pa_result) - else: - # DatetimeArray, TimedeltaArray - pa_result = pa.array(result, from_pandas=True) - return type(self)(pa_result) + return type(self)._from_sequence(result, copy=False) def _apply_elementwise(self, func: Callable) -> list[list[Any]]: """Apply a callable to each element while maintaining the chunking structure.""" @@ -2446,30 +2277,127 @@ def _apply_elementwise(self, func: Callable) -> list[list[Any]]: for chunk in self._pa_array.iterchunks() ] - def _convert_bool_result(self, result, na=lib.no_default, method_name=None): - if na is not lib.no_default and not isna(na): # pyright: ignore [reportGeneralTypeIssues] + def _str_count(self, pat: str, flags: int = 0): + if flags: + raise NotImplementedError(f"count not implemented with {flags=}") + return type(self)(pc.count_substring_regex(self._pa_array, pat)) + + def _str_contains( + self, pat, case: bool = True, flags: int = 0, na=None, regex: bool = True + ): + if flags: + raise NotImplementedError(f"contains not implemented with {flags=}") + + if regex: + pa_contains = pc.match_substring_regex + else: + pa_contains = pc.match_substring + result = pa_contains(self._pa_array, pat, ignore_case=not case) + if not isna(na): result = result.fill_null(na) return type(self)(result) - def _convert_int_result(self, result): + def _str_startswith(self, pat: str | tuple[str, ...], na=None): + if isinstance(pat, str): + result = pc.starts_with(self._pa_array, pattern=pat) + else: + if len(pat) == 0: + # For empty tuple, pd.StringDtype() returns null for missing values + # and false for valid values. + result = pc.if_else(pc.is_null(self._pa_array), None, False) + else: + result = pc.starts_with(self._pa_array, pattern=pat[0]) + + for p in pat[1:]: + result = pc.or_(result, pc.starts_with(self._pa_array, pattern=p)) + if not isna(na): + result = result.fill_null(na) return type(self)(result) - def _convert_rank_result(self, result): + def _str_endswith(self, pat: str | tuple[str, ...], na=None): + if isinstance(pat, str): + result = pc.ends_with(self._pa_array, pattern=pat) + else: + if len(pat) == 0: + # For empty tuple, pd.StringDtype() returns null for missing values + # and false for valid values. + result = pc.if_else(pc.is_null(self._pa_array), None, False) + else: + result = pc.ends_with(self._pa_array, pattern=pat[0]) + + for p in pat[1:]: + result = pc.or_(result, pc.ends_with(self._pa_array, pattern=p)) + if not isna(na): + result = result.fill_null(na) return type(self)(result) - def _str_count(self, pat: str, flags: int = 0) -> Self: - if flags: - raise NotImplementedError(f"count not implemented with {flags=}") - return type(self)(pc.count_substring_regex(self._pa_array, pat)) + def _str_replace( + self, + pat: str | re.Pattern, + repl: str | Callable, + n: int = -1, + case: bool = True, + flags: int = 0, + regex: bool = True, + ): + if isinstance(pat, re.Pattern) or callable(repl) or not case or flags: + raise NotImplementedError( + "replace is not supported with a re.Pattern, callable repl, " + "case=False, or flags!=0" + ) - def _str_repeat(self, repeats: int | Sequence[int]) -> Self: + func = pc.replace_substring_regex if regex else pc.replace_substring + # https://github.com/apache/arrow/issues/39149 + # GH 56404, unexpected behavior with negative max_replacements with pyarrow. + pa_max_replacements = None if n < 0 else n + result = func( + self._pa_array, + pattern=pat, + replacement=repl, + max_replacements=pa_max_replacements, + ) + return type(self)(result) + + def _str_repeat(self, repeats: int | Sequence[int]): if not isinstance(repeats, int): raise NotImplementedError( f"repeat is not implemented when repeats is {type(repeats).__name__}" ) - return type(self)(pc.binary_repeat(self._pa_array, repeats)) + else: + return type(self)(pc.binary_repeat(self._pa_array, repeats)) + + def _str_match( + self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None + ): + if not pat.startswith("^"): + pat = f"^{pat}" + return self._str_contains(pat, case, flags, na, regex=True) - def _str_join(self, sep: str) -> Self: + def _str_fullmatch( + self, pat, case: bool = True, flags: int = 0, na: Scalar | None = None + ): + if not pat.endswith("$") or pat.endswith("\\$"): + pat = f"{pat}$" + return self._str_match(pat, case, flags, na) + + def _str_find(self, sub: str, start: int = 0, end: int | None = None): + if start != 0 and end is not None: + slices = pc.utf8_slice_codeunits(self._pa_array, start, stop=end) + result = pc.find_substring(slices, sub) + not_found = pc.equal(result, -1) + start_offset = max(0, start) + offset_result = pc.add(result, start_offset) + result = pc.if_else(not_found, result, offset_result) + elif start == 0 and end is None: + slices = self._pa_array + result = pc.find_substring(slices, sub) + else: + raise NotImplementedError( + f"find not implemented with {sub=}, {start=}, {end=}" + ) + return type(self)(result) + + def _str_join(self, sep: str): if pa.types.is_string(self._pa_array.type) or pa.types.is_large_string( self._pa_array.type ): @@ -2479,22 +2407,100 @@ def _str_join(self, sep: str) -> Self: result = self._pa_array return type(self)(pc.binary_join(result, sep)) - def _str_partition(self, sep: str, expand: bool) -> Self: + def _str_partition(self, sep: str, expand: bool): predicate = lambda val: val.partition(sep) result = self._apply_elementwise(predicate) return type(self)(pa.chunked_array(result)) - def _str_rpartition(self, sep: str, expand: bool) -> Self: + def _str_rpartition(self, sep: str, expand: bool): predicate = lambda val: val.rpartition(sep) result = self._apply_elementwise(predicate) return type(self)(pa.chunked_array(result)) - def _str_casefold(self) -> Self: + def _str_slice( + self, start: int | None = None, stop: int | None = None, step: int | None = None + ): + if start is None: + start = 0 + if step is None: + step = 1 + return type(self)( + pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step) + ) + + def _str_isalnum(self): + return type(self)(pc.utf8_is_alnum(self._pa_array)) + + def _str_isalpha(self): + return type(self)(pc.utf8_is_alpha(self._pa_array)) + + def _str_isdecimal(self): + return type(self)(pc.utf8_is_decimal(self._pa_array)) + + def _str_isdigit(self): + return type(self)(pc.utf8_is_digit(self._pa_array)) + + def _str_islower(self): + return type(self)(pc.utf8_is_lower(self._pa_array)) + + def _str_isnumeric(self): + return type(self)(pc.utf8_is_numeric(self._pa_array)) + + def _str_isspace(self): + return type(self)(pc.utf8_is_space(self._pa_array)) + + def _str_istitle(self): + return type(self)(pc.utf8_is_title(self._pa_array)) + + def _str_isupper(self): + return type(self)(pc.utf8_is_upper(self._pa_array)) + + def _str_len(self): + return type(self)(pc.utf8_length(self._pa_array)) + + def _str_lower(self): + return type(self)(pc.utf8_lower(self._pa_array)) + + def _str_upper(self): + return type(self)(pc.utf8_upper(self._pa_array)) + + def _str_strip(self, to_strip=None): + if to_strip is None: + result = pc.utf8_trim_whitespace(self._pa_array) + else: + result = pc.utf8_trim(self._pa_array, characters=to_strip) + return type(self)(result) + + def _str_lstrip(self, to_strip=None): + if to_strip is None: + result = pc.utf8_ltrim_whitespace(self._pa_array) + else: + result = pc.utf8_ltrim(self._pa_array, characters=to_strip) + return type(self)(result) + + def _str_rstrip(self, to_strip=None): + if to_strip is None: + result = pc.utf8_rtrim_whitespace(self._pa_array) + else: + result = pc.utf8_rtrim(self._pa_array, characters=to_strip) + return type(self)(result) + + def _str_removeprefix(self, prefix: str): + if not pa_version_under13p0: + starts_with = pc.starts_with(self._pa_array, pattern=prefix) + removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix)) + result = pc.if_else(starts_with, removed, self._pa_array) + return type(self)(result) + predicate = lambda val: val.removeprefix(prefix) + result = self._apply_elementwise(predicate) + return type(self)(pa.chunked_array(result)) + + def _str_casefold(self): predicate = lambda val: val.casefold() result = self._apply_elementwise(predicate) return type(self)(pa.chunked_array(result)) - def _str_encode(self, encoding: str, errors: str = "strict") -> Self: + def _str_encode(self, encoding: str, errors: str = "strict"): predicate = lambda val: val.encode(encoding, errors) result = self._apply_elementwise(predicate) return type(self)(pa.chunked_array(result)) @@ -2514,15 +2520,13 @@ def _str_extract(self, pat: str, flags: int = 0, expand: bool = True): else: return type(self)(pc.struct_field(result, [0])) - def _str_findall(self, pat: str, flags: int = 0) -> Self: + def _str_findall(self, pat: str, flags: int = 0): regex = re.compile(pat, flags=flags) predicate = lambda val: regex.findall(val) result = self._apply_elementwise(predicate) return type(self)(pa.chunked_array(result)) - def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None): - if dtype is None: - dtype = np.bool_ + def _str_get_dummies(self, sep: str = "|"): split = pc.split_pattern(self._pa_array, sep) flattened_values = pc.list_flatten(split) uniques = flattened_values.unique() @@ -2532,34 +2536,28 @@ def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None): n_cols = len(uniques) indices = pc.index_in(flattened_values, uniques_sorted).to_numpy() indices = indices + np.arange(n_rows).repeat(lengths) * n_cols - _dtype = pandas_dtype(dtype) - dummies_dtype: NpDtype - if isinstance(_dtype, np.dtype): - dummies_dtype = _dtype - else: - dummies_dtype = np.bool_ - dummies = np.zeros(n_rows * n_cols, dtype=dummies_dtype) + dummies = np.zeros(n_rows * n_cols, dtype=np.bool_) dummies[indices] = True dummies = dummies.reshape((n_rows, n_cols)) result = type(self)(pa.array(list(dummies))) return result, uniques_sorted.to_pylist() - def _str_index(self, sub: str, start: int = 0, end: int | None = None) -> Self: + def _str_index(self, sub: str, start: int = 0, end: int | None = None): predicate = lambda val: val.index(sub, start, end) result = self._apply_elementwise(predicate) return type(self)(pa.chunked_array(result)) - def _str_rindex(self, sub: str, start: int = 0, end: int | None = None) -> Self: + def _str_rindex(self, sub: str, start: int = 0, end: int | None = None): predicate = lambda val: val.rindex(sub, start, end) result = self._apply_elementwise(predicate) return type(self)(pa.chunked_array(result)) - def _str_normalize(self, form: Literal["NFC", "NFD", "NFKC", "NFKD"]) -> Self: + def _str_normalize(self, form: str): predicate = lambda val: unicodedata.normalize(form, val) result = self._apply_elementwise(predicate) return type(self)(pa.chunked_array(result)) - def _str_rfind(self, sub: str, start: int = 0, end=None) -> Self: + def _str_rfind(self, sub: str, start: int = 0, end=None): predicate = lambda val: val.rfind(sub, start, end) result = self._apply_elementwise(predicate) return type(self)(pa.chunked_array(result)) @@ -2570,7 +2568,7 @@ def _str_split( n: int | None = -1, expand: bool = False, regex: bool | None = None, - ) -> Self: + ): if n in {-1, 0}: n = None if pat is None: @@ -2581,23 +2579,24 @@ def _str_split( split_func = functools.partial(pc.split_pattern, pattern=pat) return type(self)(split_func(self._pa_array, max_splits=n)) - def _str_rsplit(self, pat: str | None = None, n: int | None = -1) -> Self: + def _str_rsplit(self, pat: str | None = None, n: int | None = -1): if n in {-1, 0}: n = None if pat is None: return type(self)( pc.utf8_split_whitespace(self._pa_array, max_splits=n, reverse=True) ) - return type(self)( - pc.split_pattern(self._pa_array, pat, max_splits=n, reverse=True) - ) + else: + return type(self)( + pc.split_pattern(self._pa_array, pat, max_splits=n, reverse=True) + ) - def _str_translate(self, table: dict[int, str]) -> Self: + def _str_translate(self, table: dict[int, str]): predicate = lambda val: val.translate(table) result = self._apply_elementwise(predicate) return type(self)(pa.chunked_array(result)) - def _str_wrap(self, width: int, **kwargs) -> Self: + def _str_wrap(self, width: int, **kwargs): kwargs["width"] = width tw = textwrap.TextWrapper(**kwargs) predicate = lambda val: "\n".join(tw.wrap(val)) @@ -2605,87 +2604,85 @@ def _str_wrap(self, width: int, **kwargs) -> Self: return type(self)(pa.chunked_array(result)) @property - def _dt_days(self) -> Self: + def _dt_days(self): return type(self)( - pa.array( - self._to_timedeltaarray().components.days, - from_pandas=True, - type=pa.int32(), - ) + pa.array(self._to_timedeltaarray().days, from_pandas=True, type=pa.int32()) ) @property - def _dt_hours(self) -> Self: + def _dt_hours(self): return type(self)( pa.array( - self._to_timedeltaarray().components.hours, - from_pandas=True, + [ + td.components.hours if td is not NaT else None + for td in self._to_timedeltaarray() + ], type=pa.int32(), ) ) @property - def _dt_minutes(self) -> Self: + def _dt_minutes(self): return type(self)( pa.array( - self._to_timedeltaarray().components.minutes, - from_pandas=True, + [ + td.components.minutes if td is not NaT else None + for td in self._to_timedeltaarray() + ], type=pa.int32(), ) ) @property - def _dt_seconds(self) -> Self: + def _dt_seconds(self): return type(self)( pa.array( - self._to_timedeltaarray().components.seconds, - from_pandas=True, - type=pa.int32(), + self._to_timedeltaarray().seconds, from_pandas=True, type=pa.int32() ) ) @property - def _dt_milliseconds(self) -> Self: + def _dt_milliseconds(self): return type(self)( pa.array( - self._to_timedeltaarray().components.milliseconds, - from_pandas=True, + [ + td.components.milliseconds if td is not NaT else None + for td in self._to_timedeltaarray() + ], type=pa.int32(), ) ) @property - def _dt_microseconds(self) -> Self: + def _dt_microseconds(self): return type(self)( pa.array( - self._to_timedeltaarray().components.microseconds, + self._to_timedeltaarray().microseconds, from_pandas=True, type=pa.int32(), ) ) @property - def _dt_nanoseconds(self) -> Self: + def _dt_nanoseconds(self): return type(self)( pa.array( - self._to_timedeltaarray().components.nanoseconds, - from_pandas=True, - type=pa.int32(), + self._to_timedeltaarray().nanoseconds, from_pandas=True, type=pa.int32() ) ) - def _dt_to_pytimedelta(self) -> np.ndarray: + def _dt_to_pytimedelta(self): data = self._pa_array.to_pylist() if self._dtype.pyarrow_dtype.unit == "ns": data = [None if ts is None else ts.to_pytimedelta() for ts in data] return np.array(data, dtype=object) - def _dt_total_seconds(self) -> Self: + def _dt_total_seconds(self): return type(self)( pa.array(self._to_timedeltaarray().total_seconds(), from_pandas=True) ) - def _dt_as_unit(self, unit: str) -> Self: + def _dt_as_unit(self, unit: str): if pa.types.is_date(self.dtype.pyarrow_dtype): raise NotImplementedError("as_unit not implemented for date types") pd_array = self._maybe_convert_datelike_array() @@ -2693,43 +2690,43 @@ def _dt_as_unit(self, unit: str) -> Self: return type(self)(pa.array(pd_array.as_unit(unit), from_pandas=True)) @property - def _dt_year(self) -> Self: + def _dt_year(self): return type(self)(pc.year(self._pa_array)) @property - def _dt_day(self) -> Self: + def _dt_day(self): return type(self)(pc.day(self._pa_array)) @property - def _dt_day_of_week(self) -> Self: + def _dt_day_of_week(self): return type(self)(pc.day_of_week(self._pa_array)) _dt_dayofweek = _dt_day_of_week _dt_weekday = _dt_day_of_week @property - def _dt_day_of_year(self) -> Self: + def _dt_day_of_year(self): return type(self)(pc.day_of_year(self._pa_array)) _dt_dayofyear = _dt_day_of_year @property - def _dt_hour(self) -> Self: + def _dt_hour(self): return type(self)(pc.hour(self._pa_array)) - def _dt_isocalendar(self) -> Self: + def _dt_isocalendar(self): return type(self)(pc.iso_calendar(self._pa_array)) @property - def _dt_is_leap_year(self) -> Self: + def _dt_is_leap_year(self): return type(self)(pc.is_leap_year(self._pa_array)) @property - def _dt_is_month_start(self) -> Self: + def _dt_is_month_start(self): return type(self)(pc.equal(pc.day(self._pa_array), 1)) @property - def _dt_is_month_end(self) -> Self: + def _dt_is_month_end(self): result = pc.equal( pc.days_between( pc.floor_temporal(self._pa_array, unit="day"), @@ -2740,7 +2737,7 @@ def _dt_is_month_end(self) -> Self: return type(self)(result) @property - def _dt_is_year_start(self) -> Self: + def _dt_is_year_start(self): return type(self)( pc.and_( pc.equal(pc.month(self._pa_array), 1), @@ -2749,7 +2746,7 @@ def _dt_is_year_start(self) -> Self: ) @property - def _dt_is_year_end(self) -> Self: + def _dt_is_year_end(self): return type(self)( pc.and_( pc.equal(pc.month(self._pa_array), 12), @@ -2758,7 +2755,7 @@ def _dt_is_year_end(self) -> Self: ) @property - def _dt_is_quarter_start(self) -> Self: + def _dt_is_quarter_start(self): result = pc.equal( pc.floor_temporal(self._pa_array, unit="quarter"), pc.floor_temporal(self._pa_array, unit="day"), @@ -2766,7 +2763,7 @@ def _dt_is_quarter_start(self) -> Self: return type(self)(result) @property - def _dt_is_quarter_end(self) -> Self: + def _dt_is_quarter_end(self): result = pc.equal( pc.days_between( pc.floor_temporal(self._pa_array, unit="day"), @@ -2777,7 +2774,7 @@ def _dt_is_quarter_end(self) -> Self: return type(self)(result) @property - def _dt_days_in_month(self) -> Self: + def _dt_days_in_month(self): result = pc.days_between( pc.floor_temporal(self._pa_array, unit="month"), pc.ceil_temporal(self._pa_array, unit="month"), @@ -2787,38 +2784,35 @@ def _dt_days_in_month(self) -> Self: _dt_daysinmonth = _dt_days_in_month @property - def _dt_microsecond(self) -> Self: - # GH 59154 - us = pc.microsecond(self._pa_array) - ms_to_us = pc.multiply(pc.millisecond(self._pa_array), 1000) - return type(self)(pc.add(us, ms_to_us)) + def _dt_microsecond(self): + return type(self)(pc.microsecond(self._pa_array)) @property - def _dt_minute(self) -> Self: + def _dt_minute(self): return type(self)(pc.minute(self._pa_array)) @property - def _dt_month(self) -> Self: + def _dt_month(self): return type(self)(pc.month(self._pa_array)) @property - def _dt_nanosecond(self) -> Self: + def _dt_nanosecond(self): return type(self)(pc.nanosecond(self._pa_array)) @property - def _dt_quarter(self) -> Self: + def _dt_quarter(self): return type(self)(pc.quarter(self._pa_array)) @property - def _dt_second(self) -> Self: + def _dt_second(self): return type(self)(pc.second(self._pa_array)) @property - def _dt_date(self) -> Self: + def _dt_date(self): return type(self)(self._pa_array.cast(pa.date32())) @property - def _dt_time(self) -> Self: + def _dt_time(self): unit = ( self.dtype.pyarrow_dtype.unit if self.dtype.pyarrow_dtype.unit in {"us", "ns"} @@ -2834,10 +2828,10 @@ def _dt_tz(self): def _dt_unit(self): return self.dtype.pyarrow_dtype.unit - def _dt_normalize(self) -> Self: + def _dt_normalize(self): return type(self)(pc.floor_temporal(self._pa_array, 1, "day")) - def _dt_strftime(self, format: str) -> Self: + def _dt_strftime(self, format: str): return type(self)(pc.strftime(self._pa_array, format=format)) def _round_temporally( @@ -2846,7 +2840,7 @@ def _round_temporally( freq, ambiguous: TimeAmbiguous = "raise", nonexistent: TimeNonexistent = "raise", - ) -> Self: + ): if ambiguous != "raise": raise NotImplementedError("ambiguous is not supported.") if nonexistent != "raise": @@ -2882,7 +2876,7 @@ def _dt_ceil( freq, ambiguous: TimeAmbiguous = "raise", nonexistent: TimeNonexistent = "raise", - ) -> Self: + ): return self._round_temporally("ceil", freq, ambiguous, nonexistent) def _dt_floor( @@ -2890,7 +2884,7 @@ def _dt_floor( freq, ambiguous: TimeAmbiguous = "raise", nonexistent: TimeNonexistent = "raise", - ) -> Self: + ): return self._round_temporally("floor", freq, ambiguous, nonexistent) def _dt_round( @@ -2898,22 +2892,20 @@ def _dt_round( freq, ambiguous: TimeAmbiguous = "raise", nonexistent: TimeNonexistent = "raise", - ) -> Self: + ): return self._round_temporally("round", freq, ambiguous, nonexistent) - def _dt_day_name(self, locale: str | None = None) -> Self: + def _dt_day_name(self, locale: str | None = None): if locale is None: locale = "C" return type(self)(pc.strftime(self._pa_array, format="%A", locale=locale)) - def _dt_month_name(self, locale: str | None = None) -> Self: + def _dt_month_name(self, locale: str | None = None): if locale is None: locale = "C" return type(self)(pc.strftime(self._pa_array, format="%B", locale=locale)) - def _dt_to_pydatetime(self) -> Series: - from pandas import Series - + def _dt_to_pydatetime(self): if pa.types.is_date(self.dtype.pyarrow_dtype): raise ValueError( f"to_pydatetime cannot be called with {self.dtype.pyarrow_dtype} type. " @@ -2922,14 +2914,14 @@ def _dt_to_pydatetime(self) -> Series: data = self._pa_array.to_pylist() if self._dtype.pyarrow_dtype.unit == "ns": data = [None if ts is None else ts.to_pydatetime(warn=False) for ts in data] - return Series(data, dtype=object) + return np.array(data, dtype=object) def _dt_tz_localize( self, tz, ambiguous: TimeAmbiguous = "raise", nonexistent: TimeNonexistent = "raise", - ) -> Self: + ): if ambiguous != "raise": raise NotImplementedError(f"{ambiguous=} is not supported") nonexistent_pa = { @@ -2937,8 +2929,7 @@ def _dt_tz_localize( "shift_backward": "earliest", "shift_forward": "latest", }.get( - nonexistent, # type: ignore[arg-type] - None, + nonexistent, None # type: ignore[arg-type] ) if nonexistent_pa is None: raise NotImplementedError(f"{nonexistent=} is not supported") @@ -2950,7 +2941,7 @@ def _dt_tz_localize( ) return type(self)(result) - def _dt_tz_convert(self, tz) -> Self: + def _dt_tz_convert(self, tz): if self.dtype.pyarrow_dtype.tz is None: raise TypeError( "Cannot convert tz-naive timestamps, use tz_localize to localize" @@ -2970,7 +2961,7 @@ def transpose_homogeneous_pyarrow( """ arrays = list(arrays) nrows, ncols = len(arrays[0]), len(arrays) - indices = np.arange(nrows * ncols).reshape(ncols, nrows).T.reshape(-1) + indices = np.arange(nrows * ncols).reshape(ncols, nrows).T.flatten() arr = pa.chunked_array([chunk for arr in arrays for chunk in arr._pa_array.chunks]) arr = arr.take(indices) return [ArrowExtensionArray(arr.slice(i * ncols, ncols)) for i in range(nrows)]