diff --git a/pandas/__init__.py b/pandas/__init__.py
index 3645e8744d8af..7d266566528d9 100644
--- a/pandas/__init__.py
+++ b/pandas/__init__.py
@@ -55,6 +55,9 @@
     UInt16Dtype,
     UInt32Dtype,
     UInt64Dtype,
+    Float16ArrowDtype,
+    Float32ArrowDtype,
+    Float64ArrowDtype,
     Float32Dtype,
     Float64Dtype,
     CategoricalDtype,
@@ -317,6 +320,9 @@ def __getattr__(name):
     "ExcelFile",
     "ExcelWriter",
     "Flags",
+    "Float16ArrowDtype",
+    "Float32ArrowDtype",
+    "Float64ArrowDtype",
     "Float32Dtype",
     "Float64Dtype",
     "Grouper",
diff --git a/pandas/core/api.py b/pandas/core/api.py
index cf082d2013d3b..3c49cbd5a16ad 100644
--- a/pandas/core/api.py
+++ b/pandas/core/api.py
@@ -27,6 +27,11 @@
     value_counts,
 )
 from pandas.core.arrays import Categorical
+from pandas.core.arrays.arrow.floating import (
+    Float16ArrowDtype,
+    Float32ArrowDtype,
+    Float64ArrowDtype,
+)
 from pandas.core.arrays.boolean import BooleanDtype
 from pandas.core.arrays.floating import (
     Float32Dtype,
diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index fdd505e259dd9..47b4cda39962a 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -39,6 +39,7 @@
     import pyarrow.compute as pc
 
     from pandas.core.arrays.arrow._arrow_utils import fallback_performancewarning
+    from pandas.core.arrays.arrow.dtype import ArrowDtype
 
 if TYPE_CHECKING:
     from pandas import Series
@@ -52,9 +53,16 @@ class ArrowExtensionArray(ExtensionArray):
     """
 
     _data: pa.ChunkedArray
+    _dtype: ArrowDtype
 
-    def __init__(self, values: pa.ChunkedArray) -> None:
+    def __init__(
+        self, values: pa.ChunkedArray, pa_dtype: pa.DataType | None = None
+    ) -> None:
         self._data = values
+        # Fall back to the storage type so single-argument callers keep working.
+        if pa_dtype is None:
+            pa_dtype = values.type
+        self._dtype = ArrowDtype(pa_dtype=pa_dtype, storage="pyarrow")
 
     def __arrow_array__(self, type=None):
         """Convert myself to a pyarrow Array or ChunkedArray."""
@@ -468,3 +476,64 @@ def _replace_with_indices(
             return pc.if_else(mask, None, chunk)
 
         return pc.replace_with_mask(chunk, mask, value)
+
+    @property
+    def dtype(self) -> ArrowDtype:
+        """The ArrowDtype describing this array."""
+        return self._dtype
+
+    @classmethod
+    def _from_sequence_of_strings(cls, strings, *, dtype=None, copy: bool = False):
+        """
+        Construct a new array from a sequence of strings.
+
+        Strings are parsed with ``to_numeric`` or ``to_datetime`` when ``dtype``
+        is a numeric or temporal ArrowDtype; otherwise they are handed to
+        ``_from_sequence`` unchanged.
+        """
+        scalars = strings
+        if isinstance(dtype, ArrowDtype):
+            if dtype._is_numeric:
+                from pandas.core.tools.numeric import to_numeric
+
+                scalars = to_numeric(strings, errors="raise")
+            elif dtype._is_temporal:
+                from pandas.core.tools.datetimes import to_datetime
+
+                scalars = to_datetime(strings, errors="raise")
+        return cls._from_sequence(scalars, dtype=dtype, copy=copy)
+
+    def mean(self, skipna: bool = True):
+        """Compute the mean with pyarrow; raise TypeError if not numeric."""
+        if self.dtype._is_numeric:
+            return pc.mean(self._data, skip_nulls=skipna)
+        else:
+            raise TypeError("Cannot compute mean")
+
+    def max(self, skipna: bool = True):
+        """Compute the maximum with pyarrow; raise TypeError if not numeric."""
+        if self.dtype._is_numeric:
+            return pc.max(self._data, skip_nulls=skipna)
+        else:
+            raise TypeError("Cannot compute max")
+
+    def min(self, skipna: bool = True):
+        """Compute the minimum with pyarrow; raise TypeError if not numeric."""
+        if self.dtype._is_numeric:
+            return pc.min(self._data, skip_nulls=skipna)
+        else:
+            raise TypeError("Cannot compute min")
+
+    def mode(self, skipna: bool = True):
+        """Compute the mode with pyarrow; raise TypeError if not numeric."""
+        if self.dtype._is_numeric:
+            return pc.mode(self._data, skip_nulls=skipna)
+        else:
+            raise TypeError("Cannot compute mode")
+
+    def quantile(self, q: float = 0.5, interpolation: str = "linear"):
+        """Compute quantile ``q`` with pyarrow; raise TypeError if not numeric."""
+        if self.dtype._is_numeric:
+            return pc.quantile(self._data, q=q, interpolation=interpolation)
+        else:
+            raise TypeError("Cannot compute quantile")
diff --git a/pandas/core/arrays/arrow/dtype.py b/pandas/core/arrays/arrow/dtype.py
index c0ecb0856f27f..4aebb5a6f3e97 100644
--- a/pandas/core/arrays/arrow/dtype.py
+++ b/pandas/core/arrays/arrow/dtype.py
@@ -23,9 +23,49 @@ class ArrowDtype(StorageExtensionDtype):
 
     na_value = pa.NA
 
-    def __init__(self, storage="pyarrow") -> None:
+    def __init__(self, pa_dtype, storage="pyarrow") -> None:
+        self.pa_dtype = pa_dtype
+        self.storage = storage
         super().__init__(storage)
 
+    # Type predicates are properties so that ``dtype._is_numeric`` evaluates to
+    # a bool; as plain methods the bound method object would always be truthy.
+    @property
+    def _is_numeric(self) -> bool:
+        return pa.types.is_integer(self.pa_dtype) or pa.types.is_floating(self.pa_dtype)
+
+    @property
+    def _is_integer(self) -> bool:
+        return pa.types.is_integer(self.pa_dtype)
+
+    @property
+    def _is_boolean(self) -> bool:
+        return pa.types.is_boolean(self.pa_dtype)
+
+    @property
+    def _is_floating(self) -> bool:
+        return pa.types.is_floating(self.pa_dtype)
+
+    @property
+    def _is_temporal(self) -> bool:
+        return pa.types.is_temporal(self.pa_dtype)
+
+    @property
+    def _is_timestamp(self) -> bool:
+        return pa.types.is_timestamp(self.pa_dtype)
+
+    @property
+    def _is_date(self) -> bool:
+        return pa.types.is_date(self.pa_dtype)
+
+    @property
+    def _is_time(self) -> bool:
+        return pa.types.is_time(self.pa_dtype)
+
+    @property
+    def _is_string(self) -> bool:
+        return pa.types.is_string(self.pa_dtype)
+
     @cache_readonly
     def numpy_dtype(self) -> np.dtype:
         """Return an instance of the related numpy dtype"""
diff --git a/pandas/core/arrays/arrow/floating.py b/pandas/core/arrays/arrow/floating.py
new file mode 100644
index 0000000000000..504822fae5dee
--- /dev/null
+++ b/pandas/core/arrays/arrow/floating.py
@@ -0,0 +1,58 @@
+from __future__ import annotations
+
+import pyarrow as pa
+
+from pandas.core.dtypes.dtypes import register_extension_dtype
+
+from pandas.core.arrays.arrow.numeric import FloatingArrowDtype
+
+_dtype_docstring = """
+An ExtensionDtype for {dtype} data.
+
+This dtype uses ``pa.NA`` as missing value indicator.
+
+Attributes
+----------
+None
+
+Methods
+-------
+None
+"""
+
+
+@register_extension_dtype
+class Float16ArrowDtype(FloatingArrowDtype):
+    name = "float16"
+    type = pa.float16()
+    __doc__ = _dtype_docstring.format(dtype="float16")
+    # Store the predicate itself (uncalled); staticmethod keeps it unbound.
+    _dtype_checker = staticmethod(pa.types.is_float16)
+
+
+@register_extension_dtype
+class Float32ArrowDtype(FloatingArrowDtype):
+    name = "float32"
+    type = pa.float32()
+    __doc__ = _dtype_docstring.format(dtype="float32")
+    _dtype_checker = staticmethod(pa.types.is_float32)
+
+
+@register_extension_dtype
+class Float64ArrowDtype(FloatingArrowDtype):
+    name = "float64"
+    type = pa.float64()
+    __doc__ = _dtype_docstring.format(dtype="float64")
+    _dtype_checker = staticmethod(pa.types.is_float64)
+
+
+# Maps each dtype name to a dtype instance. ArrowDtype.__init__ requires the
+# pyarrow type, so it is passed explicitly.
+FLOAT_STR_TO_DTYPE: dict[str, FloatingArrowDtype] = {
+    "float16": Float16ArrowDtype(pa.float16()),
+    "float32": Float32ArrowDtype(pa.float32()),
+    "float64": Float64ArrowDtype(pa.float64()),
+}
+
+# Backward-compatible alias for the original (misnamed) mapping.
+INT_STR_TO_DTYPE = FLOAT_STR_TO_DTYPE
diff --git a/pandas/core/arrays/arrow/numeric.py b/pandas/core/arrays/arrow/numeric.py
new file mode 100644
index 0000000000000..c02b6cefec6b3
--- /dev/null
+++ b/pandas/core/arrays/arrow/numeric.py
@@ -0,0 +1,77 @@
+from __future__ import annotations
+
+from typing import (
+    Any,
+    Callable,
+    TypeVar,
+)
+
+import pyarrow as pa
+
+from pandas.errors import AbstractMethodError
+from pandas.util._decorators import cache_readonly
+
+from pandas.core.arrays.arrow.array import ArrowExtensionArray
+from pandas.core.arrays.arrow.dtype import ArrowDtype
+
+T = TypeVar("T", bound="FloatingArrowArray")
+
+
+class FloatingArrowDtype(ArrowDtype):
+    """
+    Base class for pyarrow-backed floating dtypes.
+    """
+
+    _default_pa_dtype: pa.DataType
+    _dtype_checker: Callable[[Any], bool]  # one of the pa.types.is_float* predicates
+
+    @property
+    def _is_numeric(self) -> bool:
+        return True
+
+    @property
+    def _is_floating(self) -> bool:
+        return True
+
+    @classmethod
+    def _str_to_dtype_mapping(cls):
+        raise AbstractMethodError(cls)
+
+
+class FloatingArrowArray(ArrowExtensionArray):
+    """
+    Base class for Floating dtypes.
+    """
+
+    _dtype_cls: type[FloatingArrowDtype]
+
+    def __init__(self, values: pa.ChunkedArray) -> None:
+        checker = self._dtype_cls._dtype_checker
+        if not (isinstance(values, pa.ChunkedArray) and checker(values.type)):
+            descr = "floating"
+            raise TypeError(f"values should be {descr} arrow array.")
+        super().__init__(values, pa_dtype=values.type)
+
+    @cache_readonly
+    def dtype(self) -> FloatingArrowDtype:
+        # NOTE(review): pyarrow renders float types as "halffloat"/"float"/
+        # "double"; confirm the mapping keys match str(self._data.type).
+        mapping = self._dtype_cls._str_to_dtype_mapping()
+        return mapping[str(self._data.type)]
+
+    @classmethod
+    def _from_sequence(cls, scalars, *, dtype=None, copy: bool = False):
+        # ``_default_pa_dtype`` is already a pyarrow type, whereas a dtype
+        # instance carries its pyarrow type on ``.type``.
+        if dtype is None:
+            pa_type = cls._dtype_cls._default_pa_dtype
+        else:
+            pa_type = dtype.type
+        return cls(pa.chunked_array([scalars], type=pa_type))
+
+    @classmethod
+    def _from_sequence_of_strings(cls, strings, *, dtype=None, copy: bool = False):
+        from pandas.core.tools.numeric import to_numeric
+
+        scalars = to_numeric(strings, errors="raise")
+        return cls._from_sequence(scalars, dtype=dtype, copy=copy)