diff --git a/pandas/__init__.py b/pandas/__init__.py
index 3645e8744d8af..1423e65bf52b7 100644
--- a/pandas/__init__.py
+++ b/pandas/__init__.py
@@ -47,6 +47,14 @@
 
 from pandas.core.api import (
     # dtype
+    Int8ArrowDtype,
+    Int16ArrowDtype,
+    Int32ArrowDtype,
+    Int64ArrowDtype,
+    UInt8ArrowDtype,
+    UInt16ArrowDtype,
+    UInt32ArrowDtype,
+    UInt64ArrowDtype,
     Int8Dtype,
     Int16Dtype,
     Int32Dtype,
diff --git a/pandas/core/api.py b/pandas/core/api.py
index cf082d2013d3b..373003e2b42a1 100644
--- a/pandas/core/api.py
+++ b/pandas/core/api.py
@@ -27,6 +27,16 @@
     value_counts,
 )
 from pandas.core.arrays import Categorical
+from pandas.core.arrays.arrow.integer import (
+    Int8ArrowDtype,
+    Int16ArrowDtype,
+    Int32ArrowDtype,
+    Int64ArrowDtype,
+    UInt8ArrowDtype,
+    UInt16ArrowDtype,
+    UInt32ArrowDtype,
+    UInt64ArrowDtype,
+)
 from pandas.core.arrays.boolean import BooleanDtype
 from pandas.core.arrays.floating import (
     Float32Dtype,
diff --git a/pandas/core/arrays/arrow/integer.py b/pandas/core/arrays/arrow/integer.py
new file mode 100644
index 0000000000000..e7de9c03ec711
--- /dev/null
+++ b/pandas/core/arrays/arrow/integer.py
@@ -0,0 +1,152 @@
+from __future__ import annotations
+
+import pyarrow as pa
+
+from pandas.core.dtypes.base import register_extension_dtype
+
+from pandas.core.arrays.arrow.numeric import (
+    NumericArrowArray,
+    NumericArrowDtype,
+)
+
+
+class IntegerArrowDtype(NumericArrowDtype):
+    """
+    An ExtensionDtype to hold a single size & kind of integer Arrow dtype.
+
+    The concrete dtypes are subclasses of this non-public class. For example,
+    Int8ArrowDtype represents signed 8-bit integers.
+
+    The attributes name & type are set when these subclasses are created.
+    """
+
+    _default_pa_dtype = pa.int64()
+    _dtype_checker = pa.types.is_integer
+
+    @classmethod
+    def construct_array_type(cls) -> type[IntegerArrowArray]:
+        """
+        Return the array type associated with this dtype.
+
+        Returns
+        -------
+        type
+        """
+        return IntegerArrowArray
+
+    @classmethod
+    def _str_to_dtype_mapping(cls):
+        """Return the mapping from pyarrow type name to integer dtype."""
+        return INT_STR_TO_DTYPE
+
+
+class IntegerArrowArray(NumericArrowArray):
+    """
+    Array of pyarrow integer values.
+
+    To construct an IntegerArrowArray from generic array-like input, use
+    :func:`pandas.array` with one of the integer dtypes.
+
+    Parameters
+    ----------
+    values : pa.ChunkedArray
+        A 1-d integer-dtype array.
+
+    Attributes
+    ----------
+    None
+
+    Methods
+    -------
+    None
+
+    Returns
+    -------
+    IntegerArrowArray
+    """
+
+    _dtype_cls = IntegerArrowDtype
+
+
+_dtype_docstring = """
+An ExtensionDtype for {dtype} integer pyarrow data.
+
+Attributes
+----------
+None
+
+Methods
+-------
+None
+"""
+
+# create the Dtype
+
+
+@register_extension_dtype
+class Int8ArrowDtype(IntegerArrowDtype):
+    type = pa.int8()
+    name = "int8"
+    __doc__ = _dtype_docstring.format(dtype="int8")
+
+
+@register_extension_dtype
+class Int16ArrowDtype(IntegerArrowDtype):
+    type = pa.int16()
+    name = "int16"
+    __doc__ = _dtype_docstring.format(dtype="int16")
+
+
+@register_extension_dtype
+class Int32ArrowDtype(IntegerArrowDtype):
+    type = pa.int32()
+    name = "int32"
+    __doc__ = _dtype_docstring.format(dtype="int32")
+
+
+@register_extension_dtype
+class Int64ArrowDtype(IntegerArrowDtype):
+    type = pa.int64()
+    name = "int64"
+    __doc__ = _dtype_docstring.format(dtype="int64")
+
+
+@register_extension_dtype
+class UInt8ArrowDtype(IntegerArrowDtype):
+    type = pa.uint8()
+    name = "uint8"
+    __doc__ = _dtype_docstring.format(dtype="uint8")
+
+
+@register_extension_dtype
+class UInt16ArrowDtype(IntegerArrowDtype):
+    type = pa.uint16()
+    name = "uint16"
+    __doc__ = _dtype_docstring.format(dtype="uint16")
+
+
+@register_extension_dtype
+class UInt32ArrowDtype(IntegerArrowDtype):
+    type = pa.uint32()
+    name = "uint32"
+    __doc__ = _dtype_docstring.format(dtype="uint32")
+
+
+@register_extension_dtype
+class UInt64ArrowDtype(IntegerArrowDtype):
+    type = pa.uint64()
+    name = "uint64"
+    __doc__ = _dtype_docstring.format(dtype="uint64")
+
+
+# Lookup from pyarrow type name to the matching registered dtype instance.
+INT_STR_TO_DTYPE: dict[str, IntegerArrowDtype] = {
+    "int8": Int8ArrowDtype(),
+    "int16": Int16ArrowDtype(),
+    "int32": Int32ArrowDtype(),
+    "int64": Int64ArrowDtype(),
+    "uint8": UInt8ArrowDtype(),
+    "uint16": UInt16ArrowDtype(),
+    "uint32": UInt32ArrowDtype(),
+    "uint64": UInt64ArrowDtype(),
+}
diff --git a/pandas/core/arrays/arrow/numeric.py b/pandas/core/arrays/arrow/numeric.py
new file mode 100644
index 0000000000000..3b55d34f6e67b
--- /dev/null
+++ b/pandas/core/arrays/arrow/numeric.py
@@ -0,0 +1,112 @@
+from __future__ import annotations
+
+from typing import (
+    Any,
+    Callable,
+    TypeVar,
+)
+
+import pyarrow as pa
+
+from pandas.errors import AbstractMethodError
+from pandas.util._decorators import cache_readonly
+
+from pandas.core.arrays.arrow.array import ArrowExtensionArray
+from pandas.core.arrays.arrow.dtype import ArrowDtype
+
+T = TypeVar("T", bound="NumericArrowArray")
+
+
+class NumericArrowDtype(ArrowDtype):
+    """
+    Base class for the pyarrow-backed numeric extension dtypes.
+
+    Subclasses set ``_default_pa_dtype`` and ``_dtype_checker`` and
+    implement ``_str_to_dtype_mapping``.
+    """
+
+    # pyarrow type used when an array is built without an explicit dtype
+    _default_pa_dtype: pa.DataType
+    # predicate from ``pa.types`` (e.g. ``pa.types.is_integer``)
+    _dtype_checker: Callable[[Any], bool]
+
+    @property
+    def _is_numeric(self) -> bool:
+        return True
+
+    @cache_readonly
+    def is_signed_integer(self) -> bool:
+        return self.kind == "i"
+
+    @cache_readonly
+    def is_unsigned_integer(self) -> bool:
+        return self.kind == "u"
+
+    @classmethod
+    def _str_to_dtype_mapping(cls):
+        """
+        Return the mapping from pyarrow type name to dtype instance.
+
+        Must be overridden by subclasses.
+        """
+        raise AbstractMethodError(cls)
+
+
+class NumericArrowArray(ArrowExtensionArray):
+    """
+    Base class for pyarrow-backed numeric extension arrays.
+
+    Parameters
+    ----------
+    values : pa.ChunkedArray
+        Data whose pyarrow type satisfies ``_dtype_cls._dtype_checker``.
+
+    Raises
+    ------
+    TypeError
+        If ``values`` is not a ``pa.ChunkedArray`` of an accepted type.
+    """
+
+    _dtype_cls: type[NumericArrowDtype]
+
+    def __init__(self, values: pa.ChunkedArray) -> None:
+        checker = self._dtype_cls._dtype_checker
+        if not (isinstance(values, pa.ChunkedArray) and checker(values.type)):
+            # NOTE(review): ``kind`` is read from the class rather than an
+            # instance here; confirm this comparison can ever select "floating".
+            descr = (
+                "floating"
+                if self._dtype_cls.kind == "f"  # type: ignore[comparison-overlap]
+                else "integer"
+            )
+            raise TypeError(f"values should be {descr} arrow array.")
+        super().__init__(values)
+
+    @cache_readonly
+    def dtype(self) -> NumericArrowDtype:
+        """Dtype instance matching the pyarrow type of the stored data."""
+        mapping = self._dtype_cls._str_to_dtype_mapping()
+        return mapping[str(self._data.type)]
+
+    @classmethod
+    def _from_sequence(cls, scalars, *, dtype=None, copy: bool = False):
+        """
+        Build an array from a sequence of scalars.
+
+        ``dtype`` is expected to expose the pyarrow type as ``dtype.type``;
+        when omitted, ``_dtype_cls._default_pa_dtype`` is used.
+        """
+        if dtype is None:
+            # The default is already a pyarrow type, which has no ``.type``.
+            pa_type = cls._dtype_cls._default_pa_dtype
+        else:
+            pa_type = dtype.type
+        return cls(pa.chunked_array([scalars], type=pa_type))
+
+    @classmethod
+    def _from_sequence_of_strings(cls, strings, *, dtype=None, copy: bool = False):
+        """Parse ``strings`` with ``to_numeric`` and build an array from them."""
+        from pandas.core.tools.numeric import to_numeric
+
+        scalars = to_numeric(strings, errors="raise")
+        return cls._from_sequence(scalars, dtype=dtype, copy=copy)