Skip to content

WIP: ENH Add float[pyarrow] dtype #47027

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 22 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions pandas/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,9 @@
UInt16Dtype,
UInt32Dtype,
UInt64Dtype,
Float16ArrowDtype,
Float32ArrowDtype,
Float64ArrowDtype,
Float32Dtype,
Float64Dtype,
CategoricalDtype,
Expand Down Expand Up @@ -317,6 +320,9 @@ def __getattr__(name):
"ExcelFile",
"ExcelWriter",
"Flags",
"Float16ArrowDtype",
"Float32ArrowDtype",
"Float64ArrowDtype",
"Float32Dtype",
"Float64Dtype",
"Grouper",
Expand Down
5 changes: 5 additions & 0 deletions pandas/core/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,11 @@
value_counts,
)
from pandas.core.arrays import Categorical
from pandas.core.arrays.arrow.floating import (
Float16ArrowDtype,
Float32ArrowDtype,
Float64ArrowDtype,
)
from pandas.core.arrays.boolean import BooleanDtype
from pandas.core.arrays.floating import (
Float32Dtype,
Expand Down
53 changes: 52 additions & 1 deletion pandas/core/arrays/arrow/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
import pyarrow.compute as pc

from pandas.core.arrays.arrow._arrow_utils import fallback_performancewarning
from pandas.core.arrays.arrow.dtype import ArrowDtype

if TYPE_CHECKING:
from pandas import Series
Expand All @@ -52,9 +53,11 @@ class ArrowExtensionArray(ExtensionArray):
"""

# Underlying pyarrow storage holding the array's values.
_data: pa.ChunkedArray
# Assigned in __init__ as an ArrowDtype wrapping the pyarrow type
# (it is not a raw pa.DataType).
_dtype: ArrowDtype

def __init__(
    self,
    values: pa.ChunkedArray,
    pa_dtype: "pa.DataType | None" = None,
) -> None:
    """
    Store the pyarrow data and build the matching ``ArrowDtype``.

    Parameters
    ----------
    values : pa.ChunkedArray
        The pyarrow data backing this array.
    pa_dtype : pa.DataType, optional
        The pyarrow type to wrap in the array's ``ArrowDtype``.  When
        omitted, the type of ``values`` is used, so both the one-argument
        and the two-argument call forms are accepted.
    """
    self._data = values
    if pa_dtype is None:
        # Fall back to the type carried by the data itself.
        pa_dtype = values.type
    self._dtype = ArrowDtype(pa_dtype=pa_dtype, storage="pyarrow")

def __arrow_array__(self, type=None):
"""Convert myself to a pyarrow Array or ChunkedArray."""
Expand Down Expand Up @@ -468,3 +471,51 @@ def _replace_with_indices(
return pc.if_else(mask, None, chunk)

return pc.replace_with_mask(chunk, mask, value)

@property
def dtype(self) -> ArrowDtype:
    """Return the dtype object stored on this array as ``_dtype``."""
    stored_dtype = self._dtype
    return stored_dtype

@classmethod
def _from_sequence_of_strings(cls, strings, *, dtype=None, copy: bool = False):
    """
    Construct a new array from a sequence of strings.

    The strings are parsed according to the pyarrow type carried by
    ``dtype`` (its ``pa_dtype`` attribute) and the result is handed to
    ``cls._from_sequence``.

    Parameters
    ----------
    strings : sequence of str
        The values to parse.
    dtype : optional
        Target dtype.  If it exposes a ``pa_dtype`` that is an integer or
        floating type the strings go through ``to_numeric``; if it is a
        temporal type they go through ``to_datetime``.  Otherwise the
        strings are passed on unparsed.
    copy : bool, default False
        Forwarded to ``cls._from_sequence``.

    Raises
    ------
    ValueError
        Propagated from ``to_numeric`` / ``to_datetime`` (both are called
        with ``errors="raise"``) when a string cannot be parsed.
    """
    # A classmethod has no instance, so the target type must come from the
    # ``dtype`` argument rather than from ``self.dtype``.
    pa_type = getattr(dtype, "pa_dtype", None)

    if pa_type is not None and (
        pa.types.is_integer(pa_type) or pa.types.is_floating(pa_type)
    ):
        from pandas.core.tools.numeric import to_numeric

        scalars = to_numeric(strings, errors="raise")
    elif pa_type is not None and pa.types.is_temporal(pa_type):
        from pandas.core.tools.datetimes import to_datetime

        scalars = to_datetime(strings, errors="raise")
    else:
        # No parsing rule applies; let _from_sequence handle the raw values
        # instead of failing on an unbound local.
        scalars = strings
    return cls._from_sequence(scalars, dtype=dtype, copy=copy)

def mean(self, skipna: bool = True):
    """
    Return the mean of the array via ``pyarrow.compute.mean``.

    Parameters
    ----------
    skipna : bool, default True
        Forwarded to pyarrow as ``skip_nulls``.

    Returns
    -------
    The object returned by ``pyarrow.compute.mean`` (not converted to a
    Python scalar).

    Raises
    ------
    TypeError
        If the underlying pyarrow type is neither integer nor floating.
    """
    # Inspect the data's pyarrow type directly: ``ArrowDtype._is_numeric``
    # is a plain method, so testing the bare attribute is always truthy.
    pa_type = self._data.type
    if not (pa.types.is_integer(pa_type) or pa.types.is_floating(pa_type)):
        raise TypeError("Cannot compute mean")
    return pc.mean(self._data, skip_nulls=skipna)

def max(self, skipna: bool = True):
    """
    Return the maximum of the array via ``pyarrow.compute.max``.

    Parameters
    ----------
    skipna : bool, default True
        Forwarded to pyarrow as ``skip_nulls``.

    Returns
    -------
    The object returned by ``pyarrow.compute.max`` (not converted to a
    Python scalar).

    Raises
    ------
    TypeError
        If the underlying pyarrow type is neither integer nor floating.
    """
    # Inspect the data's pyarrow type directly: ``ArrowDtype._is_numeric``
    # is a plain method, so testing the bare attribute is always truthy.
    pa_type = self._data.type
    if not (pa.types.is_integer(pa_type) or pa.types.is_floating(pa_type)):
        raise TypeError("Cannot compute max")
    return pc.max(self._data, skip_nulls=skipna)

def min(self, skipna: bool = True):
    """
    Return the minimum of the array via ``pyarrow.compute.min``.

    Parameters
    ----------
    skipna : bool, default True
        Forwarded to pyarrow as ``skip_nulls``.

    Returns
    -------
    The object returned by ``pyarrow.compute.min`` (not converted to a
    Python scalar).

    Raises
    ------
    TypeError
        If the underlying pyarrow type is neither integer nor floating.
    """
    # Inspect the data's pyarrow type directly: ``ArrowDtype._is_numeric``
    # is a plain method, so testing the bare attribute is always truthy.
    pa_type = self._data.type
    if not (pa.types.is_integer(pa_type) or pa.types.is_floating(pa_type)):
        raise TypeError("Cannot compute min")
    return pc.min(self._data, skip_nulls=skipna)

def mode(self, skipna: bool = True):
    """
    Return the mode of the array via ``pyarrow.compute.mode``.

    Parameters
    ----------
    skipna : bool, default True
        Forwarded to pyarrow as ``skip_nulls``.

    Returns
    -------
    The object returned by ``pyarrow.compute.mode``, passed through
    unchanged.

    Raises
    ------
    TypeError
        If the underlying pyarrow type is neither integer nor floating.
    """
    # Inspect the data's pyarrow type directly: ``ArrowDtype._is_numeric``
    # is a plain method, so testing the bare attribute is always truthy.
    pa_type = self._data.type
    if not (pa.types.is_integer(pa_type) or pa.types.is_floating(pa_type)):
        raise TypeError("Cannot compute mode")
    return pc.mode(self._data, skip_nulls=skipna)

def quantile(self, q: float = 0.5, interpolation: str = "linear"):
    """
    Return quantile(s) of the array via ``pyarrow.compute.quantile``.

    Parameters
    ----------
    q : float, default 0.5
        Quantile to compute; forwarded to pyarrow as ``q``.
    interpolation : str, default "linear"
        Interpolation method; forwarded to pyarrow unchanged.

    Returns
    -------
    The object returned by ``pyarrow.compute.quantile``, passed through
    unchanged.

    Raises
    ------
    TypeError
        If the underlying pyarrow type is neither integer nor floating.
    """
    # Inspect the data's pyarrow type directly: ``ArrowDtype._is_numeric``
    # is a plain method, so testing the bare attribute is always truthy.
    pa_type = self._data.type
    if not (pa.types.is_integer(pa_type) or pa.types.is_floating(pa_type)):
        raise TypeError("Cannot compute quantile")
    return pc.quantile(self._data, q=q, interpolation=interpolation)
31 changes: 30 additions & 1 deletion pandas/core/arrays/arrow/dtype.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,38 @@ class ArrowDtype(StorageExtensionDtype):

na_value = pa.NA

def __init__(self, pa_dtype, storage="pyarrow") -> None:
    """
    Record the wrapped pyarrow type and the storage backend name.

    Parameters
    ----------
    pa_dtype
        The pyarrow type this dtype wraps; the ``_is_*`` predicates on
        this class pass it to ``pa.types``.
    storage : str, default "pyarrow"
        Storage name, also forwarded to the parent initializer.
    """
    self.pa_dtype = pa_dtype
    self.storage = storage
    super().__init__(storage)

def _is_numeric(self):
    """
    Return whether the wrapped pyarrow type is an integer or floating type.

    Uses ``pa.types.is_floating``; ``pa.types`` has no ``is_float``
    predicate, so the previous spelling raised ``AttributeError`` for every
    non-integer type.
    """
    pa_type = self.pa_dtype
    return pa.types.is_integer(pa_type) or pa.types.is_floating(pa_type)

def _is_integer(self):
    """Return the result of ``pa.types.is_integer`` for the wrapped type."""
    pa_type = self.pa_dtype
    return pa.types.is_integer(pa_type)

def _is_boolean(self):
    """Return the result of ``pa.types.is_boolean`` for the wrapped type."""
    pa_type = self.pa_dtype
    return pa.types.is_boolean(pa_type)

def _is_floating(self):
    """Return the result of ``pa.types.is_floating`` for the wrapped type."""
    pa_type = self.pa_dtype
    return pa.types.is_floating(pa_type)

def _is_temporal(self):
    """Return the result of ``pa.types.is_temporal`` for the wrapped type."""
    pa_type = self.pa_dtype
    return pa.types.is_temporal(pa_type)

def _is_timestamp(self):
    """Return the result of ``pa.types.is_timestamp`` for the wrapped type."""
    pa_type = self.pa_dtype
    return pa.types.is_timestamp(pa_type)

def _is_date(self):
    """Return the result of ``pa.types.is_date`` for the wrapped type."""
    pa_type = self.pa_dtype
    return pa.types.is_date(pa_type)

def _is_time(self):
    """Return the result of ``pa.types.is_time`` for the wrapped type."""
    pa_type = self.pa_dtype
    return pa.types.is_time(pa_type)

def _is_string(self):
    """Return the result of ``pa.types.is_string`` for the wrapped type."""
    pa_type = self.pa_dtype
    return pa.types.is_string(pa_type)

@cache_readonly
def numpy_dtype(self) -> np.dtype:
"""Return an instance of the related numpy dtype"""
Expand Down
52 changes: 52 additions & 0 deletions pandas/core/arrays/arrow/floating.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
from __future__ import annotations

import pyarrow as pa

from pandas.core.dtypes.dtypes import register_extension_dtype

from pandas.core.arrays.arrow.numeric import FloatingArrowDtype

_dtype_docstring = """
An ExtensionDtype for {dtype} data.

This dtype uses ``pa.null`` as missing value indicator.

Attributes
----------
None

Methods
-------
None
"""


@register_extension_dtype
class Float16ArrowDtype(FloatingArrowDtype):
    name = "float16"
    # NOTE(review): holds a pyarrow DataType instance, which
    # FloatingArrowArray._from_sequence reads as ``dtype.type`` -- confirm
    # this is compatible with how the rest of pandas uses ``ExtensionDtype.type``.
    type = pa.float16()
    __doc__ = _dtype_docstring.format(dtype="float16")
    # Store the predicate itself (it lives in ``pa.types`` and takes the
    # type to test); staticmethod keeps it from being bound as a method
    # when looked up on an instance.
    _dtype_checker = staticmethod(pa.types.is_float16)


@register_extension_dtype
class Float32ArrowDtype(FloatingArrowDtype):
    name = "float32"
    # NOTE(review): holds a pyarrow DataType instance, which
    # FloatingArrowArray._from_sequence reads as ``dtype.type`` -- confirm
    # this is compatible with how the rest of pandas uses ``ExtensionDtype.type``.
    type = pa.float32()
    __doc__ = _dtype_docstring.format(dtype="float32")
    # Store the predicate itself (it lives in ``pa.types`` and takes the
    # type to test); staticmethod keeps it from being bound as a method
    # when looked up on an instance.
    _dtype_checker = staticmethod(pa.types.is_float32)


@register_extension_dtype
class Float64ArrowDtype(FloatingArrowDtype):
    name = "float64"
    # NOTE(review): holds a pyarrow DataType instance, which
    # FloatingArrowArray._from_sequence reads as ``dtype.type`` -- confirm
    # this is compatible with how the rest of pandas uses ``ExtensionDtype.type``.
    type = pa.float64()
    __doc__ = _dtype_docstring.format(dtype="float64")
    # Store the predicate itself (it lives in ``pa.types`` and takes the
    # type to test); staticmethod keeps it from being bound as a method
    # when looked up on an instance.
    _dtype_checker = staticmethod(pa.types.is_float64)


# Maps a float dtype name to a ready-made dtype instance.  Each dtype is
# constructed with its pyarrow type because ArrowDtype.__init__ takes
# ``pa_dtype`` as a required argument.
# NOTE(review): FloatingArrowArray.dtype looks entries up by
# ``str(self._data.type)``; confirm pyarrow's string form of each type
# matches these keys before wiring this mapping into that lookup.
FLOAT_STR_TO_DTYPE: dict[str, FloatingArrowDtype] = {
    "float16": Float16ArrowDtype(pa.float16()),
    "float32": Float32ArrowDtype(pa.float32()),
    "float64": Float64ArrowDtype(pa.float64()),
}

# Original name, kept as an alias so existing references keep working even
# though the mapping holds float (not integer) dtypes.
INT_STR_TO_DTYPE = FLOAT_STR_TO_DTYPE
67 changes: 67 additions & 0 deletions pandas/core/arrays/arrow/numeric.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
from __future__ import annotations

from typing import (
Any,
Callable,
TypeVar,
)

import pyarrow as pa

from pandas.errors import AbstractMethodError
from pandas.util._decorators import cache_readonly

from pandas.core.arrays.arrow.array import ArrowExtensionArray
from pandas.core.arrays.arrow.dtype import ArrowDtype

# Type variable bound to FloatingArrowArray (forward reference by name);
# it is not referenced in the portion of this module visible here.
T = TypeVar("T", bound="FloatingArrowArray")


class FloatingArrowDtype(ArrowDtype):
    """
    Base class for the pyarrow-backed floating point dtypes.

    Concrete subclasses are expected to provide ``_dtype_checker`` and to
    override ``_str_to_dtype_mapping``.
    """

    # pyarrow type FloatingArrowArray._from_sequence falls back to when no
    # dtype is passed.  This is a real assignment (a bare annotation would
    # leave the attribute undefined at runtime) and a floating type, since a
    # null type would be rejected by the floating checker in
    # FloatingArrowArray.__init__.
    _default_pa_dtype: pa.DataType = pa.float64()
    # Predicate applied to a pyarrow type; set by each concrete subclass.
    _dtype_checker: Callable[[Any], bool]  # pa.types.is_<type>

    # NOTE(review): ArrowDtype defines ``_is_numeric`` as a regular method,
    # while it is a property here -- confirm which form callers rely on.
    @property
    def _is_numeric(self) -> bool:
        """Always True for floating dtypes."""
        return True

    @property
    def _is_float(self) -> bool:
        """Always True for floating dtypes."""
        return True

    @classmethod
    def _str_to_dtype_mapping(cls):
        """
        Return the mapping used by ``FloatingArrowArray.dtype``.

        Raises
        ------
        AbstractMethodError
            Always, in this base class; subclasses must override.
        """
        raise AbstractMethodError(cls)


class FloatingArrowArray(ArrowExtensionArray):
    """
    Base class for Floating dtypes.
    """

    # Dtype class paired with this array class; provides the type checker,
    # the default pyarrow type and the string-to-dtype mapping used below.
    _dtype_cls: type[FloatingArrowDtype]

    def __init__(self, values: pa.ChunkedArray) -> None:
        """
        Validate ``values`` and initialize the underlying arrow array.

        Parameters
        ----------
        values : pa.ChunkedArray
            Data whose pyarrow type must satisfy ``_dtype_cls._dtype_checker``.

        Raises
        ------
        TypeError
            If ``values`` is not a ``pa.ChunkedArray`` or its type fails the
            checker.
        """
        # Looked up on the class so the checker is called with the type only.
        checker = self._dtype_cls._dtype_checker
        if not (isinstance(values, pa.ChunkedArray) and checker(values.type)):
            descr = "floating"
            raise TypeError(f"values should be {descr} arrow array.")
        # The parent initializer also takes the pyarrow type; pass the one
        # carried by the validated data.
        super().__init__(values, values.type)

    @cache_readonly
    def dtype(self) -> FloatingArrowDtype:
        """
        Return the dtype found by looking up ``str(self._data.type)`` in
        ``_dtype_cls._str_to_dtype_mapping()``.
        """
        # NOTE(review): the base FloatingArrowDtype._str_to_dtype_mapping
        # raises AbstractMethodError, so ``_dtype_cls`` must override it.
        mapping = self._dtype_cls._str_to_dtype_mapping()
        return mapping[str(self._data.type)]

    @classmethod
    def _from_sequence(cls, scalars, *, dtype=None, copy: bool = False):
        """
        Build an array from ``scalars``.

        Parameters
        ----------
        scalars : sequence
            Values placed into a single-chunk ``pa.chunked_array``.
        dtype : optional
            Dtype object whose ``type`` attribute is the pyarrow type to use.
            When omitted, ``_dtype_cls._default_pa_dtype`` is used if it is
            defined; otherwise pyarrow infers the type.
        copy : bool, default False
            Accepted for interface compatibility; not used here.
        """
        if dtype is None:
            # The default is already a pyarrow type, so it is used as-is
            # rather than read through ``.type``.
            pa_type = getattr(cls._dtype_cls, "_default_pa_dtype", None)
        else:
            pa_type = dtype.type
        return cls(pa.chunked_array([scalars], type=pa_type))

    @classmethod
    def _from_sequence_of_strings(cls, strings, *, dtype=None, copy: bool = False):
        """
        Parse ``strings`` with ``to_numeric`` (``errors="raise"``) and build
        the array through ``_from_sequence``.
        """
        from pandas.core.tools.numeric import to_numeric

        scalars = to_numeric(strings, errors="raise")
        return cls._from_sequence(scalars, dtype=dtype, copy=copy)