Skip to content

ENH: Incorporate ArrowDtype into ArrowExtensionArray #47034

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 48 commits into from
Jun 9, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
48 commits
Select commit Hold shift + click to select a range
9053263
Add other dtype attributes
mroeschke May 16, 2022
088f72e
Merge remote-tracking branch 'upstream/main' into enh/arrowdtype_support
mroeschke May 16, 2022
aee3dc8
add pa_type in the constructor and modify methods of needed
mroeschke May 16, 2022
aa13af8
Have ArrowExtensionArray support ArrowDtype
mroeschke May 16, 2022
d521264
Fix tests
mroeschke May 16, 2022
ce05407
add ImportError raise
mroeschke May 17, 2022
bf0365b
Just partial match
mroeschke May 17, 2022
01e4a4b
Address typing
mroeschke May 17, 2022
cc1c687
Merge remote-tracking branch 'upstream/main' into enh/arrowdtype_support
mroeschke May 17, 2022
97967a5
Merge remote-tracking branch 'upstream/main' into enh/arrowdtype_support
mroeschke May 17, 2022
f2d872d
Merge remote-tracking branch 'upstream/main' into enh/arrowdtype_support
mroeschke May 19, 2022
a77ea6b
Merge remote-tracking branch 'upstream/main' into enh/arrowdtype_support
mroeschke May 19, 2022
26e8998
Merge remote-tracking branch 'upstream/main' into enh/arrowdtype_support
mroeschke May 19, 2022
a157e51
Merge remote-tracking branch 'upstream/main' into enh/arrowdtype_support
mroeschke May 20, 2022
baeae04
Merge remote-tracking branch 'upstream/main' into enh/arrowdtype_support
mroeschke May 23, 2022
c33c345
Complete more methods of extentionarrow
mroeschke May 24, 2022
901e9b0
Add types and first test
mroeschke May 24, 2022
b3f6d93
Fix getitem type thing
mroeschke May 24, 2022
80059d5
Merge remote-tracking branch 'upstream/main' into enh/arrowdtype_support
mroeschke May 24, 2022
5c873d5
Try import or skip:
mroeschke May 24, 2022
1160bff
Merge remote-tracking branch 'upstream/main' into enh/arrowdtype_support
mroeschke May 24, 2022
68bb030
Fix typo
mroeschke May 24, 2022
9fd9161
Fix data size, coercion of pa.NA in lists
mroeschke May 24, 2022
939e751
change pa_dtype to pyarrow dtype
mroeschke May 25, 2022
1a5d3ff
Address more tests
mroeschke May 25, 2022
01ca1c7
Merge remote-tracking branch 'upstream/main' into enh/arrowdtype_support
mroeschke May 25, 2022
f2dda8c
Add register_extension_dtype
mroeschke May 25, 2022
26b2f1c
Address Joris' comments
mroeschke May 25, 2022
95bd38f
Revert to self.name, xfail the dtype test due to conflict
mroeschke May 25, 2022
a455b50
Add getitem tests
mroeschke May 25, 2022
8d6ebb5
Merge remote-tracking branch 'upstream/main' into enh/arrowdtype_support
mroeschke May 25, 2022
b6972a5
Add conditions when fails for other pyarrow versions
mroeschke May 25, 2022
0024d9e
Merge remote-tracking branch 'upstream/main' into enh/arrowdtype_support
mroeschke May 26, 2022
a18fd6f
Fix docstring validate
mroeschke May 26, 2022
f6b779d
Merge remote-tracking branch 'upstream/main' into enh/arrowdtype_support
mroeschke May 26, 2022
9edb6a4
Fix typing errors
mroeschke May 26, 2022
d074188
Merge remote-tracking branch 'upstream/main' into enh/arrowdtype_support
mroeschke May 27, 2022
f8983ad
Merge remote-tracking branch 'upstream/main' into enh/arrowdtype_support
mroeschke May 30, 2022
1b6fe93
Merge remote-tracking branch 'upstream/main' into enh/arrowdtype_support
mroeschke May 31, 2022
eedffc2
Merge remote-tracking branch 'upstream/main' into enh/arrowdtype_support
mroeschke May 31, 2022
c69d70e
Remove incorrectly implemented _from_factorized
mroeschke May 31, 2022
245fbe6
Add notimplementederror for construct_from_string with parameters
mroeschke May 31, 2022
91aaaab
Merge remote-tracking branch 'upstream/main' into enh/arrowdtype_support
mroeschke Jun 1, 2022
1a44a6d
Address review
mroeschke Jun 1, 2022
86e178c
Add pyarrow_dtype to _metadata
mroeschke Jun 1, 2022
4129e37
Merge remote-tracking branch 'upstream/main' into enh/arrowdtype_support
mroeschke Jun 1, 2022
c5d029f
Address typing and fix data fixture
mroeschke Jun 1, 2022
4743781
Merge remote-tracking branch 'upstream/main' into enh/arrowdtype_support
mroeschke Jun 5, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 40 additions & 0 deletions pandas/_testing/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
)

from pandas._typing import Dtype
from pandas.compat import pa_version_under1p01

from pandas.core.dtypes.common import (
is_float_dtype,
Expand Down Expand Up @@ -193,6 +194,45 @@
]
]

if not pa_version_under1p01:
    # pyarrow is an optional dependency: the pyarrow-backed dtype lists below
    # are only defined when a supported pyarrow version is installed.
    import pyarrow as pa

    UNSIGNED_INT_PYARROW_DTYPES = [pa.uint8(), pa.uint16(), pa.uint32(), pa.uint64()]
    # Signed widths only; the unsigned types live in the list above so that
    # ALL_INT_PYARROW_DTYPES contains every integer width exactly once.
    SIGNED_INT_PYARROW_DTYPES = [pa.int8(), pa.int16(), pa.int32(), pa.int64()]
    ALL_INT_PYARROW_DTYPES = UNSIGNED_INT_PYARROW_DTYPES + SIGNED_INT_PYARROW_DTYPES

    FLOAT_PYARROW_DTYPES = [pa.float32(), pa.float64()]
    # NOTE: the string types are defined here but are not part of
    # ALL_PYARROW_DTYPES below.
    STRING_PYARROW_DTYPES = [pa.string(), pa.utf8()]

    TIME_PYARROW_DTYPES = [
        pa.time32("s"),
        pa.time32("ms"),
        pa.time64("us"),
        pa.time64("ns"),
    ]
    DATE_PYARROW_DTYPES = [pa.date32(), pa.date64()]
    # Every unit/timezone combination, including timezone-naive (tz=None).
    DATETIME_PYARROW_DTYPES = [
        pa.timestamp(unit=unit, tz=tz)
        for unit in ["s", "ms", "us", "ns"]
        for tz in [None, "UTC", "US/Pacific", "US/Eastern"]
    ]
    TIMEDELTA_PYARROW_DTYPES = [pa.duration(unit) for unit in ["s", "ms", "us", "ns"]]

    BOOL_PYARROW_DTYPES = [pa.bool_()]

    # TODO: Add container like pyarrow types:
    #  https://arrow.apache.org/docs/python/api/datatypes.html#factory-functions
    ALL_PYARROW_DTYPES = (
        ALL_INT_PYARROW_DTYPES
        + FLOAT_PYARROW_DTYPES
        + TIME_PYARROW_DTYPES
        + DATE_PYARROW_DTYPES
        + DATETIME_PYARROW_DTYPES
        + TIMEDELTA_PYARROW_DTYPES
        + BOOL_PYARROW_DTYPES
    )


EMPTY_STRING_PATTERN = re.compile("^$")

# set testing_mode
Expand Down
137 changes: 132 additions & 5 deletions pandas/core/arrays/arrow/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
import numpy as np

from pandas._typing import (
Dtype,
PositionalIndexer,
TakeIndexer,
npt,
)
Expand All @@ -24,13 +26,15 @@
is_array_like,
is_bool_dtype,
is_integer,
is_integer_dtype,
is_scalar,
)
from pandas.core.dtypes.missing import isna

from pandas.core.arrays.base import ExtensionArray
from pandas.core.indexers import (
check_array_indexer,
unpack_tuple_and_ellipses,
validate_indices,
)

Expand All @@ -39,6 +43,7 @@
import pyarrow.compute as pc

from pandas.core.arrays.arrow._arrow_utils import fallback_performancewarning
from pandas.core.arrays.arrow.dtype import ArrowDtype

if TYPE_CHECKING:
from pandas import Series
Expand All @@ -48,16 +53,130 @@

class ArrowExtensionArray(ExtensionArray):
"""
Base class for ExtensionArray backed by Arrow array.
Base class for ExtensionArray backed by Arrow ChunkedArray.
"""

_data: pa.ChunkedArray

def __init__(self, values: pa.Array | pa.ChunkedArray) -> None:
    """
    Wrap pyarrow data in an ArrowExtensionArray.

    Parameters
    ----------
    values : pyarrow.Array or pyarrow.ChunkedArray
        The data to store. A ``pyarrow.Array`` is wrapped in a single-chunk
        ``pyarrow.ChunkedArray``; a ``pyarrow.ChunkedArray`` is stored as is.

    Raises
    ------
    ImportError
        If the installed pyarrow is older than 1.0.0.
    ValueError
        If ``values`` is neither a ``pyarrow.Array`` nor a
        ``pyarrow.ChunkedArray``.
    """
    if pa_version_under1p01:
        msg = "pyarrow>=1.0.0 is required for PyArrow backed ArrowExtensionArray."
        raise ImportError(msg)
    if isinstance(values, pa.Array):
        # Storage is always chunked so the rest of the class can rely on it.
        self._data = pa.chunked_array([values])
    elif isinstance(values, pa.ChunkedArray):
        self._data = values
    else:
        raise ValueError(
            f"Unsupported type '{type(values)}' for ArrowExtensionArray"
        )
    # The pandas dtype is derived from the pyarrow type of the stored data.
    self._dtype = ArrowDtype(self._data.type)

@classmethod
def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False):
    """
    Construct a new ExtensionArray from a sequence of scalars.

    Parameters
    ----------
    scalars : sequence, ArrowExtensionArray, pyarrow.Array or pyarrow.ChunkedArray
        The values to store. Existing arrow-backed data is reused rather than
        converted element by element.
    dtype : ArrowDtype or numpy-compatible dtype, optional
        Target type. An ``ArrowDtype`` supplies its ``pyarrow_dtype``; any
        other non-None value is translated with ``pa.from_numpy_dtype``.
        When omitted, pyarrow infers the type.
    copy : bool, default False
        Accepted for interface compatibility; not used by this method.

    Returns
    -------
    ArrowExtensionArray

    Notes
    -----
    From the review discussion on this method (related to #48238): calling
    ``ArrowExtensionArray._from_sequence(pa.array([1, 2, 3]))`` raised
    ``ArrowInvalid`` although ``__init__`` accepted the same input. Passing
    pyarrow arrays had not been anticipated, but should be supported, so
    they are now handed to the constructor directly.
    """
    if isinstance(dtype, ArrowDtype):
        pa_dtype = dtype.pyarrow_dtype
    elif dtype is not None:
        # Compare against None explicitly instead of relying on truthiness,
        # so a falsy-but-valid dtype object is not silently ignored.
        pa_dtype = pa.from_numpy_dtype(dtype)
    else:
        pa_dtype = None

    if isinstance(scalars, cls):
        data = scalars._data
    elif isinstance(scalars, (pa.Array, pa.ChunkedArray)):
        # Already arrow data: the constructor accepts both types, so skip
        # the pa.array(...) round trip.
        data = scalars
    else:
        return cls(
            pa.chunked_array(pa.array(scalars, type=pa_dtype, from_pandas=True))
        )

    if pa_dtype is not None:
        data = data.cast(pa_dtype)
    return cls(data)

@classmethod
def _from_sequence_of_strings(
    cls, strings, *, dtype: Dtype | None = None, copy=False
):
    """
    Build a new array from a sequence of strings.

    This is a thin wrapper: ``strings``, ``dtype`` and ``copy`` are passed
    unchanged to ``_from_sequence``, whose result is returned.
    """
    converted = cls._from_sequence(strings, dtype=dtype, copy=copy)
    return converted

def __getitem__(self, item: PositionalIndexer):
    """
    Return the element or sub-array selected by ``item``.

    Parameters
    ----------
    item : int, slice, tuple, Ellipsis or array-like
        The indexer, validated first with ``check_array_indexer``.

    Returns
    -------
    scalar or ExtensionArray
        * NumPy array indexer of length 0: an empty array of this dtype.
        * Integer NumPy array: the result of ``self.take``.
        * Boolean NumPy array: the stored data filtered by the mask.
        * Anything else is forwarded to the underlying pyarrow data; a
          ``pyarrow.ChunkedArray`` result is re-wrapped, otherwise the
          Python value of the scalar is returned, with ``None`` replaced
          by the dtype's ``na_value``.

    Raises
    ------
    IndexError
        For a non-empty NumPy array that is neither integer nor boolean,
        or for a scalar that is not an integer.
    """
    key = check_array_indexer(self, item)

    if isinstance(key, np.ndarray):
        if len(key) == 0:
            # Removable once we migrate StringDtype[pyarrow] to ArrowDtype[string]
            is_pyarrow_string = (
                self._dtype.name == "string" and self._dtype.storage == "pyarrow"
            )
            pa_dtype = (
                pa.string() if is_pyarrow_string else self._dtype.pyarrow_dtype
            )
            return type(self)(pa.chunked_array([], type=pa_dtype))
        if is_integer_dtype(key.dtype):
            return self.take(key)
        if is_bool_dtype(key.dtype):
            return type(self)(self._data.filter(key))
        raise IndexError(
            "Only integers, slices and integer or "
            "boolean arrays are valid indices."
        )

    if isinstance(key, tuple):
        key = unpack_tuple_and_ellipses(key)

    # error: Non-overlapping identity check (left operand type:
    # "Union[Union[int, integer[Any]], Union[slice, List[int],
    # ndarray[Any, Any]]]", right operand type: "ellipsis")
    if key is Ellipsis:  # type: ignore[comparison-overlap]
        # TODO: should be handled by pyarrow?
        key = slice(None)

    if is_scalar(key) and not is_integer(key):
        # e.g. "foo" or 2.5
        # exception message copied from numpy
        raise IndexError(
            r"only integers, slices (`:`), ellipsis (`...`), numpy.newaxis "
            r"(`None`) and integer or boolean arrays are valid indices"
        )

    # Not an array indexer, so e.g. a slice or an integer: let pyarrow
    # do the lookup.
    selected = self._data[key]
    if isinstance(selected, pa.ChunkedArray):
        return type(self)(selected)

    py_value = selected.as_py()
    return self._dtype.na_value if py_value is None else py_value

def __arrow_array__(self, type=None):
"""Convert myself to a pyarrow Array or ChunkedArray."""
"""Convert myself to a pyarrow ChunkedArray."""
return self._data

def equals(self, other) -> bool:
Expand All @@ -67,6 +186,13 @@ def equals(self, other) -> bool:
# TODO: is this documented somewhere?
return self._data == other._data

@property
def dtype(self) -> ArrowDtype:
    """
    The dtype of this array.

    Returns the object stored in ``self._dtype``.
    """
    return self._dtype

@property
def nbytes(self) -> int:
"""
Expand Down Expand Up @@ -377,7 +503,8 @@ def _indexing_key_to_indices(

def _maybe_convert_setitem_value(self, value):
"""Maybe convert value to be pyarrow compatible."""
raise NotImplementedError()
# TODO: Make more robust like ArrowStringArray._maybe_convert_setitem_value
return value

def _set_via_chunk_iteration(
self, indices: npt.NDArray[np.intp], value: npt.NDArray[Any]
Expand Down
Loading