- 
          
- 
                Notifications
    You must be signed in to change notification settings 
- Fork 19.2k
ENH: Incorporate ArrowDtype into ArrowExtensionArray #47034
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
9053263
              088f72e
              aee3dc8
              aa13af8
              d521264
              ce05407
              bf0365b
              01e4a4b
              cc1c687
              97967a5
              f2d872d
              a77ea6b
              26e8998
              a157e51
              baeae04
              c33c345
              901e9b0
              b3f6d93
              80059d5
              5c873d5
              1160bff
              68bb030
              9fd9161
              939e751
              1a5d3ff
              01ca1c7
              f2dda8c
              26b2f1c
              95bd38f
              a455b50
              8d6ebb5
              b6972a5
              0024d9e
              a18fd6f
              f6b779d
              9edb6a4
              d074188
              f8983ad
              1b6fe93
              eedffc2
              c69d70e
              245fbe6
              91aaaab
              1a44a6d
              86e178c
              4129e37
              c5d029f
              4743781
              File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | 
|---|---|---|
|  | @@ -9,6 +9,8 @@ | |
| import numpy as np | ||
|  | ||
| from pandas._typing import ( | ||
| Dtype, | ||
| PositionalIndexer, | ||
| TakeIndexer, | ||
| npt, | ||
| ) | ||
|  | @@ -24,13 +26,15 @@ | |
| is_array_like, | ||
| is_bool_dtype, | ||
| is_integer, | ||
| is_integer_dtype, | ||
| is_scalar, | ||
| ) | ||
| from pandas.core.dtypes.missing import isna | ||
|  | ||
| from pandas.core.arrays.base import ExtensionArray | ||
| from pandas.core.indexers import ( | ||
| check_array_indexer, | ||
| unpack_tuple_and_ellipses, | ||
| validate_indices, | ||
| ) | ||
|  | ||
|  | @@ -39,6 +43,7 @@ | |
| import pyarrow.compute as pc | ||
|  | ||
| from pandas.core.arrays.arrow._arrow_utils import fallback_performancewarning | ||
| from pandas.core.arrays.arrow.dtype import ArrowDtype | ||
|  | ||
| if TYPE_CHECKING: | ||
| from pandas import Series | ||
|  | @@ -48,16 +53,130 @@ | |
|  | ||
| class ArrowExtensionArray(ExtensionArray): | ||
| """ | ||
| Base class for ExtensionArray backed by Arrow array. | ||
| Base class for ExtensionArray backed by Arrow ChunkedArray. | ||
| """ | ||
|  | ||
| _data: pa.ChunkedArray | ||
|  | ||
| def __init__(self, values: pa.ChunkedArray) -> None: | ||
| self._data = values | ||
| def __init__(self, values: pa.Array | pa.ChunkedArray) -> None: | ||
| if pa_version_under1p01: | ||
| msg = "pyarrow>=1.0.0 is required for PyArrow backed ArrowExtensionArray." | ||
| raise ImportError(msg) | ||
| if isinstance(values, pa.Array): | ||
|         
                  mroeschke marked this conversation as resolved.
              Show resolved
            Hide resolved | ||
| self._data = pa.chunked_array([values]) | ||
| elif isinstance(values, pa.ChunkedArray): | ||
| self._data = values | ||
| else: | ||
| raise ValueError( | ||
| f"Unsupported type '{type(values)}' for ArrowExtensionArray" | ||
| ) | ||
| self._dtype = ArrowDtype(self._data.type) | ||
|  | ||
| @classmethod | ||
| def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False): | ||
Review comment: @mroeschke I just tried the following and got an ArrowInvalid exception. Should this work? Update: looks like just [snippet missing from this extract]. Reply: Related to #48238. I hadn't really anticipated users passing pyarrow arrays, but I suppose this should be supported. | ||
| """ | ||
| Construct a new ExtensionArray from a sequence of scalars. | ||
| """ | ||
| if isinstance(dtype, ArrowDtype): | ||
| pa_dtype = dtype.pyarrow_dtype | ||
| elif dtype: | ||
| pa_dtype = pa.from_numpy_dtype(dtype) | ||
| else: | ||
| pa_dtype = None | ||
|  | ||
| if isinstance(scalars, cls): | ||
| data = scalars._data | ||
| if pa_dtype: | ||
| data = data.cast(pa_dtype) | ||
| return cls(data) | ||
| else: | ||
| return cls( | ||
| pa.chunked_array(pa.array(scalars, type=pa_dtype, from_pandas=True)) | ||
| ) | ||
|  | ||
| @classmethod | ||
| def _from_sequence_of_strings( | ||
| cls, strings, *, dtype: Dtype | None = None, copy=False | ||
| ): | ||
| """ | ||
| Construct a new ExtensionArray from a sequence of strings. | ||
|  | ||
| Delegates to ``cls._from_sequence``, forwarding ``strings``, ``dtype`` | ||
| and ``copy`` unchanged. | ||
| """ | ||
| return cls._from_sequence(strings, dtype=dtype, copy=copy) | ||
|  | ||
| def __getitem__(self, item: PositionalIndexer): | ||
| """Select a subset of self. | ||
|  | ||
| Parameters | ||
| ---------- | ||
| item : int, slice, or ndarray | ||
| * int: The position in 'self' to get. | ||
| * slice: A slice object, where 'start', 'stop', and 'step' are | ||
| integers or None | ||
| * ndarray: A 1-d boolean NumPy ndarray the same length as 'self', | ||
| or an integer NumPy ndarray of positions (dispatched to ``take``) | ||
|  | ||
| Returns | ||
| ------- | ||
| item : scalar or ExtensionArray | ||
|  | ||
| Raises | ||
| ------ | ||
| IndexError | ||
| If ``item`` is a non-empty ndarray whose dtype is neither integer | ||
| nor boolean, or a scalar that is not an integer. | ||
|  | ||
| Notes | ||
| ----- | ||
| For scalar ``item``, return a scalar value suitable for the array's | ||
| type. This should be an instance of ``self.dtype.type``. | ||
| A scalar whose ``as_py()`` value is None is returned as | ||
| ``self._dtype.na_value``. | ||
| For slice ``item``, return an instance of ``ExtensionArray``, even | ||
| if the slice is length 0 or 1. | ||
| For a boolean mask, return an instance of ``ExtensionArray``, filtered | ||
| to the values where ``item`` is True. | ||
| An empty ndarray returns an empty array of the same class, and | ||
| ``Ellipsis`` is treated as ``slice(None)``. | ||
| """ | ||
| item = check_array_indexer(self, item) | ||
|  | ||
| if isinstance(item, np.ndarray): | ||
| if not len(item): | ||
| # Removable once we migrate StringDtype[pyarrow] to ArrowDtype[string] | ||
| if self._dtype.name == "string" and self._dtype.storage == "pyarrow": | ||
| pa_dtype = pa.string() | ||
| else: | ||
| pa_dtype = self._dtype.pyarrow_dtype | ||
| return type(self)(pa.chunked_array([], type=pa_dtype)) | ||
| elif is_integer_dtype(item.dtype): | ||
| return self.take(item) | ||
| elif is_bool_dtype(item.dtype): | ||
| return type(self)(self._data.filter(item)) | ||
| else: | ||
| raise IndexError( | ||
| "Only integers, slices and integer or " | ||
| "boolean arrays are valid indices." | ||
| ) | ||
| elif isinstance(item, tuple): | ||
| item = unpack_tuple_and_ellipses(item) | ||
|  | ||
| # error: Non-overlapping identity check (left operand type: | ||
| # "Union[Union[int, integer[Any]], Union[slice, List[int], | ||
| # ndarray[Any, Any]]]", right operand type: "ellipsis") | ||
| if item is Ellipsis: # type: ignore[comparison-overlap] | ||
| # TODO: should be handled by pyarrow? | ||
| item = slice(None) | ||
|  | ||
| if is_scalar(item) and not is_integer(item): | ||
| # e.g. "foo" or 2.5 | ||
| # exception message copied from numpy | ||
| raise IndexError( | ||
| r"only integers, slices (`:`), ellipsis (`...`), numpy.newaxis " | ||
| r"(`None`) and integer or boolean arrays are valid indices" | ||
| ) | ||
| # We are not an array indexer, so maybe e.g. a slice or integer | ||
| # indexer. We dispatch to pyarrow. | ||
| value = self._data[item] | ||
| if isinstance(value, pa.ChunkedArray): | ||
| return type(self)(value) | ||
| else: | ||
| scalar = value.as_py() | ||
| if scalar is None: | ||
| return self._dtype.na_value | ||
| else: | ||
| return scalar | ||
|  | ||
| def __arrow_array__(self, type=None): | ||
| """Convert myself to a pyarrow Array or ChunkedArray.""" | ||
| """Convert myself to a pyarrow ChunkedArray.""" | ||
| return self._data | ||
|  | ||
| def equals(self, other) -> bool: | ||
|  | @@ -67,6 +186,13 @@ def equals(self, other) -> bool: | |
| # TODO: is this documented somewhere? | ||
| return self._data == other._data | ||
|  | ||
| @property | ||
| def dtype(self) -> ArrowDtype: | ||
| """ | ||
| An instance of 'ExtensionDtype'. | ||
|  | ||
| Returns the ``ArrowDtype`` held in ``self._dtype``. | ||
| """ | ||
| return self._dtype | ||
|  | ||
| @property | ||
| def nbytes(self) -> int: | ||
| """ | ||
|  | @@ -377,7 +503,8 @@ def _indexing_key_to_indices( | |
|  | ||
| def _maybe_convert_setitem_value(self, value): | ||
| """Maybe convert value to be pyarrow compatible.""" | ||
| raise NotImplementedError() | ||
| # TODO: Make more robust like ArrowStringArray._maybe_convert_setitem_value | ||
| return value | ||
|  | ||
| def _set_via_chunk_iteration( | ||
| self, indices: npt.NDArray[np.intp], value: npt.NDArray[Any] | ||
|  | ||
Uh oh!
There was an error while loading. Please reload this page.