From 9053263c4c82af45f274c7bd230b61bdc2ed4621 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Sun, 15 May 2022 17:55:52 -0700 Subject: [PATCH 01/27] Add other dtype attributes --- pandas/core/arrays/arrow/dtype.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/arrow/dtype.py b/pandas/core/arrays/arrow/dtype.py index c0ecb0856f27f..06eb6102bfbe7 100644 --- a/pandas/core/arrays/arrow/dtype.py +++ b/pandas/core/arrays/arrow/dtype.py @@ -68,12 +68,30 @@ def construct_from_string(cls, string: str): return cls(storage="pyarrow") raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'") + @property + def _is_numeric(self) -> bool: + """ + Whether columns with this dtype should be considered numeric. + """ + # TODO: pa.types.is_boolean? + return ( + pa.types.is_integer(self.type) + or pa.types.is_floating(self.type) + or pa.types.is_decimal(self.type) + ) + + @property + def _is_boolean(self) -> bool: + """ + Whether this dtype should be considered boolean. + """ + return pa.types.is_boolean(self.type) + @classmethod def from_numpy_dtype(cls, dtype: np.dtype) -> ArrowDtype: """ Construct the ArrowDtype corresponding to the given numpy dtype. """ - # TODO: This may be incomplete pa_dtype = pa.from_numpy_dtype(dtype) if pa_dtype is cls.type: return cls() @@ -82,6 +100,7 @@ def from_numpy_dtype(cls, dtype: np.dtype) -> ArrowDtype: def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None: # We unwrap any masked dtypes, find the common dtype we would use # for that, then re-mask the result. 
+ # Mirrors BaseMaskedDtype from pandas.core.dtypes.cast import find_common_type new_dtype = find_common_type( From aee3dc8bef39f0fff3d00e1117bfa0a0339c2fa3 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Mon, 16 May 2022 11:02:06 -0700 Subject: [PATCH 02/27] add pa_type in the constructor and modify methods of needed --- pandas/core/arrays/arrow/dtype.py | 64 +++++++++++++++++-------------- 1 file changed, 35 insertions(+), 29 deletions(-) diff --git a/pandas/core/arrays/arrow/dtype.py b/pandas/core/arrays/arrow/dtype.py index 06eb6102bfbe7..30d45fc6cc316 100644 --- a/pandas/core/arrays/arrow/dtype.py +++ b/pandas/core/arrays/arrow/dtype.py @@ -13,18 +13,31 @@ class ArrowDtype(StorageExtensionDtype): """ - Base class for dtypes for BaseArrowArray subclasses. + Base class for dtypes for ArrowExtensionArray. Modeled after BaseMaskedDtype """ - name: str - base = None - type: pa.DataType - na_value = pa.NA - def __init__(self, storage="pyarrow") -> None: - super().__init__(storage) + def __init__(self, pa_dtype: pa.DataType) -> None: + super().__init__("pyarrow") + if not isinstance(pa_dtype, pa.DataType): + raise ValueError("pa_dtype must be an instance of a pyarrow.DataType") + self.pa_dtype = pa_dtype + + @property + def type(self): + """ + The scalar type for the array, e.g. ``int`` + """ + return self.pa_dtype + + @property + def name(self) -> str: + """ + A string identifying the data type. + """ + return str(self.pa_dtype) @cache_readonly def numpy_dtype(self) -> np.dtype: @@ -59,14 +72,20 @@ def construct_from_string(cls, string: str): Parameters ---------- string : str + string should follow the format f"{pyarrow_type}[pyarrow]" + e.g. 
int64[pyarrow] """ if not isinstance(string, str): raise TypeError( f"'construct_from_string' expects a string, got {type(string)}" ) - if string == f"{cls.name}[pyarrow]": - return cls(storage="pyarrow") - raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'") + if not string.endswith("[pyarrow]"): + raise TypeError(f"string {string} must end with '[pyarrow]'") + base_type = string.split("[pyarrow]")[0] + pa_dtype = getattr(pa, base_type, None) + if pa_dtype is None: + raise TypeError(f"'{base_type}' is not a valid pyarrow data type.") + return cls(pa_dtype()) @property def _is_numeric(self) -> bool: @@ -75,9 +94,9 @@ def _is_numeric(self) -> bool: """ # TODO: pa.types.is_boolean? return ( - pa.types.is_integer(self.type) - or pa.types.is_floating(self.type) - or pa.types.is_decimal(self.type) + pa.types.is_integer(self.pa_dtype) + or pa.types.is_floating(self.pa_dtype) + or pa.types.is_decimal(self.pa_dtype) ) @property @@ -85,17 +104,7 @@ def _is_boolean(self) -> bool: """ Whether this dtype should be considered boolean. """ - return pa.types.is_boolean(self.type) - - @classmethod - def from_numpy_dtype(cls, dtype: np.dtype) -> ArrowDtype: - """ - Construct the ArrowDtype corresponding to the given numpy dtype. - """ - pa_dtype = pa.from_numpy_dtype(dtype) - if pa_dtype is cls.type: - return cls() - raise NotImplementedError(dtype) + return pa.types.is_boolean(self.pa_dtype) def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None: # We unwrap any masked dtypes, find the common dtype we would use @@ -110,12 +119,9 @@ def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None: ] ) if not isinstance(new_dtype, np.dtype): - # If we ever support e.g. 
Masked[DatetimeArray] then this will change - return None - try: - return type(self).from_numpy_dtype(new_dtype) - except (KeyError, NotImplementedError): return None + pa_dtype = pa.from_numpy_dtype(new_dtype) + return type(self)(pa_dtype) def __from_arrow__(self, array: pa.Array | pa.ChunkedArray): """ From aa13af86a01e45d1f95e45fd109b187c788e40f9 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Mon, 16 May 2022 11:49:51 -0700 Subject: [PATCH 03/27] Have ArrowExtensionArray support ArrowDtype --- pandas/core/arrays/arrow/array.py | 13 +++++++++++-- pandas/core/arrays/arrow/dtype.py | 4 ++-- pandas/core/arrays/string_arrow.py | 8 ++------ 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index fdd505e259dd9..cddd213a961de 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -39,6 +39,7 @@ import pyarrow.compute as pc from pandas.core.arrays.arrow._arrow_utils import fallback_performancewarning + from pandas.core.arrays.arrow.dtype import ArrowDtype if TYPE_CHECKING: from pandas import Series @@ -54,10 +55,18 @@ class ArrowExtensionArray(ExtensionArray): _data: pa.ChunkedArray def __init__(self, values: pa.ChunkedArray) -> None: - self._data = values + if isinstance(values, pa.Array): + self._data = pa.chunked_array([values]) + elif isinstance(values, pa.ChunkedArray): + self._data = values + else: + raise ValueError( + f"Unsupported type '{type(values)}' for ArrowExtensionArray" + ) + self._dtype = ArrowDtype(self._data.type) def __arrow_array__(self, type=None): - """Convert myself to a pyarrow Array or ChunkedArray.""" + """Convert myself to a pyarrow ChunkedArray.""" return self._data def equals(self, other) -> bool: diff --git a/pandas/core/arrays/arrow/dtype.py b/pandas/core/arrays/arrow/dtype.py index 30d45fc6cc316..bb5560d9cd4d4 100644 --- a/pandas/core/arrays/arrow/dtype.py +++ b/pandas/core/arrays/arrow/dtype.py @@ -8,8 +8,6 @@ 
from pandas.core.dtypes.base import StorageExtensionDtype -from pandas.core.arrays.arrow import ArrowExtensionArray - class ArrowDtype(StorageExtensionDtype): """ @@ -62,6 +60,8 @@ def construct_array_type(cls): ------- type """ + from pandas.core.arrays.arrow import ArrowExtensionArray + return ArrowExtensionArray @classmethod diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 8b6f1ffcfa59b..7e8fd211e5e56 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -132,13 +132,9 @@ class ArrowStringArray( """ def __init__(self, values) -> None: + super().__init__(values) + # TODO: Migrate to ArrowDtype instead self._dtype = StringDtype(storage="pyarrow") - if isinstance(values, pa.Array): - self._data = pa.chunked_array([values]) - elif isinstance(values, pa.ChunkedArray): - self._data = values - else: - raise ValueError(f"Unsupported type '{type(values)}' for ArrowStringArray") if not pa.types.is_string(self._data.type): raise ValueError( From d521264672172ed16828369c4590f5ff7ef5b33d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Mon, 16 May 2022 13:06:39 -0700 Subject: [PATCH 04/27] Fix tests --- pandas/tests/arrays/string_/test_string.py | 2 +- pandas/tests/arrays/string_/test_string_arrow.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 5442f96ab2d22..86d6d88280d64 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -260,7 +260,7 @@ def test_constructor_raises(cls): if cls is pd.arrays.StringArray: msg = "StringArray requires a sequence of strings or pandas.NA" else: - msg = "Unsupported type '' for ArrowStringArray" + msg = "Unsupported type '' for ArrowExtensionArray" with pytest.raises(ValueError, match=msg): cls(np.array(["a", "b"], dtype="S1")) diff --git a/pandas/tests/arrays/string_/test_string_arrow.py 
b/pandas/tests/arrays/string_/test_string_arrow.py index de1b7a9c603af..45fa8d0ac14c8 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -59,7 +59,7 @@ def test_constructor_not_string_type_raises(array, chunked): pytest.skip("chunked not applicable to numpy array") arr = pa.chunked_array(arr) if array is np: - msg = "Unsupported type '' for ArrowStringArray" + msg = "Unsupported type '' for ArrowExtensionArray" else: msg = re.escape( "ArrowStringArray requires a PyArrow (chunked) array of string type" From ce0540745edc9e49bfdfff7a9c665da2c539f66d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Mon, 16 May 2022 18:06:34 -0700 Subject: [PATCH 05/27] add impoterror raise --- pandas/core/arrays/arrow/array.py | 7 +++++-- pandas/core/arrays/string_arrow.py | 2 +- pandas/tests/arrays/string_/test_string_arrow.py | 2 +- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index cddd213a961de..e0cfe1aa4c913 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -49,12 +49,15 @@ class ArrowExtensionArray(ExtensionArray): """ - Base class for ExtensionArray backed by Arrow array. + Base class for ExtensionArray backed by Arrow ChunkedArray. """ _data: pa.ChunkedArray - def __init__(self, values: pa.ChunkedArray) -> None: + def __init__(self, values: pa.Array | pa.ChunkedArray) -> None: + if pa_version_under1p01: + msg = "pyarrow>=1.0.0 is required for PyArrow backed ArrowExtensionArray." 
+ raise ImportError(msg) if isinstance(values, pa.Array): self._data = pa.chunked_array([values]) elif isinstance(values, pa.ChunkedArray): diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 7e8fd211e5e56..537b791ffc02c 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -76,7 +76,7 @@ def _chk_pyarrow_available() -> None: if pa_version_under1p01: - msg = "pyarrow>=1.0.0 is required for PyArrow backed StringArray." + msg = "pyarrow>=1.0.0 is required for PyArrow backed ArrowExtensionArray." raise ImportError(msg) diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index 45fa8d0ac14c8..0ae8a51782f5c 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -122,7 +122,7 @@ def test_from_sequence_wrong_dtype_raises(): reason="pyarrow is installed", ) def test_pyarrow_not_installed_raises(): - msg = re.escape("pyarrow>=1.0.0 is required for PyArrow backed StringArray") + msg = re.escape("pyarrow>=1.0.0 is required for PyArrow backed ArrowExtensionArray") with pytest.raises(ImportError, match=msg): StringDtype(storage="pyarrow") From bf0365bec1c80575eacb8708e5952aa0d7338add Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Mon, 16 May 2022 18:07:55 -0700 Subject: [PATCH 06/27] Just partial match --- pandas/tests/arrays/string_/test_string_arrow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index 0ae8a51782f5c..f43cf298857a0 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -122,7 +122,7 @@ def test_from_sequence_wrong_dtype_raises(): reason="pyarrow is installed", ) def test_pyarrow_not_installed_raises(): - msg = re.escape("pyarrow>=1.0.0 is required for PyArrow backed 
ArrowExtensionArray") + msg = re.escape("pyarrow>=1.0.0 is required for PyArrow backed") with pytest.raises(ImportError, match=msg): StringDtype(storage="pyarrow") From 01e4a4baae7bf83440a6cb47566b28cc7dcdbb6b Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Mon, 16 May 2022 22:20:17 -0700 Subject: [PATCH 07/27] Address typing --- pandas/core/arrays/arrow/dtype.py | 2 +- pandas/tests/extension/arrow/arrays.py | 4 ++-- pandas/tests/extension/arrow/test_timestamp.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/arrays/arrow/dtype.py b/pandas/core/arrays/arrow/dtype.py index bb5560d9cd4d4..8ba8db5ba56f4 100644 --- a/pandas/core/arrays/arrow/dtype.py +++ b/pandas/core/arrays/arrow/dtype.py @@ -31,7 +31,7 @@ def type(self): return self.pa_dtype @property - def name(self) -> str: + def name(self) -> str: # type: ignore[override] """ A string identifying the data type. """ diff --git a/pandas/tests/extension/arrow/arrays.py b/pandas/tests/extension/arrow/arrays.py index d19a6245809be..22595c4e461d7 100644 --- a/pandas/tests/extension/arrow/arrays.py +++ b/pandas/tests/extension/arrow/arrays.py @@ -185,7 +185,7 @@ def __init__(self, values) -> None: assert values.type == pa.bool_() self._data = values - self._dtype = ArrowBoolDtype() + self._dtype = ArrowBoolDtype() # type: ignore[assignment] class ArrowStringArray(ArrowExtensionArray): @@ -195,4 +195,4 @@ def __init__(self, values) -> None: assert values.type == pa.string() self._data = values - self._dtype = ArrowStringDtype() + self._dtype = ArrowStringDtype() # type: ignore[assignment] diff --git a/pandas/tests/extension/arrow/test_timestamp.py b/pandas/tests/extension/arrow/test_timestamp.py index b2750784ab3d6..5b81940e5a6c0 100644 --- a/pandas/tests/extension/arrow/test_timestamp.py +++ b/pandas/tests/extension/arrow/test_timestamp.py @@ -46,7 +46,7 @@ def __init__(self, values) -> None: assert values.type == pa.timestamp("us") self._data = values - self._dtype = 
ArrowTimestampUSDtype() + self._dtype = ArrowTimestampUSDtype() # type: ignore[assignment] def test_constructor_extensionblock(): From c33c345d6a89e9dbb39fb7fe3e5e6522a431da5a Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Mon, 23 May 2022 17:32:53 -0700 Subject: [PATCH 08/27] Complete more methods of extentionarrow --- pandas/_testing/__init__.py | 40 ++++++ pandas/core/arrays/arrow/array.py | 122 ++++++++++++++++++ pandas/core/arrays/arrow/dtype.py | 5 +- pandas/core/arrays/string_arrow.py | 93 +------------ .../extension/arrow/test_constructors.py | 61 +++++++++ 5 files changed, 228 insertions(+), 93 deletions(-) create mode 100644 pandas/tests/extension/arrow/test_constructors.py diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index 53e003e2ed7dd..760aaf274d5e6 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -26,6 +26,7 @@ ) from pandas._typing import Dtype +from pandas.compat import pa_version_under1p01 from pandas.core.dtypes.common import ( is_float_dtype, @@ -193,6 +194,45 @@ ] ] +if not pa_version_under1p01: + import pyarrow as pa + + UNSIGNED_INT_PYARROW_DTYPES = [pa.uint8(), pa.uint16(), pa.uint32(), pa.uint64()] + SIGNED_INT_NUMPY_DTYPES = [pa.uint8(), pa.int16(), pa.int32(), pa.uint64()] + ALL_INT_PYARROW_DTYPES = UNSIGNED_INT_PYARROW_DTYPES + SIGNED_INT_NUMPY_DTYPES + + FLOAT_PYARROW_DTYPES = [pa.float16(), pa.float32(), pa.float64()] + STRING_PYARROW_DTYPES = [pa.string(), pa.utf8()] + + TIME_PYARROW_DTYPES = [ + pa.time32("s"), + pa.time32("ms"), + pa.time64("us"), + pa.time64("ns"), + ] + DATE_PYARROW_DTYPES = [pa.date32(), pa.date64()] + DATETIME_PYARROW_DTYPES = [ + pa.timestamp(unit=unit, tz=tz) + for unit in ["s", "ms", "us", "ns"] + for tz in [None, "UTC", "US/Pacific", "US/Eastern"] + ] + TIMEDELTA_PYARROW_DTYPES = [pa.duration(unit) for unit in ["s", "ms", "us", "ns"]] + + BOOL_PYARROW_DTYPES = [pa.bool_()] + + # TODO: Add container like pyarrow types: + # 
https://arrow.apache.org/docs/python/api/datatypes.html#factory-functions + ALL_PYARROW_DTYPES = ( + ALL_INT_PYARROW_DTYPES + + FLOAT_PYARROW_DTYPES + + TIME_PYARROW_DTYPES + + DATE_PYARROW_DTYPES + + DATETIME_PYARROW_DTYPES + + TIMEDELTA_PYARROW_DTYPES + + BOOL_PYARROW_DTYPES + ) + + EMPTY_STRING_PATTERN = re.compile("^$") # set testing_mode diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index e0cfe1aa4c913..7fe86aa70aa15 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -9,6 +9,8 @@ import numpy as np from pandas._typing import ( + Dtype, + PositionalIndexer, TakeIndexer, npt, ) @@ -24,6 +26,7 @@ is_array_like, is_bool_dtype, is_integer, + is_integer_dtype, is_scalar, ) from pandas.core.dtypes.missing import isna @@ -31,6 +34,7 @@ from pandas.core.arrays.base import ExtensionArray from pandas.core.indexers import ( check_array_indexer, + unpack_tuple_and_ellipses, validate_indices, ) @@ -68,6 +72,117 @@ def __init__(self, values: pa.Array | pa.ChunkedArray) -> None: ) self._dtype = ArrowDtype(self._data.type) + @classmethod + def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False): + """ + Construct a new ExtensionArray from a sequence of scalars. + """ + if isinstance(dtype, ArrowDtype): + pa_type = dtype.pa_dtype + elif not dtype: + pa_type = None + else: + try: + pa_type = pa.from_numpy_dtype(dtype) + except TypeError: + pa_type = None + return cls(pa.chunked_array([scalars], type=pa_type)) + + @classmethod + def _from_sequence_of_strings( + cls, strings, *, dtype: Dtype | None = None, copy=False + ): + """ + Construct a new ExtensionArray from a sequence of strings. + """ + return cls._from_sequence(strings, dtype, copy) + + @classmethod + def _from_factorized(cls, values, original): + """ + Reconstruct an ExtensionArray after factorization. + + Parameters + ---------- + values : ndarray + An integer ndarray with the factorized values. 
+ original : ExtensionArray + The original ExtensionArray that factorize was called on. + + See Also + -------- + factorize : Top-level factorize method that dispatches here. + ExtensionArray.factorize : Encode the extension array as an enumerated type. + """ + return original.take(values) + + def __getitem__(self, item: PositionalIndexer): + """Select a subset of self. + + Parameters + ---------- + item : int, slice, or ndarray + * int: The position in 'self' to get. + * slice: A slice object, where 'start', 'stop', and 'step' are + integers or None + * ndarray: A 1-d boolean NumPy ndarray the same length as 'self' + + Returns + ------- + item : scalar or ExtensionArray + + Notes + ----- + For scalar ``item``, return a scalar value suitable for the array's + type. This should be an instance of ``self.dtype.type``. + For slice ``key``, return an instance of ``ExtensionArray``, even + if the slice is length 0 or 1. + For a boolean mask, return an instance of ``ExtensionArray``, filtered + to the values where ``item`` is True. + """ + item = check_array_indexer(self, item) + + if isinstance(item, np.ndarray): + if not len(item): + return type(self)(pa.chunked_array([], type=self._dtype.pa_dtype)) + elif is_integer_dtype(item.dtype): + return self.take(item) + elif is_bool_dtype(item.dtype): + return type(self)(self._data.filter(item)) + else: + raise IndexError( + "Only integers, slices and integer or " + "boolean arrays are valid indices." + ) + elif isinstance(item, tuple): + item = unpack_tuple_and_ellipses(item) + + # error: Non-overlapping identity check (left operand type: + # "Union[Union[int, integer[Any]], Union[slice, List[int], + # ndarray[Any, Any]]]", right operand type: "ellipsis") + if item is Ellipsis: # type: ignore[comparison-overlap] + # TODO: should be handled by pyarrow? + item = slice(None) + + if is_scalar(item) and not is_integer(item): + # e.g. 
"foo" or 2.5 + # exception message copied from numpy + raise IndexError( + r"only integers, slices (`:`), ellipsis (`...`), numpy.newaxis " + r"(`None`) and integer or boolean arrays are valid indices" + ) + # We are not an array indexer, so maybe e.g. a slice or integer + # indexer. We dispatch to pyarrow. + value = self._data[item] + if isinstance(value, pa.ChunkedArray): + return type(self)(value) + else: + scalar = value.as_py() + if scalar is None: + return self._dtype.na_value + else: + return scalar + def __arrow_array__(self, type=None): """Convert myself to a pyarrow ChunkedArray.""" return self._data @@ -79,6 +194,13 @@ def equals(self, other) -> bool: # TODO: is this documented somewhere? return self._data == other._data + @property + def dtype(self) -> ArrowDtype: + """ + An instance of 'ExtensionDtype'. + """ + return self._dtype + @property def nbytes(self) -> int: """ diff --git a/pandas/core/arrays/arrow/dtype.py b/pandas/core/arrays/arrow/dtype.py index 8ba8db5ba56f4..987722d967bcf 100644 --- a/pandas/core/arrays/arrow/dtype.py +++ b/pandas/core/arrays/arrow/dtype.py @@ -20,7 +20,10 @@ class ArrowDtype(StorageExtensionDtype): def __init__(self, pa_dtype: pa.DataType) -> None: super().__init__("pyarrow") if not isinstance(pa_dtype, pa.DataType): - raise ValueError("pa_dtype must be an instance of a pyarrow.DataType") + raise ValueError( + f"pa_dtype ({pa_dtype}) must be an instance " + f"of a pyarrow.DataType. Got {type(pa_dtype)} instead." 
+ ) self.pa_dtype = pa_dtype @property diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 537b791ffc02c..3ee959f5cd931 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -2,10 +2,7 @@ from collections.abc import Callable # noqa: PDF001 import re -from typing import ( - Union, - overload, -) +from typing import Union import numpy as np @@ -16,10 +13,7 @@ from pandas._typing import ( Dtype, NpDtype, - PositionalIndexer, Scalar, - ScalarIndexer, - SequenceIndexer, npt, ) from pandas.compat import ( @@ -32,7 +26,6 @@ from pandas.core.dtypes.common import ( is_bool_dtype, is_dtype_equal, - is_integer, is_integer_dtype, is_object_dtype, is_scalar, @@ -50,10 +43,6 @@ BaseStringArray, StringDtype, ) -from pandas.core.indexers import ( - check_array_indexer, - unpack_tuple_and_ellipses, -) from pandas.core.strings.object_array import ObjectStringArrayMixin if not pa_version_under1p01: @@ -201,86 +190,6 @@ def to_numpy( result[mask] = na_value return result - @overload - def __getitem__(self, item: ScalarIndexer) -> ArrowStringScalarOrNAT: - ... - - @overload - def __getitem__(self: ArrowStringArray, item: SequenceIndexer) -> ArrowStringArray: - ... - - def __getitem__( - self: ArrowStringArray, item: PositionalIndexer - ) -> ArrowStringArray | ArrowStringScalarOrNAT: - """Select a subset of self. - - Parameters - ---------- - item : int, slice, or ndarray - * int: The position in 'self' to get. - * slice: A slice object, where 'start', 'stop', and 'step' are - integers or None - * ndarray: A 1-d boolean NumPy ndarray the same length as 'self' - - Returns - ------- - item : scalar or ExtensionArray - - Notes - ----- - For scalar ``item``, return a scalar value suitable for the array's - type. This should be an instance of ``self.dtype.type``. - For slice ``key``, return an instance of ``ExtensionArray``, even - if the slice is length 0 or 1. 
- For a boolean mask, return an instance of ``ExtensionArray``, filtered - to the values where ``item`` is True. - """ - item = check_array_indexer(self, item) - - if isinstance(item, np.ndarray): - if not len(item): - return type(self)(pa.chunked_array([], type=pa.string())) - elif is_integer_dtype(item.dtype): - return self.take(item) - elif is_bool_dtype(item.dtype): - return type(self)(self._data.filter(item)) - else: - raise IndexError( - "Only integers, slices and integer or " - "boolean arrays are valid indices." - ) - elif isinstance(item, tuple): - item = unpack_tuple_and_ellipses(item) - - # error: Non-overlapping identity check (left operand type: - # "Union[Union[int, integer[Any]], Union[slice, List[int], - # ndarray[Any, Any]]]", right operand type: "ellipsis") - if item is Ellipsis: # type: ignore[comparison-overlap] - # TODO: should be handled by pyarrow? - item = slice(None) - - if is_scalar(item) and not is_integer(item): - # e.g. "foo" or 2.5 - # exception message copied from numpy - raise IndexError( - r"only integers, slices (`:`), ellipsis (`...`), numpy.newaxis " - r"(`None`) and integer or boolean arrays are valid indices" - ) - # We are not an array indexer, so maybe e.g. a slice or integer - # indexer. We dispatch to pyarrow. 
- value = self._data[item] - if isinstance(value, pa.ChunkedArray): - return type(self)(value) - else: - return self._as_pandas_scalar(value) - - def _as_pandas_scalar(self, arrow_scalar: pa.Scalar): - scalar = arrow_scalar.as_py() - if scalar is None: - return self._dtype.na_value - else: - return scalar - def _cmp_method(self, other, op): from pandas.arrays import BooleanArray diff --git a/pandas/tests/extension/arrow/test_constructors.py b/pandas/tests/extension/arrow/test_constructors.py new file mode 100644 index 0000000000000..54d163e7200c5 --- /dev/null +++ b/pandas/tests/extension/arrow/test_constructors.py @@ -0,0 +1,61 @@ +from datetime import ( + date, + datetime, + time, + timedelta, +) + +import pytest + +from pandas.compat import pa_version_under1p01 + +import pandas as pd +import pandas._testing as tm +from pandas.tests.extension import base + +pytestmark = pytest.mark.skipif( + pa_version_under1p01, reason="Minimum required pyarrow version not available" +) + +import pyarrow as pa # isort:skip + +from pandas.core.arrays.arrow.dtype import ArrowDtype # isort:skip + + +@pytest.fixture(params=tm.ALL_PYARROW_DTYPES) +def dtype(request): + return ArrowDtype(pa_dtype=request.param) + + +@pytest.fixture +def data(dtype): + pa_dtype = dtype.pa_dtype + if pa.types.is_boolean(pa_dtype): + data = [True, None, False, None] + elif pa.types.is_floating(pa_dtype): + data = [1.0, None, 0.0, None, -2.0, None, 0.5, None, 99.9, None] + elif pa.types.is_signed_integer(pa_dtype): + data = [1, None, 0, None, -2, None, 10] + elif pa.types.is_unsigned_integer(pa_dtype): + data = [1, None, 0, None, 2, None, 10] + elif pa.types.is_date(pa_dtype): + data = [date(2022, 1, 1), None, date.min, None, date.max] + elif pa.types.is_timestamp(pa_dtype): + data = [ + datetime(2020, 1, 1, 1, 1, 1, 1), + None, + datetime.min, + None, + datetime.max, + ] + elif pa.types.is_duration(pa_dtype): + data = [timedelta(1, 1, 1), None, timedelta.min, None, timedelta.max] + elif 
pa.types.is_time(pa_dtype): + data = [time(12, 0), None, time(0, 12)] + else: + data = [] + return pd.array(data, dtype=dtype) + + +class TestConstructors(base.BaseConstructorsTests): + pass From 901e9b09ee21e99415698a184cd6fce1e9cf2246 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Mon, 23 May 2022 23:00:45 -0700 Subject: [PATCH 09/27] Add types and first test --- pandas/_testing/__init__.py | 2 +- pandas/core/arrays/arrow/array.py | 20 +++++++++++-------- pandas/core/arrays/arrow/dtype.py | 4 ++-- .../extension/arrow/test_constructors.py | 15 +++++++------- 4 files changed, 22 insertions(+), 19 deletions(-) diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index 760aaf274d5e6..2d562af141c14 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -201,7 +201,7 @@ SIGNED_INT_NUMPY_DTYPES = [pa.uint8(), pa.int16(), pa.int32(), pa.uint64()] ALL_INT_PYARROW_DTYPES = UNSIGNED_INT_PYARROW_DTYPES + SIGNED_INT_NUMPY_DTYPES - FLOAT_PYARROW_DTYPES = [pa.float16(), pa.float32(), pa.float64()] + FLOAT_PYARROW_DTYPES = [pa.float32(), pa.float64()] STRING_PYARROW_DTYPES = [pa.string(), pa.utf8()] TIME_PYARROW_DTYPES = [ diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 7fe86aa70aa15..8ced3633f8a7e 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -78,15 +78,19 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False): Construct a new ExtensionArray from a sequence of scalars. 
""" if isinstance(dtype, ArrowDtype): - pa_type = dtype.pa_dtype - elif not dtype: - pa_type = None + pa_dtype = dtype.pa_dtype + elif dtype: + pa_dtype = pa.from_numpy_dtype(dtype) else: - try: - pa_type = pa.from_numpy_dtype(dtype) - except TypeError: - pa_type = None - return cls(pa.chunked_array([scalars], type=pa_type)) + pa_dtype = None + + if isinstance(scalars, cls): + data = scalars._data + if pa_dtype: + data = data.cast(pa_dtype) + return cls(data) + else: + return cls(pa.chunked_array([scalars], type=pa_dtype)) @classmethod def _from_sequence_of_strings( diff --git a/pandas/core/arrays/arrow/dtype.py b/pandas/core/arrays/arrow/dtype.py index 987722d967bcf..f4251f64d772d 100644 --- a/pandas/core/arrays/arrow/dtype.py +++ b/pandas/core/arrays/arrow/dtype.py @@ -31,7 +31,7 @@ def type(self): """ The scalar type for the array, e.g. ``int`` """ - return self.pa_dtype + return type(self.pa_dtype) @property def name(self) -> str: # type: ignore[override] @@ -43,7 +43,7 @@ def name(self) -> str: # type: ignore[override] @cache_readonly def numpy_dtype(self) -> np.dtype: """Return an instance of the related numpy dtype""" - return self.type.to_pandas_dtype() + return self.pa_dtype.to_pandas_dtype() @cache_readonly def kind(self) -> str: diff --git a/pandas/tests/extension/arrow/test_constructors.py b/pandas/tests/extension/arrow/test_constructors.py index 54d163e7200c5..c5b5d6869da5f 100644 --- a/pandas/tests/extension/arrow/test_constructors.py +++ b/pandas/tests/extension/arrow/test_constructors.py @@ -13,9 +13,11 @@ import pandas._testing as tm from pandas.tests.extension import base -pytestmark = pytest.mark.skipif( - pa_version_under1p01, reason="Minimum required pyarrow version not available" -) +pytestmark = [ + pytest.mark.skipif( + pa_version_under1p01, reason="Minimum required pyarrow version not available" + ) +] import pyarrow as pa # isort:skip @@ -39,17 +41,14 @@ def data(dtype): elif pa.types.is_unsigned_integer(pa_dtype): data = [1, None, 0, 
None, 2, None, 10] elif pa.types.is_date(pa_dtype): - data = [date(2022, 1, 1), None, date.min, None, date.max] + data = [date(2022, 1, 1), None] elif pa.types.is_timestamp(pa_dtype): data = [ datetime(2020, 1, 1, 1, 1, 1, 1), None, - datetime.min, - None, - datetime.max, ] elif pa.types.is_duration(pa_dtype): - data = [timedelta(1, 1, 1), None, timedelta.min, None, timedelta.max] + data = [timedelta(1), None] elif pa.types.is_time(pa_dtype): data = [time(12, 0), None, time(0, 12)] else: From b3f6d93087ab9c235a9c3981d2cc7d268ee91e9c Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Tue, 24 May 2022 09:01:43 -0700 Subject: [PATCH 10/27] Fix getitem type thing --- pandas/core/arrays/arrow/array.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 8ced3633f8a7e..d4c165579e04d 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -148,7 +148,12 @@ def __getitem__(self, item: PositionalIndexer): if isinstance(item, np.ndarray): if not len(item): - return type(self)(pa.chunked_array([], type=self._dtype.pa_dtype)) + # Removable once we migrate StringDtype[pyarrow] to ArrowDtype[string] + if self._dtype.name == "string" and self._dtype.storage == "pyarrow": + pa_dtype = pa.string() + else: + pa_dtype = self._dtype.pa_dtype + return type(self)(pa.chunked_array([], type=pa_dtype)) elif is_integer_dtype(item.dtype): return self.take(item) elif is_bool_dtype(item.dtype): From 5c873d5be8dba68565752b5b949777abfcb651f4 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Tue, 24 May 2022 10:01:03 -0700 Subject: [PATCH 11/27] Try import or skip: --- pandas/tests/extension/arrow/test_constructors.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/pandas/tests/extension/arrow/test_constructors.py b/pandas/tests/extension/arrow/test_constructors.py index c5b5d6869da5f..41511f94993c9 100644 --- 
a/pandas/tests/extension/arrow/test_constructors.py +++ b/pandas/tests/extension/arrow/test_constructors.py @@ -7,19 +7,11 @@ import pytest -from pandas.compat import pa_version_under1p01 - import pandas as pd import pandas._testing as tm from pandas.tests.extension import base -pytestmark = [ - pytest.mark.skipif( - pa_version_under1p01, reason="Minimum required pyarrow version not available" - ) -] - -import pyarrow as pa # isort:skip +pa = pytest.importorskip("pyarrow", minversion="1.0.1") from pandas.core.arrays.arrow.dtype import ArrowDtype # isort:skip From 68bb030a077aa3a3c0da5a1cbe19da6dc6d2af64 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Tue, 24 May 2022 12:37:32 -0700 Subject: [PATCH 12/27] Fix typo --- pandas/_testing/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index 2d562af141c14..d64229da4a7e5 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -198,7 +198,7 @@ import pyarrow as pa UNSIGNED_INT_PYARROW_DTYPES = [pa.uint8(), pa.uint16(), pa.uint32(), pa.uint64()] - SIGNED_INT_NUMPY_DTYPES = [pa.uint8(), pa.int16(), pa.int32(), pa.uint64()] + SIGNED_INT_PYARROW_DTYPES = [pa.uint8(), pa.int16(), pa.int32(), pa.uint64()] ALL_INT_PYARROW_DTYPES = UNSIGNED_INT_PYARROW_DTYPES + SIGNED_INT_NUMPY_DTYPES FLOAT_PYARROW_DTYPES = [pa.float32(), pa.float64()] From 9fd91618f3e3fa9c47d347fcf5d49b03e4b10e10 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Tue, 24 May 2022 16:51:04 -0700 Subject: [PATCH 13/27] Fix data size, coercion of pa.NA in lists --- pandas/_testing/__init__.py | 2 +- pandas/core/arrays/arrow/array.py | 4 +++- pandas/tests/extension/arrow/test_constructors.py | 5 +++-- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index d64229da4a7e5..fbf1cea670c5c 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -199,7 +199,7 @@ 
UNSIGNED_INT_PYARROW_DTYPES = [pa.uint8(), pa.uint16(), pa.uint32(), pa.uint64()] SIGNED_INT_PYARROW_DTYPES = [pa.uint8(), pa.int16(), pa.int32(), pa.uint64()] - ALL_INT_PYARROW_DTYPES = UNSIGNED_INT_PYARROW_DTYPES + SIGNED_INT_NUMPY_DTYPES + ALL_INT_PYARROW_DTYPES = UNSIGNED_INT_PYARROW_DTYPES + SIGNED_INT_PYARROW_DTYPES FLOAT_PYARROW_DTYPES = [pa.float32(), pa.float64()] STRING_PYARROW_DTYPES = [pa.string(), pa.utf8()] diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index d4c165579e04d..3a51186240d37 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -90,7 +90,9 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False): data = data.cast(pa_dtype) return cls(data) else: - return cls(pa.chunked_array([scalars], type=pa_dtype)) + # https://issues.apache.org/jira/browse/ARROW-16645 + mask = [scalar is pa.NA or isna(scalar) for scalar in scalars] + return cls(pa.chunked_array(pa.array(scalars, mask=mask, type=pa_dtype))) @classmethod def _from_sequence_of_strings( diff --git a/pandas/tests/extension/arrow/test_constructors.py b/pandas/tests/extension/arrow/test_constructors.py index 41511f94993c9..8ab5a11c8f25a 100644 --- a/pandas/tests/extension/arrow/test_constructors.py +++ b/pandas/tests/extension/arrow/test_constructors.py @@ -33,14 +33,15 @@ def data(dtype): elif pa.types.is_unsigned_integer(pa_dtype): data = [1, None, 0, None, 2, None, 10] elif pa.types.is_date(pa_dtype): - data = [date(2022, 1, 1), None] + data = [date(2022, 1, 1), None, date(1999, 12, 31)] elif pa.types.is_timestamp(pa_dtype): data = [ datetime(2020, 1, 1, 1, 1, 1, 1), None, + datetime(1999, 1, 1, 1, 1, 1, 1), ] elif pa.types.is_duration(pa_dtype): - data = [timedelta(1), None] + data = [timedelta(1), None, timedelta(1, 1)] elif pa.types.is_time(pa_dtype): data = [time(12, 0), None, time(0, 12)] else: From 939e751e694d69ea3ba673f91eab96258cb56f25 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: 
Tue, 24 May 2022 17:30:28 -0700 Subject: [PATCH 14/27] change pa_dtype to pyarrow dtype --- pandas/core/arrays/arrow/dtype.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/pandas/core/arrays/arrow/dtype.py b/pandas/core/arrays/arrow/dtype.py index f4251f64d772d..71e5ef65eba54 100644 --- a/pandas/core/arrays/arrow/dtype.py +++ b/pandas/core/arrays/arrow/dtype.py @@ -17,33 +17,33 @@ class ArrowDtype(StorageExtensionDtype): na_value = pa.NA - def __init__(self, pa_dtype: pa.DataType) -> None: + def __init__(self, pyarrow_dtype: pa.DataType) -> None: super().__init__("pyarrow") - if not isinstance(pa_dtype, pa.DataType): + if not isinstance(pyarrow_dtype, pa.DataType): raise ValueError( - f"pa_dtype ({pa_dtype}) must be an instance " - f"of a pyarrow.DataType. Got {type(pa_dtype)} instead." + f"pyarrow_dtype ({pyarrow_dtype}) must be an instance " + f"of a pyarrow.DataType. Got {type(pyarrow_dtype)} instead." ) - self.pa_dtype = pa_dtype + self.pyarrow_dtype = pyarrow_dtype @property def type(self): """ The scalar type for the array, e.g. ``int`` """ - return type(self.pa_dtype) + return type(self.pyarrow_dtype) @property def name(self) -> str: # type: ignore[override] """ A string identifying the data type. """ - return str(self.pa_dtype) + return str(self.pyarrow_dtype) @cache_readonly def numpy_dtype(self) -> np.dtype: """Return an instance of the related numpy dtype""" - return self.pa_dtype.to_pandas_dtype() + return self.pyarrow_dtype.to_pandas_dtype() @cache_readonly def kind(self) -> str: @@ -97,9 +97,9 @@ def _is_numeric(self) -> bool: """ # TODO: pa.types.is_boolean? 
return ( - pa.types.is_integer(self.pa_dtype) - or pa.types.is_floating(self.pa_dtype) - or pa.types.is_decimal(self.pa_dtype) + pa.types.is_integer(self.pyarrow_dtype) + or pa.types.is_floating(self.pyarrow_dtype) + or pa.types.is_decimal(self.pyarrow_dtype) ) @property @@ -107,7 +107,7 @@ def _is_boolean(self) -> bool: """ Whether this dtype should be considered boolean. """ - return pa.types.is_boolean(self.pa_dtype) + return pa.types.is_boolean(self.pyarrow_dtype) def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None: # We unwrap any masked dtypes, find the common dtype we would use From 1a5d3ff72b6befda79d7931cc16d7ccf4994378c Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Tue, 24 May 2022 22:48:09 -0700 Subject: [PATCH 15/27] Address more tests --- pandas/core/arrays/arrow/array.py | 4 ++-- pandas/core/arrays/arrow/dtype.py | 10 +++++++--- pandas/core/dtypes/base.py | 2 +- pandas/tests/extension/arrow/test_constructors.py | 4 ++-- 4 files changed, 12 insertions(+), 8 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 3a51186240d37..06e87ded2aa37 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -78,7 +78,7 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False): Construct a new ExtensionArray from a sequence of scalars. 
""" if isinstance(dtype, ArrowDtype): - pa_dtype = dtype.pa_dtype + pa_dtype = dtype.pyarrow_dtype elif dtype: pa_dtype = pa.from_numpy_dtype(dtype) else: @@ -154,7 +154,7 @@ def __getitem__(self, item: PositionalIndexer): if self._dtype.name == "string" and self._dtype.storage == "pyarrow": pa_dtype = pa.string() else: - pa_dtype = self._dtype.pa_dtype + pa_dtype = self._dtype.pyarrow_dtype return type(self)(pa.chunked_array([], type=pa_dtype)) elif is_integer_dtype(item.dtype): return self.take(item) diff --git a/pandas/core/arrays/arrow/dtype.py b/pandas/core/arrays/arrow/dtype.py index 71e5ef65eba54..84be3bf5a2edb 100644 --- a/pandas/core/arrays/arrow/dtype.py +++ b/pandas/core/arrays/arrow/dtype.py @@ -3,6 +3,7 @@ import numpy as np import pyarrow as pa +from pandas._libs import missing as libmissing from pandas._typing import DtypeObj from pandas.util._decorators import cache_readonly @@ -15,7 +16,7 @@ class ArrowDtype(StorageExtensionDtype): Modeled after BaseMaskedDtype """ - na_value = pa.NA + na_value = libmissing.NA def __init__(self, pyarrow_dtype: pa.DataType) -> None: super().__init__("pyarrow") @@ -29,7 +30,7 @@ def __init__(self, pyarrow_dtype: pa.DataType) -> None: @property def type(self): """ - The scalar type for the array, e.g. ``int`` + Returns pyarrow.DataType. 
""" return type(self.pyarrow_dtype) @@ -43,7 +44,10 @@ def name(self) -> str: # type: ignore[override] @cache_readonly def numpy_dtype(self) -> np.dtype: """Return an instance of the related numpy dtype""" - return self.pyarrow_dtype.to_pandas_dtype() + try: + return np.dtype(self.pyarrow_dtype.to_pandas_dtype()) + except (NotImplementedError, TypeError): + return np.dtype(object) @cache_readonly def kind(self) -> str: diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index 9762b779477e4..c114e8a9d316b 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -405,7 +405,7 @@ def __repr__(self): return f"{self.name}[{self.storage}]" def __str__(self): - return self.name + return f"{self.name}[{self.storage}]" def __eq__(self, other: Any) -> bool: if isinstance(other, self.type) and other == self.name: diff --git a/pandas/tests/extension/arrow/test_constructors.py b/pandas/tests/extension/arrow/test_constructors.py index 8ab5a11c8f25a..8084f1a00e20a 100644 --- a/pandas/tests/extension/arrow/test_constructors.py +++ b/pandas/tests/extension/arrow/test_constructors.py @@ -18,12 +18,12 @@ @pytest.fixture(params=tm.ALL_PYARROW_DTYPES) def dtype(request): - return ArrowDtype(pa_dtype=request.param) + return ArrowDtype(pyarrow_dtype=request.param) @pytest.fixture def data(dtype): - pa_dtype = dtype.pa_dtype + pa_dtype = dtype.pyarrow_dtype if pa.types.is_boolean(pa_dtype): data = [True, None, False, None] elif pa.types.is_floating(pa_dtype): From f2dda8c6698b26259b392d631796276f5c1fb59e Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Wed, 25 May 2022 11:12:15 -0700 Subject: [PATCH 16/27] Add register_extension_dtype --- pandas/core/arrays/arrow/dtype.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/arrow/dtype.py b/pandas/core/arrays/arrow/dtype.py index 84be3bf5a2edb..3c3b12b6f0142 100644 --- a/pandas/core/arrays/arrow/dtype.py +++ b/pandas/core/arrays/arrow/dtype.py @@ -7,9 +7,13 @@ 
from pandas._typing import DtypeObj from pandas.util._decorators import cache_readonly -from pandas.core.dtypes.base import StorageExtensionDtype +from pandas.core.dtypes.base import ( + StorageExtensionDtype, + register_extension_dtype, +) +@register_extension_dtype class ArrowDtype(StorageExtensionDtype): """ Base class for dtypes for ArrowExtensionArray. From 26b2f1c441fa0e41ce1df87ee3694ff02b39226f Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Wed, 25 May 2022 11:22:59 -0700 Subject: [PATCH 17/27] Address Joris' comments --- pandas/core/arrays/arrow/array.py | 6 +++--- pandas/core/arrays/string_.py | 2 -- pandas/core/dtypes/base.py | 2 +- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 06e87ded2aa37..78cdc1e3200fd 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -90,9 +90,9 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False): data = data.cast(pa_dtype) return cls(data) else: - # https://issues.apache.org/jira/browse/ARROW-16645 - mask = [scalar is pa.NA or isna(scalar) for scalar in scalars] - return cls(pa.chunked_array(pa.array(scalars, mask=mask, type=pa_dtype))) + return cls( + pa.chunked_array(pa.array(scalars, type=pa_dtype, from_pandas=True)) + ) @classmethod def _from_sequence_of_strings( diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 21b5dc625956e..01102043fcf25 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -64,8 +64,6 @@ class StringDtype(StorageExtensionDtype): StringDtype is considered experimental. The implementation and parts of the API may change without warning. - In particular, StringDtype.na_value may change to no longer be - ``pd.NA``. 
Parameters ---------- diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index c114e8a9d316b..7bdad2f28c587 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -408,7 +408,7 @@ def __str__(self): return f"{self.name}[{self.storage}]" def __eq__(self, other: Any) -> bool: - if isinstance(other, self.type) and other == self.name: + if isinstance(other, str) and other == self.name: return True return super().__eq__(other) From 95bd38fe1e18cc1f672686963a9e6c9f65a78be4 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Wed, 25 May 2022 12:10:56 -0700 Subject: [PATCH 18/27] Revert to self.name, xfail the dtype test due to conflict --- pandas/core/dtypes/base.py | 2 +- pandas/tests/extension/arrow/test_constructors.py | 10 +++++++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index 7bdad2f28c587..cffac15ef6496 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -405,7 +405,7 @@ def __repr__(self): return f"{self.name}[{self.storage}]" def __str__(self): - return f"{self.name}[{self.storage}]" + return self.name def __eq__(self, other: Any) -> bool: if isinstance(other, str) and other == self.name: diff --git a/pandas/tests/extension/arrow/test_constructors.py b/pandas/tests/extension/arrow/test_constructors.py index 8084f1a00e20a..9f83dbb0890a1 100644 --- a/pandas/tests/extension/arrow/test_constructors.py +++ b/pandas/tests/extension/arrow/test_constructors.py @@ -50,4 +50,12 @@ def data(dtype): class TestConstructors(base.BaseConstructorsTests): - pass + @pytest.mark.xfail( + reason=( + "str(dtype) constructs " + "e.g. 
in64[pyarrow] like int64 (numpy) " + "due to StorageExtensionDtype.__str__" + ) + ) + def test_from_dtype(self, data): + super().test_from_dtype(data) From a455b50edb30ab7bf2e41259ef238ec1525dd24d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Wed, 25 May 2022 13:51:28 -0700 Subject: [PATCH 19/27] Add getitem tests --- pandas/core/arrays/arrow/array.py | 3 +- pandas/tests/extension/arrow/test_arrow.py | 119 ++++++++++++++++++ .../extension/arrow/test_constructors.py | 61 --------- 3 files changed, 121 insertions(+), 62 deletions(-) create mode 100644 pandas/tests/extension/arrow/test_arrow.py delete mode 100644 pandas/tests/extension/arrow/test_constructors.py diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 78cdc1e3200fd..25e7f0f71a73b 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -522,7 +522,8 @@ def _indexing_key_to_indices( def _maybe_convert_setitem_value(self, value): """Maybe convert value to be pyarrow compatible.""" - raise NotImplementedError() + # TODO: Make more robust like ArrowStringArray._maybe_convert_setitem_value + return value def _set_via_chunk_iteration( self, indices: npt.NDArray[np.intp], value: npt.NDArray[Any] diff --git a/pandas/tests/extension/arrow/test_arrow.py b/pandas/tests/extension/arrow/test_arrow.py new file mode 100644 index 0000000000000..92bf3f6b9f445 --- /dev/null +++ b/pandas/tests/extension/arrow/test_arrow.py @@ -0,0 +1,119 @@ +from datetime import ( + date, + datetime, + time, + timedelta, +) + +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm +from pandas.tests.extension import base + +pa = pytest.importorskip("pyarrow", minversion="1.0.1") + +from pandas.core.arrays.arrow.dtype import ArrowDtype # isort:skip + + +@pytest.fixture(params=tm.ALL_PYARROW_DTYPES) +def dtype(request): + return ArrowDtype(pyarrow_dtype=request.param) + + +@pytest.fixture +def data(dtype): + pa_dtype = 
dtype.pyarrow_dtype + if pa.types.is_boolean(pa_dtype): + data = [True, None, False, None, False, None] + elif pa.types.is_floating(pa_dtype): + data = [1.0, None, 0.0, None, -2.0, None, 0.5, None, 99.9, None] + elif pa.types.is_signed_integer(pa_dtype): + data = [1, None, 0, None, -2, None, 10] + elif pa.types.is_unsigned_integer(pa_dtype): + data = [1, None, 0, None, 2, None, 10] + elif pa.types.is_date(pa_dtype): + data = [ + date(2022, 1, 1), + None, + date(1999, 12, 31), + None, + date(2000, 1, 1), + None, + ] + elif pa.types.is_timestamp(pa_dtype): + data = [ + datetime(2020, 1, 1, 1, 1, 1, 1), + None, + datetime(1999, 1, 1, 1, 1, 1, 1), + None, + datetime(2000, 1, 1, 1, 1, 1, 1), + None, + ] + elif pa.types.is_duration(pa_dtype): + data = [timedelta(1), None, timedelta(1, 1), None, timedelta(-1), None] + elif pa.types.is_time(pa_dtype): + data = [time(12, 0), None, time(0, 12), None, time(0, 0), None] + else: + data = [] + return pd.array(data, dtype=dtype) + + +@pytest.fixture +def data_not_missing(data): + data = data.take( + indices=np.full(len(data), -1), allow_fill=True, fill_value=data[0] + ) + return data + + +@pytest.fixture +def data_missing(data): + """Length-2 array with [NA, Valid]""" + return type(data)._from_sequence([data[1], data[0]]) + + +@pytest.fixture +def na_value(): + """The scalar missing value for this type. Default 'None'""" + return pd.NA + + +class TestConstructors(base.BaseConstructorsTests): + @pytest.mark.xfail( + reason=( + "str(dtype) constructs " + "e.g. 
in64[pyarrow] like int64 (numpy) " + "due to StorageExtensionDtype.__str__" + ) + ) + def test_from_dtype(self, data): + super().test_from_dtype(data) + + +class TestGetitemTests(base.BaseGetitemTests): + @pytest.mark.xfail( + reason=( + "data.dtype.type return pyarrow.DataType " + "but this (intentionally) returns " + "Python scalars or pd.Na" + ) + ) + def test_getitem_scalar(self, data): + super().test_getitem_scalar(data) + + def test_get(self, data_not_missing): + super().test_get(data_not_missing) + + def test_take_sequence(self, data_not_missing): + super().test_take_sequence(data_not_missing) + + def test_take(self, data_not_missing, na_value, na_cmp): + super().test_take(data_not_missing, na_value, na_cmp) + + def test_take_non_na_fill_value(self, data_missing): + super().test_take_non_na_fill_value(data_missing) + + def test_reindex_non_na_fill_value(self, data_missing): + super().test_reindex_non_na_fill_value(data_missing) diff --git a/pandas/tests/extension/arrow/test_constructors.py b/pandas/tests/extension/arrow/test_constructors.py deleted file mode 100644 index 9f83dbb0890a1..0000000000000 --- a/pandas/tests/extension/arrow/test_constructors.py +++ /dev/null @@ -1,61 +0,0 @@ -from datetime import ( - date, - datetime, - time, - timedelta, -) - -import pytest - -import pandas as pd -import pandas._testing as tm -from pandas.tests.extension import base - -pa = pytest.importorskip("pyarrow", minversion="1.0.1") - -from pandas.core.arrays.arrow.dtype import ArrowDtype # isort:skip - - -@pytest.fixture(params=tm.ALL_PYARROW_DTYPES) -def dtype(request): - return ArrowDtype(pyarrow_dtype=request.param) - - -@pytest.fixture -def data(dtype): - pa_dtype = dtype.pyarrow_dtype - if pa.types.is_boolean(pa_dtype): - data = [True, None, False, None] - elif pa.types.is_floating(pa_dtype): - data = [1.0, None, 0.0, None, -2.0, None, 0.5, None, 99.9, None] - elif pa.types.is_signed_integer(pa_dtype): - data = [1, None, 0, None, -2, None, 10] - elif 
pa.types.is_unsigned_integer(pa_dtype): - data = [1, None, 0, None, 2, None, 10] - elif pa.types.is_date(pa_dtype): - data = [date(2022, 1, 1), None, date(1999, 12, 31)] - elif pa.types.is_timestamp(pa_dtype): - data = [ - datetime(2020, 1, 1, 1, 1, 1, 1), - None, - datetime(1999, 1, 1, 1, 1, 1, 1), - ] - elif pa.types.is_duration(pa_dtype): - data = [timedelta(1), None, timedelta(1, 1)] - elif pa.types.is_time(pa_dtype): - data = [time(12, 0), None, time(0, 12)] - else: - data = [] - return pd.array(data, dtype=dtype) - - -class TestConstructors(base.BaseConstructorsTests): - @pytest.mark.xfail( - reason=( - "str(dtype) constructs " - "e.g. in64[pyarrow] like int64 (numpy) " - "due to StorageExtensionDtype.__str__" - ) - ) - def test_from_dtype(self, data): - super().test_from_dtype(data) From b6972a53c8f015a5cc5f0e8842e1f9f74daf3816 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Wed, 25 May 2022 16:34:53 -0700 Subject: [PATCH 20/27] Add conditions when fails for other pyarrow versions --- pandas/tests/extension/arrow/test_arrow.py | 61 ++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/pandas/tests/extension/arrow/test_arrow.py b/pandas/tests/extension/arrow/test_arrow.py index 92bf3f6b9f445..52e4cd2ba723b 100644 --- a/pandas/tests/extension/arrow/test_arrow.py +++ b/pandas/tests/extension/arrow/test_arrow.py @@ -8,6 +8,11 @@ import numpy as np import pytest +from pandas.compat import ( + pa_version_under2p0, + pa_version_under3p0, +) + import pandas as pd import pandas._testing as tm from pandas.tests.extension import base @@ -117,3 +122,59 @@ def test_take_non_na_fill_value(self, data_missing): def test_reindex_non_na_fill_value(self, data_missing): super().test_reindex_non_na_fill_value(data_missing) + + def test_take_series(self, request, data): + tz = getattr(data._dtype.pyarrow_dtype, "tz", None) + unit = getattr(data._dtype.pyarrow_dtype, "unit", None) + bad_units = ["ns"] + if pa_version_under2p0: + bad_units.extend(["s", "ms", 
"us"]) + if pa_version_under3p0 and tz not in (None, "UTC") and unit in bad_units: + request.node.add_marker( + pytest.mark.xfail( + reason=( + f"Not supported by pyarrow < 3.0 " + f"with timestamp type {tz} and {unit}" + ) + ) + ) + super().test_take_series(data) + + def test_reindex(self, request, data, na_value): + tz = getattr(data._dtype.pyarrow_dtype, "tz", None) + unit = getattr(data._dtype.pyarrow_dtype, "unit", None) + bad_units = ["ns"] + if pa_version_under2p0: + bad_units.extend(["s", "ms", "us"]) + if pa_version_under3p0 and tz not in (None, "UTC") and unit in bad_units: + request.node.add_marker( + pytest.mark.xfail( + reason=( + f"Not supported by pyarrow < 3.0 " + f"with timestamp type {tz} and {unit}" + ) + ) + ) + super().test_reindex(data, na_value) + + def test_loc_iloc_frame_single_dtype(self, request, using_array_manager, data): + tz = getattr(data._dtype.pyarrow_dtype, "tz", None) + unit = getattr(data._dtype.pyarrow_dtype, "unit", None) + bad_units = ["ns"] + if pa_version_under2p0: + bad_units.extend(["s", "ms", "us"]) + if ( + pa_version_under3p0 + and not using_array_manager + and tz not in (None, "UTC") + and unit in bad_units + ): + request.node.add_marker( + pytest.mark.xfail( + reason=( + f"Not supported by pyarrow < 3.0 " + f"with timestamp type {tz} and {unit}" + ) + ) + ) + super().test_loc_iloc_frame_single_dtype(data) From a18fd6f2da40108589dcc660b4024538d95d1d0f Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Thu, 26 May 2022 11:16:22 -0700 Subject: [PATCH 21/27] Fix docstring validate --- pandas/core/arrays/string_.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 01102043fcf25..45683d83a1303 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -64,7 +64,6 @@ class StringDtype(StorageExtensionDtype): StringDtype is considered experimental. The implementation and parts of the API may change without warning. 
- Parameters ---------- storage : {"python", "pyarrow"}, optional From 9edb6a47bc9feade097fdbdbcec28595bce23107 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Thu, 26 May 2022 16:13:55 -0700 Subject: [PATCH 22/27] Fix typing errors --- pandas/core/arrays/arrow/array.py | 2 +- pandas/core/arrays/string_arrow.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 25e7f0f71a73b..a44aafb356e95 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -101,7 +101,7 @@ def _from_sequence_of_strings( """ Construct a new ExtensionArray from a sequence of strings. """ - return cls._from_sequence(strings, dtype, copy) + return cls._from_sequence(strings, dtype=dtype, copy=copy) @classmethod def _from_factorized(cls, values, original): diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 3ee959f5cd931..a07f748fa0c8c 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -159,7 +159,7 @@ def _from_sequence_of_strings( return cls._from_sequence(strings, dtype=dtype, copy=copy) @property - def dtype(self) -> StringDtype: + def dtype(self) -> StringDtype: # type: ignore[override] """ An instance of 'string[pyarrow]'. 
""" From c69d70e8b63bf3e7ef8cbcfe660b764cd5b3e2af Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Tue, 31 May 2022 13:33:53 -0700 Subject: [PATCH 23/27] Remove incorrectly implemented _from_factorized --- pandas/core/arrays/arrow/array.py | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index a44aafb356e95..66bb12db277fc 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -103,25 +103,6 @@ def _from_sequence_of_strings( """ return cls._from_sequence(strings, dtype=dtype, copy=copy) - @classmethod - def _from_factorized(cls, values, original): - """ - Reconstruct an ExtensionArray after factorization. - - Parameters - ---------- - values : ndarray - An integer ndarray with the factorized values. - original : ExtensionArray - The original ExtensionArray that factorize was called on. - - See Also - -------- - factorize : Top-level factorize method that dispatches here. - ExtensionArray.factorize : Encode the extension array as an enumerated type. - """ - return original.take(values) - def __getitem__(self, item: PositionalIndexer): """Select a subset of self. 
From 245fbe6a3baf65691b23cae085df6035218d0c67 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Tue, 31 May 2022 13:53:17 -0700 Subject: [PATCH 24/27] Add notimplementederror for construct_from_string with parameters --- pandas/core/arrays/arrow/dtype.py | 10 ++++++++++ pandas/tests/extension/arrow/test_arrow.py | 5 +++++ 2 files changed, 15 insertions(+) diff --git a/pandas/core/arrays/arrow/dtype.py b/pandas/core/arrays/arrow/dtype.py index 3c3b12b6f0142..e611d495e84eb 100644 --- a/pandas/core/arrays/arrow/dtype.py +++ b/pandas/core/arrays/arrow/dtype.py @@ -1,5 +1,7 @@ from __future__ import annotations +import re + import numpy as np import pyarrow as pa @@ -95,6 +97,14 @@ def construct_from_string(cls, string: str): base_type = string.split("[pyarrow]")[0] pa_dtype = getattr(pa, base_type, None) if pa_dtype is None: + has_parameters = re.search(r"\[.*\]", base_type) + if has_parameters: + raise NotImplementedError( + "Passing pyarrow type specific parameters " + f"({has_parameters.group()}) in the string is not supported. " + "Please construct an ArrowDtype object with a pyarrow_dtype " + "instance with specific parameters." 
+ ) raise TypeError(f"'{base_type}' is not a valid pyarrow data type.") return cls(pa_dtype()) diff --git a/pandas/tests/extension/arrow/test_arrow.py b/pandas/tests/extension/arrow/test_arrow.py index 52e4cd2ba723b..f422c7a0fef74 100644 --- a/pandas/tests/extension/arrow/test_arrow.py +++ b/pandas/tests/extension/arrow/test_arrow.py @@ -178,3 +178,8 @@ def test_loc_iloc_frame_single_dtype(self, request, using_array_manager, data): ) ) super().test_loc_iloc_frame_single_dtype(data) + + +def test_arrowdtype_construct_from_string_type_with_parameters(): + with pytest.raises(NotImplementedError, match="Passing pyarrow type"): + ArrowDtype.construct_from_string("timestamp[s][pyarrow]") From 1a44a6d3841e1e379410f71c359a48e55288e065 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Tue, 31 May 2022 18:25:28 -0700 Subject: [PATCH 25/27] Address review --- pandas/core/arrays/arrow/dtype.py | 7 ++++-- .../tests/extension/{arrow => }/test_arrow.py | 25 ++++++++++++++----- 2 files changed, 24 insertions(+), 8 deletions(-) rename pandas/tests/extension/{arrow => }/test_arrow.py (84%) diff --git a/pandas/core/arrays/arrow/dtype.py b/pandas/core/arrays/arrow/dtype.py index e611d495e84eb..eecd183175acc 100644 --- a/pandas/core/arrays/arrow/dtype.py +++ b/pandas/core/arrays/arrow/dtype.py @@ -141,8 +141,11 @@ def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None: ) if not isinstance(new_dtype, np.dtype): return None - pa_dtype = pa.from_numpy_dtype(new_dtype) - return type(self)(pa_dtype) + try: + pa_dtype = pa.from_numpy_dtype(new_dtype) + return type(self)(pa_dtype) + except NotImplementedError: + return None def __from_arrow__(self, array: pa.Array | pa.ChunkedArray): """ diff --git a/pandas/tests/extension/arrow/test_arrow.py b/pandas/tests/extension/test_arrow.py similarity index 84% rename from pandas/tests/extension/arrow/test_arrow.py rename to pandas/tests/extension/test_arrow.py index f422c7a0fef74..0b3470dac5f76 100644 --- 
a/pandas/tests/extension/arrow/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1,3 +1,16 @@ +""" +This file contains a minimal set of tests for compliance with the extension +array interface test suite, and should contain no other tests. +The test suite for the full functionality of the array is located in +`pandas/tests/arrays/`. +The tests in this file are inherited from the BaseExtensionTests, and only +minimal tweaks should be applied to get the tests passing (by overwriting a +parent method). +Additional tests should either be added to one of the BaseExtensionTests +classes (if they are relevant for the extension interface for all dtypes), or +be added to the array-specific tests in `pandas/tests/arrays/`. +""" + from datetime import ( date, datetime, @@ -124,8 +137,8 @@ def test_reindex_non_na_fill_value(self, data_missing): super().test_reindex_non_na_fill_value(data_missing) def test_take_series(self, request, data): - tz = getattr(data._dtype.pyarrow_dtype, "tz", None) - unit = getattr(data._dtype.pyarrow_dtype, "unit", None) + tz = getattr(data.dtype.pyarrow_dtype, "tz", None) + unit = getattr(data.dtype.pyarrow_dtype, "unit", None) bad_units = ["ns"] if pa_version_under2p0: bad_units.extend(["s", "ms", "us"]) @@ -141,8 +154,8 @@ def test_take_series(self, request, data): super().test_take_series(data) def test_reindex(self, request, data, na_value): - tz = getattr(data._dtype.pyarrow_dtype, "tz", None) - unit = getattr(data._dtype.pyarrow_dtype, "unit", None) + tz = getattr(data.dtype.pyarrow_dtype, "tz", None) + unit = getattr(data.dtype.pyarrow_dtype, "unit", None) bad_units = ["ns"] if pa_version_under2p0: bad_units.extend(["s", "ms", "us"]) @@ -158,8 +171,8 @@ def test_reindex(self, request, data, na_value): super().test_reindex(data, na_value) def test_loc_iloc_frame_single_dtype(self, request, using_array_manager, data): - tz = getattr(data._dtype.pyarrow_dtype, "tz", None) - unit = getattr(data._dtype.pyarrow_dtype, "unit", None) + tz 
= getattr(data.dtype.pyarrow_dtype, "tz", None) + unit = getattr(data.dtype.pyarrow_dtype, "unit", None) bad_units = ["ns"] if pa_version_under2p0: bad_units.extend(["s", "ms", "us"]) From 86e178c06c5f5987ba13e76add0fe487e5046b2c Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Tue, 31 May 2022 18:35:48 -0700 Subject: [PATCH 26/27] Add pyarrow_dtype to _metadata --- pandas/core/arrays/arrow/dtype.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/arrays/arrow/dtype.py b/pandas/core/arrays/arrow/dtype.py index eecd183175acc..ddaa1b8f2337b 100644 --- a/pandas/core/arrays/arrow/dtype.py +++ b/pandas/core/arrays/arrow/dtype.py @@ -23,6 +23,7 @@ class ArrowDtype(StorageExtensionDtype): """ na_value = libmissing.NA + _metadata = ("storage", "pyarrow_dtype") def __init__(self, pyarrow_dtype: pa.DataType) -> None: super().__init__("pyarrow") From c5d029f4be52e824d636135138133dec28f14f14 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Wed, 1 Jun 2022 10:44:10 -0700 Subject: [PATCH 27/27] Address typing and fix data fixture --- pandas/core/arrays/arrow/dtype.py | 2 +- pandas/tests/extension/test_arrow.py | 82 ++++++++++++---------------- 2 files changed, 35 insertions(+), 49 deletions(-) diff --git a/pandas/core/arrays/arrow/dtype.py b/pandas/core/arrays/arrow/dtype.py index ddaa1b8f2337b..6c932f3b94e53 100644 --- a/pandas/core/arrays/arrow/dtype.py +++ b/pandas/core/arrays/arrow/dtype.py @@ -23,7 +23,7 @@ class ArrowDtype(StorageExtensionDtype): """ na_value = libmissing.NA - _metadata = ("storage", "pyarrow_dtype") + _metadata = ("storage", "pyarrow_dtype") # type: ignore[assignment] def __init__(self, pyarrow_dtype: pa.DataType) -> None: super().__init__("pyarrow") diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 0b3470dac5f76..4047c0db1fee4 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -18,7 +18,6 @@ timedelta, ) -import numpy as np import pytest from 
pandas.compat import ( @@ -44,52 +43,54 @@ def dtype(request): def data(dtype): pa_dtype = dtype.pyarrow_dtype if pa.types.is_boolean(pa_dtype): - data = [True, None, False, None, False, None] + data = [True, False] * 4 + [None] + [True, False] * 44 + [None] + [True, False] elif pa.types.is_floating(pa_dtype): - data = [1.0, None, 0.0, None, -2.0, None, 0.5, None, 99.9, None] + data = [1.0, 0.0] * 4 + [None] + [-2.0, -1.0] * 44 + [None] + [0.5, 99.5] elif pa.types.is_signed_integer(pa_dtype): - data = [1, None, 0, None, -2, None, 10] + data = [1, 0] * 4 + [None] + [-2, -1] * 44 + [None] + [1, 99] elif pa.types.is_unsigned_integer(pa_dtype): - data = [1, None, 0, None, 2, None, 10] + data = [1, 0] * 4 + [None] + [2, 1] * 44 + [None] + [1, 99] elif pa.types.is_date(pa_dtype): - data = [ - date(2022, 1, 1), - None, - date(1999, 12, 31), - None, - date(2000, 1, 1), - None, - ] + data = ( + [date(2022, 1, 1), date(1999, 12, 31)] * 4 + + [None] + + [date(2022, 1, 1), date(2022, 1, 1)] * 44 + + [None] + + [date(1999, 12, 31), date(1999, 12, 31)] + ) elif pa.types.is_timestamp(pa_dtype): - data = [ - datetime(2020, 1, 1, 1, 1, 1, 1), - None, - datetime(1999, 1, 1, 1, 1, 1, 1), - None, - datetime(2000, 1, 1, 1, 1, 1, 1), - None, - ] + data = ( + [datetime(2020, 1, 1, 1, 1, 1, 1), datetime(1999, 1, 1, 1, 1, 1, 1)] * 4 + + [None] + + [datetime(2020, 1, 1, 1), datetime(1999, 1, 1, 1)] * 44 + + [None] + + [datetime(2020, 1, 1), datetime(1999, 1, 1)] + ) elif pa.types.is_duration(pa_dtype): - data = [timedelta(1), None, timedelta(1, 1), None, timedelta(-1), None] + data = ( + [timedelta(1), timedelta(1, 1)] * 4 + + [None] + + [timedelta(-1), timedelta(0)] * 44 + + [None] + + [timedelta(-10), timedelta(10)] + ) elif pa.types.is_time(pa_dtype): - data = [time(12, 0), None, time(0, 12), None, time(0, 0), None] + data = ( + [time(12, 0), time(0, 12)] * 4 + + [None] + + [time(0, 0), time(1, 1)] * 44 + + [None] + + [time(0, 5), time(5, 0)] + ) else: - data = [] + raise 
NotImplementedError return pd.array(data, dtype=dtype) -@pytest.fixture -def data_not_missing(data): - data = data.take( - indices=np.full(len(data), -1), allow_fill=True, fill_value=data[0] - ) - return data - - @pytest.fixture def data_missing(data): """Length-2 array with [NA, Valid]""" - return type(data)._from_sequence([data[1], data[0]]) + return type(data)._from_sequence([None, data[0]]) @pytest.fixture @@ -121,21 +122,6 @@ class TestGetitemTests(base.BaseGetitemTests): def test_getitem_scalar(self, data): super().test_getitem_scalar(data) - def test_get(self, data_not_missing): - super().test_get(data_not_missing) - - def test_take_sequence(self, data_not_missing): - super().test_take_sequence(data_not_missing) - - def test_take(self, data_not_missing, na_value, na_cmp): - super().test_take(data_not_missing, na_value, na_cmp) - - def test_take_non_na_fill_value(self, data_missing): - super().test_take_non_na_fill_value(data_missing) - - def test_reindex_non_na_fill_value(self, data_missing): - super().test_reindex_non_na_fill_value(data_missing) - def test_take_series(self, request, data): tz = getattr(data.dtype.pyarrow_dtype, "tz", None) unit = getattr(data.dtype.pyarrow_dtype, "unit", None)