diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index 53e003e2ed7dd..fbf1cea670c5c 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -26,6 +26,7 @@ ) from pandas._typing import Dtype +from pandas.compat import pa_version_under1p01 from pandas.core.dtypes.common import ( is_float_dtype, @@ -193,6 +194,45 @@ ] ] +if not pa_version_under1p01: + import pyarrow as pa + + UNSIGNED_INT_PYARROW_DTYPES = [pa.uint8(), pa.uint16(), pa.uint32(), pa.uint64()] + SIGNED_INT_PYARROW_DTYPES = [pa.int8(), pa.int16(), pa.int32(), pa.int64()] + ALL_INT_PYARROW_DTYPES = UNSIGNED_INT_PYARROW_DTYPES + SIGNED_INT_PYARROW_DTYPES + + FLOAT_PYARROW_DTYPES = [pa.float32(), pa.float64()] + STRING_PYARROW_DTYPES = [pa.string(), pa.utf8()] + + TIME_PYARROW_DTYPES = [ + pa.time32("s"), + pa.time32("ms"), + pa.time64("us"), + pa.time64("ns"), + ] + DATE_PYARROW_DTYPES = [pa.date32(), pa.date64()] + DATETIME_PYARROW_DTYPES = [ + pa.timestamp(unit=unit, tz=tz) + for unit in ["s", "ms", "us", "ns"] + for tz in [None, "UTC", "US/Pacific", "US/Eastern"] + ] + TIMEDELTA_PYARROW_DTYPES = [pa.duration(unit) for unit in ["s", "ms", "us", "ns"]] + + BOOL_PYARROW_DTYPES = [pa.bool_()] + + # TODO: Add container like pyarrow types: + # https://arrow.apache.org/docs/python/api/datatypes.html#factory-functions + ALL_PYARROW_DTYPES = ( + ALL_INT_PYARROW_DTYPES + + FLOAT_PYARROW_DTYPES + + TIME_PYARROW_DTYPES + + DATE_PYARROW_DTYPES + + DATETIME_PYARROW_DTYPES + + TIMEDELTA_PYARROW_DTYPES + + BOOL_PYARROW_DTYPES + ) + + EMPTY_STRING_PATTERN = re.compile("^$") # set testing_mode diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index fdd505e259dd9..66bb12db277fc 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -9,6 +9,8 @@ import numpy as np from pandas._typing import ( + Dtype, + PositionalIndexer, TakeIndexer, npt, ) @@ -24,6 +26,7 @@ is_array_like, is_bool_dtype, is_integer, + 
is_integer_dtype, is_scalar, ) from pandas.core.dtypes.missing import isna @@ -31,6 +34,7 @@ from pandas.core.arrays.base import ExtensionArray from pandas.core.indexers import ( check_array_indexer, + unpack_tuple_and_ellipses, validate_indices, ) @@ -39,6 +43,7 @@ import pyarrow.compute as pc from pandas.core.arrays.arrow._arrow_utils import fallback_performancewarning + from pandas.core.arrays.arrow.dtype import ArrowDtype if TYPE_CHECKING: from pandas import Series @@ -48,16 +53,130 @@ class ArrowExtensionArray(ExtensionArray): """ - Base class for ExtensionArray backed by Arrow array. + Base class for ExtensionArray backed by Arrow ChunkedArray. """ _data: pa.ChunkedArray - def __init__(self, values: pa.ChunkedArray) -> None: - self._data = values + def __init__(self, values: pa.Array | pa.ChunkedArray) -> None: + if pa_version_under1p01: + msg = "pyarrow>=1.0.0 is required for PyArrow backed ArrowExtensionArray." + raise ImportError(msg) + if isinstance(values, pa.Array): + self._data = pa.chunked_array([values]) + elif isinstance(values, pa.ChunkedArray): + self._data = values + else: + raise ValueError( + f"Unsupported type '{type(values)}' for ArrowExtensionArray" + ) + self._dtype = ArrowDtype(self._data.type) + + @classmethod + def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False): + """ + Construct a new ExtensionArray from a sequence of scalars. + """ + if isinstance(dtype, ArrowDtype): + pa_dtype = dtype.pyarrow_dtype + elif dtype: + pa_dtype = pa.from_numpy_dtype(dtype) + else: + pa_dtype = None + + if isinstance(scalars, cls): + data = scalars._data + if pa_dtype: + data = data.cast(pa_dtype) + return cls(data) + else: + return cls( + pa.chunked_array(pa.array(scalars, type=pa_dtype, from_pandas=True)) + ) + + @classmethod + def _from_sequence_of_strings( + cls, strings, *, dtype: Dtype | None = None, copy=False + ): + """ + Construct a new ExtensionArray from a sequence of strings. 
+ """ + return cls._from_sequence(strings, dtype=dtype, copy=copy) + + def __getitem__(self, item: PositionalIndexer): + """Select a subset of self. + + Parameters + ---------- + item : int, slice, or ndarray + * int: The position in 'self' to get. + * slice: A slice object, where 'start', 'stop', and 'step' are + integers or None + * ndarray: A 1-d boolean NumPy ndarray the same length as 'self' + + Returns + ------- + item : scalar or ExtensionArray + + Notes + ----- + For scalar ``item``, return a scalar value suitable for the array's + type. This should be an instance of ``self.dtype.type``. + For slice ``key``, return an instance of ``ExtensionArray``, even + if the slice is length 0 or 1. + For a boolean mask, return an instance of ``ExtensionArray``, filtered + to the values where ``item`` is True. + """ + item = check_array_indexer(self, item) + + if isinstance(item, np.ndarray): + if not len(item): + # Removable once we migrate StringDtype[pyarrow] to ArrowDtype[string] + if self._dtype.name == "string" and self._dtype.storage == "pyarrow": + pa_dtype = pa.string() + else: + pa_dtype = self._dtype.pyarrow_dtype + return type(self)(pa.chunked_array([], type=pa_dtype)) + elif is_integer_dtype(item.dtype): + return self.take(item) + elif is_bool_dtype(item.dtype): + return type(self)(self._data.filter(item)) + else: + raise IndexError( + "Only integers, slices and integer or " + "boolean arrays are valid indices." + ) + elif isinstance(item, tuple): + item = unpack_tuple_and_ellipses(item) + + # error: Non-overlapping identity check (left operand type: + # "Union[Union[int, integer[Any]], Union[slice, List[int], + # ndarray[Any, Any]]]", right operand type: "ellipsis") + if item is Ellipsis: # type: ignore[comparison-overlap] + # TODO: should be handled by pyarrow? + item = slice(None) + + if is_scalar(item) and not is_integer(item): + # e.g. 
"foo" or 2.5 + # exception message copied from numpy + raise IndexError( + r"only integers, slices (`:`), ellipsis (`...`), numpy.newaxis " + r"(`None`) and integer or boolean arrays are valid indices" + ) + # We are not an array indexer, so maybe e.g. a slice or integer + # indexer. We dispatch to pyarrow. + value = self._data[item] + if isinstance(value, pa.ChunkedArray): + return type(self)(value) + else: + scalar = value.as_py() + if scalar is None: + return self._dtype.na_value + else: + return scalar def __arrow_array__(self, type=None): - """Convert myself to a pyarrow Array or ChunkedArray.""" + """Convert myself to a pyarrow ChunkedArray.""" return self._data def equals(self, other) -> bool: @@ -67,6 +186,13 @@ def equals(self, other) -> bool: # TODO: is this documented somewhere? return self._data == other._data + @property + def dtype(self) -> ArrowDtype: + """ + An instance of 'ExtensionDtype'. + """ + return self._dtype + @property def nbytes(self) -> int: """ @@ -377,7 +503,8 @@ def _indexing_key_to_indices( def _maybe_convert_setitem_value(self, value): """Maybe convert value to be pyarrow compatible.""" - raise NotImplementedError() + # TODO: Make more robust like ArrowStringArray._maybe_convert_setitem_value + return value def _set_via_chunk_iteration( self, indices: npt.NDArray[np.intp], value: npt.NDArray[Any] diff --git a/pandas/core/arrays/arrow/dtype.py b/pandas/core/arrays/arrow/dtype.py index c0ecb0856f27f..6c932f3b94e53 100644 --- a/pandas/core/arrays/arrow/dtype.py +++ b/pandas/core/arrays/arrow/dtype.py @@ -1,35 +1,60 @@ from __future__ import annotations +import re + import numpy as np import pyarrow as pa +from pandas._libs import missing as libmissing from pandas._typing import DtypeObj from pandas.util._decorators import cache_readonly -from pandas.core.dtypes.base import StorageExtensionDtype - -from pandas.core.arrays.arrow import ArrowExtensionArray +from pandas.core.dtypes.base import ( + StorageExtensionDtype, + 
register_extension_dtype, +) +@register_extension_dtype class ArrowDtype(StorageExtensionDtype): """ - Base class for dtypes for BaseArrowArray subclasses. + Base class for dtypes for ArrowExtensionArray. Modeled after BaseMaskedDtype """ - name: str - base = None - type: pa.DataType + na_value = libmissing.NA + _metadata = ("storage", "pyarrow_dtype") # type: ignore[assignment] - na_value = pa.NA + def __init__(self, pyarrow_dtype: pa.DataType) -> None: + super().__init__("pyarrow") + if not isinstance(pyarrow_dtype, pa.DataType): + raise ValueError( + f"pyarrow_dtype ({pyarrow_dtype}) must be an instance " + f"of a pyarrow.DataType. Got {type(pyarrow_dtype)} instead." + ) + self.pyarrow_dtype = pyarrow_dtype - def __init__(self, storage="pyarrow") -> None: - super().__init__(storage) + @property + def type(self): + """ + Returns pyarrow.DataType. + """ + return type(self.pyarrow_dtype) + + @property + def name(self) -> str: # type: ignore[override] + """ + A string identifying the data type. + """ + return str(self.pyarrow_dtype) @cache_readonly def numpy_dtype(self) -> np.dtype: """Return an instance of the related numpy dtype""" - return self.type.to_pandas_dtype() + try: + return np.dtype(self.pyarrow_dtype.to_pandas_dtype()) + except (NotImplementedError, TypeError): + return np.dtype(object) @cache_readonly def kind(self) -> str: @@ -49,6 +74,8 @@ def construct_array_type(cls): ------- type """ + from pandas.core.arrays.arrow import ArrowExtensionArray + return ArrowExtensionArray @classmethod @@ -59,29 +86,52 @@ def construct_from_string(cls, string: str): Parameters ---------- string : str + string should follow the format f"{pyarrow_type}[pyarrow]" + e.g. 
int64[pyarrow] """ if not isinstance(string, str): raise TypeError( f"'construct_from_string' expects a string, got {type(string)}" ) - if string == f"{cls.name}[pyarrow]": - return cls(storage="pyarrow") - raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'") + if not string.endswith("[pyarrow]"): + raise TypeError(f"string {string} must end with '[pyarrow]'") + base_type = string.split("[pyarrow]")[0] + pa_dtype = getattr(pa, base_type, None) + if pa_dtype is None: + has_parameters = re.search(r"\[.*\]", base_type) + if has_parameters: + raise NotImplementedError( + "Passing pyarrow type specific parameters " + f"({has_parameters.group()}) in the string is not supported. " + "Please construct an ArrowDtype object with a pyarrow_dtype " + "instance with specific parameters." + ) + raise TypeError(f"'{base_type}' is not a valid pyarrow data type.") + return cls(pa_dtype()) + + @property + def _is_numeric(self) -> bool: + """ + Whether columns with this dtype should be considered numeric. + """ + # TODO: pa.types.is_boolean? + return ( + pa.types.is_integer(self.pyarrow_dtype) + or pa.types.is_floating(self.pyarrow_dtype) + or pa.types.is_decimal(self.pyarrow_dtype) + ) - @classmethod - def from_numpy_dtype(cls, dtype: np.dtype) -> ArrowDtype: + @property + def _is_boolean(self) -> bool: """ - Construct the ArrowDtype corresponding to the given numpy dtype. + Whether this dtype should be considered boolean. """ - # TODO: This may be incomplete - pa_dtype = pa.from_numpy_dtype(dtype) - if pa_dtype is cls.type: - return cls() - raise NotImplementedError(dtype) + return pa.types.is_boolean(self.pyarrow_dtype) def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None: # We unwrap any masked dtypes, find the common dtype we would use # for that, then re-mask the result. 
+ # Mirrors BaseMaskedDtype from pandas.core.dtypes.cast import find_common_type new_dtype = find_common_type( @@ -91,11 +141,11 @@ def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None: ] ) if not isinstance(new_dtype, np.dtype): - # If we ever support e.g. Masked[DatetimeArray] then this will change return None try: - return type(self).from_numpy_dtype(new_dtype) - except (KeyError, NotImplementedError): + pa_dtype = pa.from_numpy_dtype(new_dtype) + return type(self)(pa_dtype) + except NotImplementedError: return None def __from_arrow__(self, array: pa.Array | pa.ChunkedArray): diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 21b5dc625956e..45683d83a1303 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -64,9 +64,6 @@ class StringDtype(StorageExtensionDtype): StringDtype is considered experimental. The implementation and parts of the API may change without warning. - In particular, StringDtype.na_value may change to no longer be - ``pd.NA``. 
- Parameters ---------- storage : {"python", "pyarrow"}, optional diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 8b6f1ffcfa59b..a07f748fa0c8c 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -2,10 +2,7 @@ from collections.abc import Callable # noqa: PDF001 import re -from typing import ( - Union, - overload, -) +from typing import Union import numpy as np @@ -16,10 +13,7 @@ from pandas._typing import ( Dtype, NpDtype, - PositionalIndexer, Scalar, - ScalarIndexer, - SequenceIndexer, npt, ) from pandas.compat import ( @@ -32,7 +26,6 @@ from pandas.core.dtypes.common import ( is_bool_dtype, is_dtype_equal, - is_integer, is_integer_dtype, is_object_dtype, is_scalar, @@ -50,10 +43,6 @@ BaseStringArray, StringDtype, ) -from pandas.core.indexers import ( - check_array_indexer, - unpack_tuple_and_ellipses, -) from pandas.core.strings.object_array import ObjectStringArrayMixin if not pa_version_under1p01: @@ -76,7 +65,7 @@ def _chk_pyarrow_available() -> None: if pa_version_under1p01: - msg = "pyarrow>=1.0.0 is required for PyArrow backed StringArray." + msg = "pyarrow>=1.0.0 is required for PyArrow backed ArrowExtensionArray." raise ImportError(msg) @@ -132,13 +121,9 @@ class ArrowStringArray( """ def __init__(self, values) -> None: + super().__init__(values) + # TODO: Migrate to ArrowDtype instead self._dtype = StringDtype(storage="pyarrow") - if isinstance(values, pa.Array): - self._data = pa.chunked_array([values]) - elif isinstance(values, pa.ChunkedArray): - self._data = values - else: - raise ValueError(f"Unsupported type '{type(values)}' for ArrowStringArray") if not pa.types.is_string(self._data.type): raise ValueError( @@ -174,7 +159,7 @@ def _from_sequence_of_strings( return cls._from_sequence(strings, dtype=dtype, copy=copy) @property - def dtype(self) -> StringDtype: + def dtype(self) -> StringDtype: # type: ignore[override] """ An instance of 'string[pyarrow]'. 
""" @@ -205,86 +190,6 @@ def to_numpy( result[mask] = na_value return result - @overload - def __getitem__(self, item: ScalarIndexer) -> ArrowStringScalarOrNAT: - ... - - @overload - def __getitem__(self: ArrowStringArray, item: SequenceIndexer) -> ArrowStringArray: - ... - - def __getitem__( - self: ArrowStringArray, item: PositionalIndexer - ) -> ArrowStringArray | ArrowStringScalarOrNAT: - """Select a subset of self. - - Parameters - ---------- - item : int, slice, or ndarray - * int: The position in 'self' to get. - * slice: A slice object, where 'start', 'stop', and 'step' are - integers or None - * ndarray: A 1-d boolean NumPy ndarray the same length as 'self' - - Returns - ------- - item : scalar or ExtensionArray - - Notes - ----- - For scalar ``item``, return a scalar value suitable for the array's - type. This should be an instance of ``self.dtype.type``. - For slice ``key``, return an instance of ``ExtensionArray``, even - if the slice is length 0 or 1. - For a boolean mask, return an instance of ``ExtensionArray``, filtered - to the values where ``item`` is True. - """ - item = check_array_indexer(self, item) - - if isinstance(item, np.ndarray): - if not len(item): - return type(self)(pa.chunked_array([], type=pa.string())) - elif is_integer_dtype(item.dtype): - return self.take(item) - elif is_bool_dtype(item.dtype): - return type(self)(self._data.filter(item)) - else: - raise IndexError( - "Only integers, slices and integer or " - "boolean arrays are valid indices." - ) - elif isinstance(item, tuple): - item = unpack_tuple_and_ellipses(item) - - # error: Non-overlapping identity check (left operand type: - # "Union[Union[int, integer[Any]], Union[slice, List[int], - # ndarray[Any, Any]]]", right operand type: "ellipsis") - if item is Ellipsis: # type: ignore[comparison-overlap] - # TODO: should be handled by pyarrow? - item = slice(None) - - if is_scalar(item) and not is_integer(item): - # e.g. 
"foo" or 2.5 - # exception message copied from numpy - raise IndexError( - r"only integers, slices (`:`), ellipsis (`...`), numpy.newaxis " - r"(`None`) and integer or boolean arrays are valid indices" - ) - # We are not an array indexer, so maybe e.g. a slice or integer - # indexer. We dispatch to pyarrow. - value = self._data[item] - if isinstance(value, pa.ChunkedArray): - return type(self)(value) - else: - return self._as_pandas_scalar(value) - - def _as_pandas_scalar(self, arrow_scalar: pa.Scalar): - scalar = arrow_scalar.as_py() - if scalar is None: - return self._dtype.na_value - else: - return scalar - def _cmp_method(self, other, op): from pandas.arrays import BooleanArray diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index 9762b779477e4..cffac15ef6496 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -408,7 +408,7 @@ def __str__(self): return self.name def __eq__(self, other: Any) -> bool: - if isinstance(other, self.type) and other == self.name: + if isinstance(other, str) and other == self.name: return True return super().__eq__(other) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index e9d48eb937b36..b563f84207b22 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -262,7 +262,7 @@ def test_constructor_raises(cls): if cls is pd.arrays.StringArray: msg = "StringArray requires a sequence of strings or pandas.NA" else: - msg = "Unsupported type '' for ArrowStringArray" + msg = "Unsupported type '' for ArrowExtensionArray" with pytest.raises(ValueError, match=msg): cls(np.array(["a", "b"], dtype="S1")) diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index de1b7a9c603af..f43cf298857a0 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -59,7 +59,7 @@ def 
test_constructor_not_string_type_raises(array, chunked): pytest.skip("chunked not applicable to numpy array") arr = pa.chunked_array(arr) if array is np: - msg = "Unsupported type '' for ArrowStringArray" + msg = "Unsupported type '' for ArrowExtensionArray" else: msg = re.escape( "ArrowStringArray requires a PyArrow (chunked) array of string type" @@ -122,7 +122,7 @@ def test_from_sequence_wrong_dtype_raises(): reason="pyarrow is installed", ) def test_pyarrow_not_installed_raises(): - msg = re.escape("pyarrow>=1.0.0 is required for PyArrow backed StringArray") + msg = re.escape("pyarrow>=1.0.0 is required for PyArrow backed") with pytest.raises(ImportError, match=msg): StringDtype(storage="pyarrow") diff --git a/pandas/tests/extension/arrow/arrays.py b/pandas/tests/extension/arrow/arrays.py index d19a6245809be..22595c4e461d7 100644 --- a/pandas/tests/extension/arrow/arrays.py +++ b/pandas/tests/extension/arrow/arrays.py @@ -185,7 +185,7 @@ def __init__(self, values) -> None: assert values.type == pa.bool_() self._data = values - self._dtype = ArrowBoolDtype() + self._dtype = ArrowBoolDtype() # type: ignore[assignment] class ArrowStringArray(ArrowExtensionArray): @@ -195,4 +195,4 @@ def __init__(self, values) -> None: assert values.type == pa.string() self._data = values - self._dtype = ArrowStringDtype() + self._dtype = ArrowStringDtype() # type: ignore[assignment] diff --git a/pandas/tests/extension/arrow/test_timestamp.py b/pandas/tests/extension/arrow/test_timestamp.py index b2750784ab3d6..5b81940e5a6c0 100644 --- a/pandas/tests/extension/arrow/test_timestamp.py +++ b/pandas/tests/extension/arrow/test_timestamp.py @@ -46,7 +46,7 @@ def __init__(self, values) -> None: assert values.type == pa.timestamp("us") self._data = values - self._dtype = ArrowTimestampUSDtype() + self._dtype = ArrowTimestampUSDtype() # type: ignore[assignment] def test_constructor_extensionblock(): diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py 
new file mode 100644 index 0000000000000..4047c0db1fee4 --- /dev/null +++ b/pandas/tests/extension/test_arrow.py @@ -0,0 +1,184 @@ +""" +This file contains a minimal set of tests for compliance with the extension +array interface test suite, and should contain no other tests. +The test suite for the full functionality of the array is located in +`pandas/tests/arrays/`. +The tests in this file are inherited from the BaseExtensionTests, and only +minimal tweaks should be applied to get the tests passing (by overwriting a +parent method). +Additional tests should either be added to one of the BaseExtensionTests +classes (if they are relevant for the extension interface for all dtypes), or +be added to the array-specific tests in `pandas/tests/arrays/`. +""" + +from datetime import ( + date, + datetime, + time, + timedelta, +) + +import pytest + +from pandas.compat import ( + pa_version_under2p0, + pa_version_under3p0, +) + +import pandas as pd +import pandas._testing as tm +from pandas.tests.extension import base + +pa = pytest.importorskip("pyarrow", minversion="1.0.1") + +from pandas.core.arrays.arrow.dtype import ArrowDtype # isort:skip + + +@pytest.fixture(params=tm.ALL_PYARROW_DTYPES) +def dtype(request): + return ArrowDtype(pyarrow_dtype=request.param) + + +@pytest.fixture +def data(dtype): + pa_dtype = dtype.pyarrow_dtype + if pa.types.is_boolean(pa_dtype): + data = [True, False] * 4 + [None] + [True, False] * 44 + [None] + [True, False] + elif pa.types.is_floating(pa_dtype): + data = [1.0, 0.0] * 4 + [None] + [-2.0, -1.0] * 44 + [None] + [0.5, 99.5] + elif pa.types.is_signed_integer(pa_dtype): + data = [1, 0] * 4 + [None] + [-2, -1] * 44 + [None] + [1, 99] + elif pa.types.is_unsigned_integer(pa_dtype): + data = [1, 0] * 4 + [None] + [2, 1] * 44 + [None] + [1, 99] + elif pa.types.is_date(pa_dtype): + data = ( + [date(2022, 1, 1), date(1999, 12, 31)] * 4 + + [None] + + [date(2022, 1, 1), date(2022, 1, 1)] * 44 + + [None] + + [date(1999, 12, 31), date(1999, 12, 
31)] + ) + elif pa.types.is_timestamp(pa_dtype): + data = ( + [datetime(2020, 1, 1, 1, 1, 1, 1), datetime(1999, 1, 1, 1, 1, 1, 1)] * 4 + + [None] + + [datetime(2020, 1, 1, 1), datetime(1999, 1, 1, 1)] * 44 + + [None] + + [datetime(2020, 1, 1), datetime(1999, 1, 1)] + ) + elif pa.types.is_duration(pa_dtype): + data = ( + [timedelta(1), timedelta(1, 1)] * 4 + + [None] + + [timedelta(-1), timedelta(0)] * 44 + + [None] + + [timedelta(-10), timedelta(10)] + ) + elif pa.types.is_time(pa_dtype): + data = ( + [time(12, 0), time(0, 12)] * 4 + + [None] + + [time(0, 0), time(1, 1)] * 44 + + [None] + + [time(0, 5), time(5, 0)] + ) + else: + raise NotImplementedError + return pd.array(data, dtype=dtype) + + +@pytest.fixture +def data_missing(data): + """Length-2 array with [NA, Valid]""" + return type(data)._from_sequence([None, data[0]]) + + +@pytest.fixture +def na_value(): + """The scalar missing value for this type. Default 'None'""" + return pd.NA + + +class TestConstructors(base.BaseConstructorsTests): + @pytest.mark.xfail( + reason=( + "str(dtype) constructs " + "e.g. 
in64[pyarrow] like int64 (numpy) " + "due to StorageExtensionDtype.__str__" + ) + ) + def test_from_dtype(self, data): + super().test_from_dtype(data) + + +class TestGetitemTests(base.BaseGetitemTests): + @pytest.mark.xfail( + reason=( + "data.dtype.type return pyarrow.DataType " + "but this (intentionally) returns " + "Python scalars or pd.Na" + ) + ) + def test_getitem_scalar(self, data): + super().test_getitem_scalar(data) + + def test_take_series(self, request, data): + tz = getattr(data.dtype.pyarrow_dtype, "tz", None) + unit = getattr(data.dtype.pyarrow_dtype, "unit", None) + bad_units = ["ns"] + if pa_version_under2p0: + bad_units.extend(["s", "ms", "us"]) + if pa_version_under3p0 and tz not in (None, "UTC") and unit in bad_units: + request.node.add_marker( + pytest.mark.xfail( + reason=( + f"Not supported by pyarrow < 3.0 " + f"with timestamp type {tz} and {unit}" + ) + ) + ) + super().test_take_series(data) + + def test_reindex(self, request, data, na_value): + tz = getattr(data.dtype.pyarrow_dtype, "tz", None) + unit = getattr(data.dtype.pyarrow_dtype, "unit", None) + bad_units = ["ns"] + if pa_version_under2p0: + bad_units.extend(["s", "ms", "us"]) + if pa_version_under3p0 and tz not in (None, "UTC") and unit in bad_units: + request.node.add_marker( + pytest.mark.xfail( + reason=( + f"Not supported by pyarrow < 3.0 " + f"with timestamp type {tz} and {unit}" + ) + ) + ) + super().test_reindex(data, na_value) + + def test_loc_iloc_frame_single_dtype(self, request, using_array_manager, data): + tz = getattr(data.dtype.pyarrow_dtype, "tz", None) + unit = getattr(data.dtype.pyarrow_dtype, "unit", None) + bad_units = ["ns"] + if pa_version_under2p0: + bad_units.extend(["s", "ms", "us"]) + if ( + pa_version_under3p0 + and not using_array_manager + and tz not in (None, "UTC") + and unit in bad_units + ): + request.node.add_marker( + pytest.mark.xfail( + reason=( + f"Not supported by pyarrow < 3.0 " + f"with timestamp type {tz} and {unit}" + ) + ) + ) + 
super().test_loc_iloc_frame_single_dtype(data) + + +def test_arrowdtype_construct_from_string_type_with_parameters(): + with pytest.raises(NotImplementedError, match="Passing pyarrow type"): + ArrowDtype.construct_from_string("timestamp[s][pyarrow]")