diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index c3355757350b9..1e1c9517d5ef7 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -85,6 +85,7 @@ Other enhancements - Add dtype of categories to ``repr`` information of :class:`CategoricalDtype` (:issue:`52179`) - Added to the escape mode "latex-math" preserving without escaping all characters between "\(" and "\)" in formatter (:issue:`51903`) - Adding ``engine_kwargs`` parameter to :meth:`DataFrame.read_excel` (:issue:`52214`) +- Implemented ``__from_arrow__`` on :class:`DatetimeTZDtype`. (:issue:`52201`) - Implemented ``__pandas_priority__`` to allow custom types to take precedence over :class:`DataFrame`, :class:`Series`, :class:`Index`, or :class:`ExtensionArray` for arithmetic operations, :ref:`see the developer guide ` (:issue:`48347`) - Improve error message when having incompatible columns using :meth:`DataFrame.merge` (:issue:`51861`) - Improve error message when setting :class:`DataFrame` with wrong number of columns through :meth:`DataFrame.isetitem` (:issue:`51701`) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 12245a144ec2a..5f655a89abf99 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -2344,7 +2344,9 @@ def _validate_dt64_dtype(dtype): # a tz-aware Timestamp (with a tz specific to its datetime) will # be incorrect(ish?) for the array as a whole dtype = cast(DatetimeTZDtype, dtype) - dtype = DatetimeTZDtype(tz=timezones.tz_standardize(dtype.tz)) + dtype = DatetimeTZDtype( + unit=dtype.unit, tz=timezones.tz_standardize(dtype.tz) + ) return dtype diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 4d336f1edbb2d..65b612ce019dd 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -817,6 +817,40 @@ def __eq__(self, other: Any) -> bool: and tz_compare(self.tz, other.tz) ) + def __from_arrow__( + self, array: pyarrow.Array | pyarrow.ChunkedArray + ) -> DatetimeArray: + """ + Construct DatetimeArray from pyarrow Array/ChunkedArray. + + Note: If the units in the pyarrow Array are the same as this + DatetimeDtype, then values corresponding to the integer representation + of ``NaT`` (e.g. one nanosecond before :attr:`pandas.Timestamp.min`) + are converted to ``NaT``, regardless of the null indicator in the + pyarrow array. + + Parameters + ---------- + array : pyarrow.Array or pyarrow.ChunkedArray + The Arrow array to convert to DatetimeArray. + + Returns + ------- + extension array : DatetimeArray + """ + import pyarrow + + from pandas.core.arrays import DatetimeArray + + array = array.cast(pyarrow.timestamp(unit=self._unit), safe=True) + + if isinstance(array, pyarrow.Array): + np_arr = array.to_numpy(zero_copy_only=False) + else: + np_arr = array.to_numpy() + + return DatetimeArray(np_arr, dtype=self, copy=False) + def __setstate__(self, state) -> None: # for pickle compat. __get_state__ is defined in the # PandasExtensionDtype superclass and uses the public properties to diff --git a/pandas/tests/arrays/datetimes/test_constructors.py b/pandas/tests/arrays/datetimes/test_constructors.py index bbc66dcd328c3..30f47e37fedf5 100644 --- a/pandas/tests/arrays/datetimes/test_constructors.py +++ b/pandas/tests/arrays/datetimes/test_constructors.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._libs import iNaT + from pandas.core.dtypes.dtypes import DatetimeTZDtype import pandas as pd @@ -168,3 +170,87 @@ def test_2d(self, order): res = DatetimeArray._from_sequence(arr) expected = DatetimeArray._from_sequence(arr.ravel()).reshape(arr.shape) tm.assert_datetime_array_equal(res, expected) + + +# ---------------------------------------------------------------------------- +# Arrow interaction + + +EXTREME_VALUES = [0, 123456789, None, iNaT, 2**63 - 1, -(2**63) + 1] +FINE_TO_COARSE_SAFE = [123_000_000_000, None, -123_000_000_000] +COARSE_TO_FINE_SAFE = [123, None, -123] + + +@pytest.mark.parametrize( + ("pa_unit", "pd_unit", "pa_tz", "pd_tz", "data"), + [ + ("s", "s", "UTC", "UTC", EXTREME_VALUES), + ("ms", "ms", "UTC", "Europe/Berlin", EXTREME_VALUES), + ("us", "us", "US/Eastern", "UTC", EXTREME_VALUES), + ("ns", "ns", "US/Central", "Asia/Kolkata", EXTREME_VALUES), + ("ns", "s", "UTC", "UTC", FINE_TO_COARSE_SAFE), + ("us", "ms", "UTC", "Europe/Berlin", FINE_TO_COARSE_SAFE), + ("ms", "us", "US/Eastern", "UTC", COARSE_TO_FINE_SAFE), + ("s", "ns", "US/Central", "Asia/Kolkata", COARSE_TO_FINE_SAFE), + ], +) +def test_from_arrowtest_from_arrow_with_different_units_and_timezones_with_( + pa_unit, pd_unit, pa_tz, pd_tz, data +): + pa = pytest.importorskip("pyarrow") + + pa_type = pa.timestamp(pa_unit, tz=pa_tz) + arr = pa.array(data, type=pa_type) + dtype = DatetimeTZDtype(unit=pd_unit, tz=pd_tz) + + result = dtype.__from_arrow__(arr) + expected = DatetimeArray( + np.array(data, dtype=f"datetime64[{pa_unit}]").astype(f"datetime64[{pd_unit}]"), + dtype=dtype, + ) + tm.assert_extension_array_equal(result, expected) + + result = dtype.__from_arrow__(pa.chunked_array([arr])) + tm.assert_extension_array_equal(result, expected) + + +@pytest.mark.parametrize( + ("unit", "tz"), + [ + ("s", "UTC"), + ("ms", "Europe/Berlin"), + ("us", "US/Eastern"), + ("ns", "Asia/Kolkata"), + ("ns", "UTC"), + ], +) +def test_from_arrow_from_empty(unit, tz): + pa = pytest.importorskip("pyarrow") + + data = [] + arr = pa.array(data) + dtype = DatetimeTZDtype(unit=unit, tz=tz) + + result = dtype.__from_arrow__(arr) + expected = DatetimeArray(np.array(data, dtype=f"datetime64[{unit}]")) + expected = expected.tz_localize(tz=tz) + tm.assert_extension_array_equal(result, expected) + + result = dtype.__from_arrow__(pa.chunked_array([arr])) + tm.assert_extension_array_equal(result, expected) + + +def test_from_arrow_from_integers(): + pa = pytest.importorskip("pyarrow") + + data = [0, 123456789, None, 2**63 - 1, iNaT, -123456789] + arr = pa.array(data) + dtype = DatetimeTZDtype(unit="ns", tz="UTC") + + result = dtype.__from_arrow__(arr) + expected = DatetimeArray(np.array(data, dtype="datetime64[ns]")) + expected = expected.tz_localize("UTC") + tm.assert_extension_array_equal(result, expected) + + result = dtype.__from_arrow__(pa.chunked_array([arr])) + tm.assert_extension_array_equal(result, expected)