Skip to content

ENH: add __from_arrow__ support to DatetimeTZDtype #52201

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 34 commits into from
Apr 17, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
6985319
ENH: add `__from_pyarrow__` support to `DatetimeTZDtype`
tswast Mar 25, 2023
a16e9b1
handle empty pyarrow arrays
tswast Mar 25, 2023
01f5b32
Merge branch 'main' into DatetimeTZDtype-from_arrow
tswast Mar 25, 2023
080d2a4
add test with iNaT
tswast Mar 25, 2023
52874c8
Merge remote-tracking branch 'origin/DatetimeTZDtype-from_arrow' into…
tswast Mar 25, 2023
4e305af
mypy
tswast Mar 25, 2023
2010147
Merge branch 'main' into DatetimeTZDtype-from_arrow
tswast Mar 25, 2023
ea66833
nits
tswast Mar 25, 2023
0b0cfa0
Merge branch 'main' into DatetimeTZDtype-from_arrow
tswast Mar 27, 2023
773ac7a
Merge branch 'main' into DatetimeTZDtype-from_arrow
tswast Mar 27, 2023
963978a
Merge branch 'main' into DatetimeTZDtype-from_arrow
tswast Mar 28, 2023
7c4925e
Merge branch 'main' of https://github.com/pandas-dev/pandas into Date…
tswast Mar 28, 2023
8f89a9c
add docs for NaT handling
tswast Mar 28, 2023
2cea00d
Merge branch 'DatetimeTZDtype-from_arrow' of github.com:tswast/pandas…
tswast Mar 28, 2023
338aa59
Merge branch 'main' into DatetimeTZDtype-from_arrow
tswast Mar 29, 2023
92b0935
Merge branch 'main' into DatetimeTZDtype-from_arrow
tswast Mar 29, 2023
bdbb87f
Merge branch 'main' into DatetimeTZDtype-from_arrow
tswast Mar 29, 2023
8b598c7
sort by issue number
tswast Mar 29, 2023
587573c
Merge branch 'main' into DatetimeTZDtype-from_arrow
tswast Mar 30, 2023
b8ad477
use arrow to_numpy
tswast Mar 30, 2023
2fef3e3
Merge remote-tracking branch 'upstream/main' into DatetimeTZDtype-fro…
tswast Mar 30, 2023
ab60ab7
don't copy
tswast Mar 30, 2023
3951f68
Merge remote-tracking branch 'upstream/main' into DatetimeTZDtype-fro…
tswast Mar 30, 2023
ccb1085
use safe cast
tswast Mar 30, 2023
7aca8f5
Merge branch 'main' into DatetimeTZDtype-from_arrow
tswast Mar 30, 2023
a24de30
Merge branch 'main' into DatetimeTZDtype-from_arrow
tswast Apr 3, 2023
4671c54
Merge branch 'main' into DatetimeTZDtype-from_arrow
tswast Apr 3, 2023
eb8013c
Merge branch 'main' into DatetimeTZDtype-from_arrow
tswast Apr 6, 2023
6b1d800
Merge branch 'main' into DatetimeTZDtype-from_arrow
tswast Apr 7, 2023
32dd0cf
Merge branch 'main' into DatetimeTZDtype-from_arrow
tswast Apr 12, 2023
f683bdd
Merge branch 'main' into DatetimeTZDtype-from_arrow
tswast Apr 13, 2023
f639656
don't localize
tswast Apr 13, 2023
f528a48
don't convert to ns in DatetimeArray constructor
tswast Apr 14, 2023
8c50925
Merge branch 'main' into DatetimeTZDtype-from_arrow
tswast Apr 17, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ Other enhancements
- Add dtype of categories to ``repr`` information of :class:`CategoricalDtype` (:issue:`52179`)
- Extended the escape mode "latex-math" in the formatter to preserve, without escaping, all characters between "\(" and "\)" (:issue:`51903`)
- Adding ``engine_kwargs`` parameter to :meth:`DataFrame.read_excel` (:issue:`52214`)
- Implemented ``__from_arrow__`` on :class:`DatetimeTZDtype` (:issue:`52201`)
- Implemented ``__pandas_priority__`` to allow custom types to take precedence over :class:`DataFrame`, :class:`Series`, :class:`Index`, or :class:`ExtensionArray` for arithmetic operations, :ref:`see the developer guide <extending.pandas_priority>` (:issue:`48347`)
- Improve error message when having incompatible columns using :meth:`DataFrame.merge` (:issue:`51861`)
- Improve error message when setting :class:`DataFrame` with wrong number of columns through :meth:`DataFrame.isetitem` (:issue:`51701`)
Expand Down
4 changes: 3 additions & 1 deletion pandas/core/arrays/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -2344,7 +2344,9 @@ def _validate_dt64_dtype(dtype):
# a tz-aware Timestamp (with a tz specific to its datetime) will
# be incorrect(ish?) for the array as a whole
dtype = cast(DatetimeTZDtype, dtype)
dtype = DatetimeTZDtype(tz=timezones.tz_standardize(dtype.tz))
dtype = DatetimeTZDtype(
unit=dtype.unit, tz=timezones.tz_standardize(dtype.tz)
)

return dtype

Expand Down
34 changes: 34 additions & 0 deletions pandas/core/dtypes/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -817,6 +817,40 @@ def __eq__(self, other: Any) -> bool:
and tz_compare(self.tz, other.tz)
)

def __from_arrow__(
    self, array: pyarrow.Array | pyarrow.ChunkedArray
) -> DatetimeArray:
    """
    Build a DatetimeArray of this dtype from a pyarrow Array/ChunkedArray.

    The input is safely cast to a pyarrow timestamp type with this dtype's
    unit, converted to a NumPy array, and handed to ``DatetimeArray``
    together with this dtype and ``copy=False``.

    Note: If the units in the pyarrow Array are the same as this
    DatetimeTZDtype, then values corresponding to the integer
    representation of ``NaT`` (e.g. one nanosecond before
    :attr:`pandas.Timestamp.min`) are converted to ``NaT``, regardless of
    the null indicator in the pyarrow array.

    Parameters
    ----------
    array : pyarrow.Array or pyarrow.ChunkedArray
        The Arrow data to convert.

    Returns
    -------
    DatetimeArray
        The converted values, carrying this dtype.
    """
    import pyarrow

    from pandas.core.arrays import DatetimeArray

    # The cast target only fixes the unit; the timezone comes from passing
    # ``dtype=self`` to the DatetimeArray constructor below.
    converted = array.cast(pyarrow.timestamp(unit=self._unit), safe=True)

    # Only a plain Array is asked explicitly for a possibly-copying
    # conversion; anything else (i.e. a ChunkedArray) uses the defaults.
    values = (
        converted.to_numpy(zero_copy_only=False)
        if isinstance(converted, pyarrow.Array)
        else converted.to_numpy()
    )

    return DatetimeArray(values, dtype=self, copy=False)

def __setstate__(self, state) -> None:
# for pickle compat. __get_state__ is defined in the
# PandasExtensionDtype superclass and uses the public properties to
Expand Down
86 changes: 86 additions & 0 deletions pandas/tests/arrays/datetimes/test_constructors.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import numpy as np
import pytest

from pandas._libs import iNaT

from pandas.core.dtypes.dtypes import DatetimeTZDtype

import pandas as pd
Expand Down Expand Up @@ -168,3 +170,87 @@ def test_2d(self, order):
res = DatetimeArray._from_sequence(arr)
expected = DatetimeArray._from_sequence(arr.ravel()).reshape(arr.shape)
tm.assert_datetime_array_equal(res, expected)


# ----------------------------------------------------------------------------
# Arrow interaction


# Edge-case inputs for the same-unit parametrizations below: zero, an
# arbitrary positive value, a null, iNaT (the integer stand-in for NaT),
# the int64 maximum and -(2**63) + 1.
EXTREME_VALUES = [0, 123456789, None, iNaT, 2**63 - 1, -(2**63) + 1]
# Whole multiples of 10**9, used below when the Arrow unit is finer than the
# pandas unit; presumably chosen so the safe cast drops no precision.
FINE_TO_COARSE_SAFE = [123_000_000_000, None, -123_000_000_000]
# Small magnitudes, used below when the Arrow unit is coarser than the pandas
# unit; presumably chosen so scaling up stays well inside int64.
COARSE_TO_FINE_SAFE = [123, None, -123]


@pytest.mark.parametrize(
    ("pa_unit", "pd_unit", "pa_tz", "pd_tz", "data"),
    [
        ("s", "s", "UTC", "UTC", EXTREME_VALUES),
        ("ms", "ms", "UTC", "Europe/Berlin", EXTREME_VALUES),
        ("us", "us", "US/Eastern", "UTC", EXTREME_VALUES),
        ("ns", "ns", "US/Central", "Asia/Kolkata", EXTREME_VALUES),
        ("ns", "s", "UTC", "UTC", FINE_TO_COARSE_SAFE),
        ("us", "ms", "UTC", "Europe/Berlin", FINE_TO_COARSE_SAFE),
        ("ms", "us", "US/Eastern", "UTC", COARSE_TO_FINE_SAFE),
        ("s", "ns", "US/Central", "Asia/Kolkata", COARSE_TO_FINE_SAFE),
    ],
)
def test_from_arrowtest_from_arrow_with_different_units_and_timezones_with_(
    pa_unit, pd_unit, pa_tz, pd_tz, data
):
    # NOTE(review): the function name looks like an accidental paste
    # duplication; it is left unchanged here.
    #
    # Converting tz-aware Arrow timestamps must give the same result as
    # building the DatetimeArray from the equivalent NumPy datetimes, for
    # both a plain Array and a single-chunk ChunkedArray.
    pa = pytest.importorskip("pyarrow")

    arrow_values = pa.array(data, type=pa.timestamp(pa_unit, tz=pa_tz))
    dtype = DatetimeTZDtype(unit=pd_unit, tz=pd_tz)

    # Expected values: interpret the raw integers in the Arrow unit, then
    # convert to the pandas unit.
    in_arrow_unit = np.array(data, dtype=f"datetime64[{pa_unit}]")
    expected = DatetimeArray(
        in_arrow_unit.astype(f"datetime64[{pd_unit}]"),
        dtype=dtype,
    )

    for arrow_input in (arrow_values, pa.chunked_array([arrow_values])):
        result = dtype.__from_arrow__(arrow_input)
        tm.assert_extension_array_equal(result, expected)


@pytest.mark.parametrize(
    ("unit", "tz"),
    [
        ("s", "UTC"),
        ("ms", "Europe/Berlin"),
        ("us", "US/Eastern"),
        ("ns", "Asia/Kolkata"),
        ("ns", "UTC"),
    ],
)
def test_from_arrow_from_empty(unit, tz):
    # An Arrow array built from an empty list (no explicit type) must convert
    # to an empty, tz-aware DatetimeArray of the requested unit.
    pa = pytest.importorskip("pyarrow")

    data = []
    empty_arrow = pa.array(data)
    dtype = DatetimeTZDtype(unit=unit, tz=tz)

    naive_expected = DatetimeArray(np.array(data, dtype=f"datetime64[{unit}]"))
    expected = naive_expected.tz_localize(tz=tz)

    # Exercise both supported input kinds: Array and ChunkedArray.
    for arrow_input in (empty_arrow, pa.chunked_array([empty_arrow])):
        result = dtype.__from_arrow__(arrow_input)
        tm.assert_extension_array_equal(result, expected)


def test_from_arrow_from_integers():
    # An Arrow array built from plain Python integers (no explicit type)
    # must convert as nanoseconds-since-epoch UTC values; None and iNaT are
    # both present in the input.
    pa = pytest.importorskip("pyarrow")

    data = [0, 123456789, None, 2**63 - 1, iNaT, -123456789]
    dtype = DatetimeTZDtype(unit="ns", tz="UTC")
    int_arrow = pa.array(data)

    naive_expected = DatetimeArray(np.array(data, dtype="datetime64[ns]"))
    expected = naive_expected.tz_localize("UTC")

    # Exercise both supported input kinds: Array and ChunkedArray.
    for arrow_input in (int_arrow, pa.chunked_array([int_arrow])):
        result = dtype.__from_arrow__(arrow_input)
        tm.assert_extension_array_equal(result, expected)