Skip to content

Commit

Permalink
apacheGH-33321: [Python] Support converting to non-nano datetime64 for pandas >= 2.0 (apache#35656)
Browse files Browse the repository at this point in the history

Do not coerce temporal types to nanosecond when pandas >= 2.0 is imported, since pandas now supports s/ms/us time units.

This PR adds support for the following Arrow -> Pandas conversions, which previously all defaulted to `datetime64[ns]` or `datetime64[ns, <TZ>]`:
```
date32 -> datetime64[ms]
date64 -> datetime64[ms]
datetime64[s] -> datetime64[s]
datetime64[ms] -> datetime64[ms]
datetime64[us] -> datetime64[us]
datetime64[s, <TZ>] -> datetime64[s, <TZ>]
datetime64[ms, <TZ>] -> datetime64[ms, <TZ>]
datetime64[us, <TZ>] -> datetime64[us, <TZ>]
```
### Rationale for this change

Pandas 2.0 introduces proper support for temporal types.

### Are these changes tested?

Yes. Pytests added and updated.

### Are there any user-facing changes?

Yes, arrow-to-pandas default conversion behavior will change when users have pandas >= 2.0, but a legacy option is exposed to provide backwards compatibility.
* Closes: apache#33321

Lead-authored-by: Dane Pitkin <dane@voltrondata.com>
Co-authored-by: Dane Pitkin <48041712+danepitkin@users.noreply.github.com>
Co-authored-by: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Signed-off-by: Joris Van den Bossche <jorisvandenbossche@gmail.com>
  • Loading branch information
danepitkin and jorisvandenbossche authored Jul 7, 2023
1 parent f8256bd commit 4f56aba
Show file tree
Hide file tree
Showing 17 changed files with 416 additions and 175 deletions.
31 changes: 23 additions & 8 deletions python/pyarrow/array.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -700,7 +700,8 @@ cdef class _PandasConvertible(_Weakrefable):
bint split_blocks=False,
bint self_destruct=False,
str maps_as_pydicts=None,
types_mapper=None
types_mapper=None,
bint coerce_temporal_nanoseconds=False
):
"""
Convert to a pandas-compatible NumPy array or DataFrame, as appropriate
Expand All @@ -721,12 +722,15 @@ cdef class _PandasConvertible(_Weakrefable):
integer_object_nulls : bool, default False
Cast integers with nulls to objects
date_as_object : bool, default True
Cast dates to objects. If False, convert to datetime64[ns] dtype.
Cast dates to objects. If False, convert to datetime64 dtype with
the equivalent time unit (if supported). Note: in pandas version
< 2.0, only datetime64[ns] conversion is supported.
timestamp_as_object : bool, default False
Cast non-nanosecond timestamps (np.datetime64) to objects. This is
useful if you have timestamps that don't fit in the normal date
range of nanosecond timestamps (1678 CE-2262 CE).
If False, all timestamps are converted to datetime64[ns] dtype.
useful in pandas version 1.x if you have timestamps that don't fit
in the normal date range of nanosecond timestamps (1678 CE-2262 CE).
Non-nanosecond timestamps are supported in pandas version 2.0.
If False, all timestamps are converted to datetime64 dtype.
use_threads : bool, default True
Whether to parallelize the conversion using multiple threads.
deduplicate_objects : bool, default True
Expand Down Expand Up @@ -775,6 +779,13 @@ cdef class _PandasConvertible(_Weakrefable):
expected to return a pandas ExtensionDtype or ``None`` if the
default conversion should be used for that type. If you have
a dictionary mapping, you can pass ``dict.get`` as function.
coerce_temporal_nanoseconds : bool, default False
Only applicable to pandas version >= 2.0.
A legacy option to coerce date32, date64, duration, and timestamp
time units to nanoseconds when converting to pandas. This is the
default behavior in pandas version 1.x. Set this option to True if
you'd like to use this coercion when using pandas version >= 2.0
for backwards compatibility (not recommended otherwise).
Returns
-------
Expand Down Expand Up @@ -850,7 +861,8 @@ cdef class _PandasConvertible(_Weakrefable):
safe=safe,
split_blocks=split_blocks,
self_destruct=self_destruct,
maps_as_pydicts=maps_as_pydicts
maps_as_pydicts=maps_as_pydicts,
coerce_temporal_nanoseconds=coerce_temporal_nanoseconds
)
return self._to_pandas(options, categories=categories,
ignore_metadata=ignore_metadata,
Expand All @@ -870,6 +882,7 @@ cdef PandasOptions _convert_pandas_options(dict options):
result.safe_cast = options['safe']
result.split_blocks = options['split_blocks']
result.self_destruct = options['self_destruct']
result.coerce_temporal_nanoseconds = options['coerce_temporal_nanoseconds']
result.ignore_timezone = os.environ.get('PYARROW_IGNORE_TIMEZONE', False)

maps_as_pydicts = options['maps_as_pydicts']
Expand Down Expand Up @@ -1525,6 +1538,7 @@ cdef class Array(_PandasConvertible):
# so it can't be done if the user requested a zero_copy.
c_options.decode_dictionaries = not zero_copy_only
c_options.zero_copy_only = zero_copy_only
c_options.to_numpy = True

with nogil:
check_status(ConvertArrayToPandas(c_options, self.sp_array,
Expand Down Expand Up @@ -1689,8 +1703,9 @@ cdef _array_like_to_pandas(obj, options, types_mapper):
arr = dtype.__from_arrow__(obj)
return pandas_api.series(arr, name=name, copy=False)

# ARROW-3789(wesm): Convert date/timestamp types to datetime64[ns]
c_options.coerce_temporal_nanoseconds = True
if pandas_api.is_v1():
# ARROW-3789: Coerce date/timestamp types to datetime64[ns]
c_options.coerce_temporal_nanoseconds = True

if isinstance(obj, Array):
with nogil:
Expand Down
1 change: 1 addition & 0 deletions python/pyarrow/includes/libarrow_python.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,7 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil:
c_bool decode_dictionaries
unordered_set[c_string] categorical_columns
unordered_set[c_string] extension_columns
c_bool to_numpy

cdef cppclass CSerializedPyObject" arrow::py::SerializedPyObject":
shared_ptr[CRecordBatch] batch
Expand Down
8 changes: 8 additions & 0 deletions python/pyarrow/pandas-shim.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ cdef class _PandasAPIShim(object):
object _array_like_types, _is_extension_array_dtype
bint has_sparse
bint _pd024
bint _is_v1

def __init__(self):
self._tried_importing_pandas = False
Expand All @@ -58,6 +59,7 @@ cdef class _PandasAPIShim(object):
self._pd = pd
self._version = pd.__version__
self._loose_version = Version(pd.__version__)
self._is_v1 = False

if self._loose_version < Version('1.0.0'):
self._have_pandas = False
Expand All @@ -72,6 +74,8 @@ cdef class _PandasAPIShim(object):
"installed. Therefore, pandas-specific integration is not "
"used.".format(self._version), stacklevel=2)
return
elif self._loose_version < Version('2.0.0'):
self._is_v1 = True

self._compat_module = pdcompat
self._data_frame = pd.DataFrame
Expand Down Expand Up @@ -150,6 +154,10 @@ cdef class _PandasAPIShim(object):
self._check_import()
return self._version

def is_v1(self):
self._check_import()
return self._is_v1

@property
def categorical_type(self):
self._check_import()
Expand Down
9 changes: 6 additions & 3 deletions python/pyarrow/pandas_compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -714,7 +714,8 @@ def _reconstruct_block(item, columns=None, extension_columns=None):
ordered=item['ordered'])
block = _int.make_block(cat, placement=placement)
elif 'timezone' in item:
dtype = make_datetimetz(item['timezone'])
unit, _ = np.datetime_data(block_arr.dtype)
dtype = make_datetimetz(unit, item['timezone'])
block = _int.make_block(block_arr, placement=placement,
klass=_int.DatetimeTZBlock,
dtype=dtype)
Expand All @@ -738,9 +739,11 @@ def _reconstruct_block(item, columns=None, extension_columns=None):
return block


def make_datetimetz(tz):
def make_datetimetz(unit, tz):
if _pandas_api.is_v1():
unit = 'ns' # ARROW-3789: Coerce date/timestamp types to datetime64[ns]
tz = pa.lib.string_to_tzinfo(tz)
return _pandas_api.datetimetz_type('ns', tz=tz)
return _pandas_api.datetimetz_type(unit, tz=tz)


def table_to_blockmanager(options, table, categories=None,
Expand Down
Loading

0 comments on commit 4f56aba

Please sign in to comment.