Skip to content

Commit

Permalink
apacheGH-33321: [Python] Support converting to non-nano datetime64 for pandas >= 2.0 (apache#35656)
Browse files Browse the repository at this point in the history

Do not coerce temporal types to nanosecond when pandas >= 2.0 is imported, since pandas now supports s/ms/us time units.

This PR adds support for the following Arrow -> Pandas conversions, which previously all defaulted to `datetime64[ns]` or `datetime64[ns, <TZ>]`:
```
date32 -> datetime64[ms]
date64 -> datetime64[ms]
datetime64[s] -> datetime64[s]
datetime64[ms] -> datetime64[ms]
datetime64[us] -> datetime64[us]
datetime64[s, <TZ>] -> datetime64[s, <TZ>]
datetime64[ms, <TZ>] -> datetime64[ms, <TZ>]
datetime64[us, <TZ>] -> datetime64[us, <TZ>]
```
### Rationale for this change

Pandas 2.0 introduces proper support for temporal types.

### Are these changes tested?

Yes. Pytests added and updated.

### Are there any user-facing changes?

Yes, arrow-to-pandas default conversion behavior will change when users have pandas >= 2.0, but a legacy option is exposed to provide backwards compatibility.
* Closes: apache#33321

Lead-authored-by: Dane Pitkin <dane@voltrondata.com>
Co-authored-by: Dane Pitkin <48041712+danepitkin@users.noreply.github.com>
Co-authored-by: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Signed-off-by: Joris Van den Bossche <jorisvandenbossche@gmail.com>
  • Loading branch information
danepitkin and jorisvandenbossche authored Jul 7, 2023
1 parent f8256bd commit 4f56aba
Show file tree
Hide file tree
Showing 17 changed files with 416 additions and 175 deletions.
31 changes: 23 additions & 8 deletions python/pyarrow/array.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -700,7 +700,8 @@ cdef class _PandasConvertible(_Weakrefable):
bint split_blocks=False,
bint self_destruct=False,
str maps_as_pydicts=None,
types_mapper=None
types_mapper=None,
bint coerce_temporal_nanoseconds=False
):
"""
Convert to a pandas-compatible NumPy array or DataFrame, as appropriate
Expand All @@ -721,12 +722,15 @@ cdef class _PandasConvertible(_Weakrefable):
integer_object_nulls : bool, default False
Cast integers with nulls to objects
date_as_object : bool, default True
Cast dates to objects. If False, convert to datetime64[ns] dtype.
Cast dates to objects. If False, convert to datetime64 dtype with
the equivalent time unit (if supported). Note: in pandas version
< 2.0, only datetime64[ns] conversion is supported.
timestamp_as_object : bool, default False
Cast non-nanosecond timestamps (np.datetime64) to objects. This is
useful if you have timestamps that don't fit in the normal date
range of nanosecond timestamps (1678 CE-2262 CE).
If False, all timestamps are converted to datetime64[ns] dtype.
useful in pandas version 1.x if you have timestamps that don't fit
in the normal date range of nanosecond timestamps (1678 CE-2262 CE).
Non-nanosecond timestamps are supported in pandas version 2.0.
If False, all timestamps are converted to datetime64 dtype.
use_threads : bool, default True
Whether to parallelize the conversion using multiple threads.
deduplicate_objects : bool, default True
Expand Down Expand Up @@ -775,6 +779,13 @@ cdef class _PandasConvertible(_Weakrefable):
expected to return a pandas ExtensionDtype or ``None`` if the
default conversion should be used for that type. If you have
a dictionary mapping, you can pass ``dict.get`` as function.
coerce_temporal_nanoseconds : bool, default False
Only applicable to pandas version >= 2.0.
A legacy option to coerce date32, date64, duration, and timestamp
time units to nanoseconds when converting to pandas. This is the
default behavior in pandas version 1.x. Set this option to True if
you'd like to use this coercion when using pandas version >= 2.0
for backwards compatibility (not recommended otherwise).
Returns
-------
Expand Down Expand Up @@ -850,7 +861,8 @@ cdef class _PandasConvertible(_Weakrefable):
safe=safe,
split_blocks=split_blocks,
self_destruct=self_destruct,
maps_as_pydicts=maps_as_pydicts
maps_as_pydicts=maps_as_pydicts,
coerce_temporal_nanoseconds=coerce_temporal_nanoseconds
)
return self._to_pandas(options, categories=categories,
ignore_metadata=ignore_metadata,
Expand All @@ -870,6 +882,7 @@ cdef PandasOptions _convert_pandas_options(dict options):
result.safe_cast = options['safe']
result.split_blocks = options['split_blocks']
result.self_destruct = options['self_destruct']
result.coerce_temporal_nanoseconds = options['coerce_temporal_nanoseconds']
result.ignore_timezone = os.environ.get('PYARROW_IGNORE_TIMEZONE', False)

maps_as_pydicts = options['maps_as_pydicts']
Expand Down Expand Up @@ -1525,6 +1538,7 @@ cdef class Array(_PandasConvertible):
# so it can't be done if the user requested a zero_copy.
c_options.decode_dictionaries = not zero_copy_only
c_options.zero_copy_only = zero_copy_only
c_options.to_numpy = True

with nogil:
check_status(ConvertArrayToPandas(c_options, self.sp_array,
Expand Down Expand Up @@ -1689,8 +1703,9 @@ cdef _array_like_to_pandas(obj, options, types_mapper):
arr = dtype.__from_arrow__(obj)
return pandas_api.series(arr, name=name, copy=False)

# ARROW-3789(wesm): Convert date/timestamp types to datetime64[ns]
c_options.coerce_temporal_nanoseconds = True
if pandas_api.is_v1():
# ARROW-3789: Coerce date/timestamp types to datetime64[ns]
c_options.coerce_temporal_nanoseconds = True

if isinstance(obj, Array):
with nogil:
Expand Down
1 change: 1 addition & 0 deletions python/pyarrow/includes/libarrow_python.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,7 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil:
c_bool decode_dictionaries
unordered_set[c_string] categorical_columns
unordered_set[c_string] extension_columns
c_bool to_numpy

cdef cppclass CSerializedPyObject" arrow::py::SerializedPyObject":
shared_ptr[CRecordBatch] batch
Expand Down
8 changes: 8 additions & 0 deletions python/pyarrow/pandas-shim.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ cdef class _PandasAPIShim(object):
object _array_like_types, _is_extension_array_dtype
bint has_sparse
bint _pd024
bint _is_v1

def __init__(self):
self._tried_importing_pandas = False
Expand All @@ -58,6 +59,7 @@ cdef class _PandasAPIShim(object):
self._pd = pd
self._version = pd.__version__
self._loose_version = Version(pd.__version__)
self._is_v1 = False

if self._loose_version < Version('1.0.0'):
self._have_pandas = False
Expand All @@ -72,6 +74,8 @@ cdef class _PandasAPIShim(object):
"installed. Therefore, pandas-specific integration is not "
"used.".format(self._version), stacklevel=2)
return
elif self._loose_version < Version('2.0.0'):
self._is_v1 = True

self._compat_module = pdcompat
self._data_frame = pd.DataFrame
Expand Down Expand Up @@ -150,6 +154,10 @@ cdef class _PandasAPIShim(object):
self._check_import()
return self._version

def is_v1(self):
self._check_import()
return self._is_v1

@property
def categorical_type(self):
self._check_import()
Expand Down
9 changes: 6 additions & 3 deletions python/pyarrow/pandas_compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -714,7 +714,8 @@ def _reconstruct_block(item, columns=None, extension_columns=None):
ordered=item['ordered'])
block = _int.make_block(cat, placement=placement)
elif 'timezone' in item:
dtype = make_datetimetz(item['timezone'])
unit, _ = np.datetime_data(block_arr.dtype)
dtype = make_datetimetz(unit, item['timezone'])
block = _int.make_block(block_arr, placement=placement,
klass=_int.DatetimeTZBlock,
dtype=dtype)
Expand All @@ -738,9 +739,11 @@ def _reconstruct_block(item, columns=None, extension_columns=None):
return block


def make_datetimetz(tz):
def make_datetimetz(unit, tz):
if _pandas_api.is_v1():
unit = 'ns' # ARROW-3789: Coerce date/timestamp types to datetime64[ns]
tz = pa.lib.string_to_tzinfo(tz)
return _pandas_api.datetimetz_type('ns', tz=tz)
return _pandas_api.datetimetz_type(unit, tz=tz)


def table_to_blockmanager(options, table, categories=None,
Expand Down
Loading

0 comments on commit 4f56aba

Please sign in to comment.