ENH: DTI/DTA.astype support non-nano (#47579)

* ENH: DTI/DTA.astype support non-nano * whatsnew * GH ref * pyright fixup
pandas-dev · Jul 5, 2022 · 67e8c4c · 67e8c4c
1 parent 700ef33
commit 67e8c4c
Show file tree

Hide file tree

Showing 12 changed files with 97 additions and 14 deletions.
diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
@@ -275,7 +275,9 @@ Other enhancements
 - :class:`.DataError`, :class:`.SpecificationError`, :class:`.SettingWithCopyError`, :class:`.SettingWithCopyWarning`, :class:`.NumExprClobberingError`, :class:`.UndefinedVariableError`, and :class:`.IndexingError` are now exposed in ``pandas.errors`` (:issue:`27656`)
 - Added ``check_like`` argument to :func:`testing.assert_series_equal` (:issue:`47247`)
 - Allow reading compressed SAS files with :func:`read_sas` (e.g., ``.sas7bdat.gz`` files)
+- :meth:`DatetimeIndex.astype` now supports casting timezone-naive indexes to ``datetime64[s]``, ``datetime64[ms]``, and ``datetime64[us]``, and timezone-aware indexes to the corresponding ``datetime64[unit, tzname]`` dtypes (:issue:`47579`)
 - :class:`Series` reducers (e.g. ``min``, ``max``, ``sum``, ``mean``) will now successfully operate when the dtype is numeric and ``numeric_only=True`` is provided; previously this would raise a ``NotImplementedError`` (:issue:`47500`)
+-
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_150.notable_bug_fixes:

diff --git a/pandas/_libs/tslibs/__init__.py b/pandas/_libs/tslibs/__init__.py
@@ -30,12 +30,14 @@
     "get_unit_from_dtype",
     "periods_per_day",
     "periods_per_second",
+    "is_supported_unit",
 ]
 
 from pandas._libs.tslibs import dtypes
 from pandas._libs.tslibs.conversion import localize_pydatetime
 from pandas._libs.tslibs.dtypes import (
     Resolution,
+    is_supported_unit,
     periods_per_day,
     periods_per_second,
 )

diff --git a/pandas/_libs/tslibs/dtypes.pyi b/pandas/_libs/tslibs/dtypes.pyi
@@ -7,6 +7,7 @@ _period_code_map: dict[str, int]
 
 def periods_per_day(reso: int) -> int: ...
 def periods_per_second(reso: int) -> int: ...
+def is_supported_unit(reso: int) -> bool: ...
 
 class PeriodDtypeBase:
     _dtype_code: int  # PeriodDtypeCode

diff --git a/pandas/_libs/tslibs/dtypes.pyx b/pandas/_libs/tslibs/dtypes.pyx
@@ -277,6 +277,15 @@ class NpyDatetimeUnit(Enum):
     NPY_FR_GENERIC = NPY_DATETIMEUNIT.NPY_FR_GENERIC
 
 
+def is_supported_unit(NPY_DATETIMEUNIT reso):  # True only for ns/us/ms/s
+    return (
+        reso == NPY_DATETIMEUNIT.NPY_FR_ns
+        or reso == NPY_DATETIMEUNIT.NPY_FR_us
+        or reso == NPY_DATETIMEUNIT.NPY_FR_ms
+        or reso == NPY_DATETIMEUNIT.NPY_FR_s
+    )
+
+
 cdef str npy_unit_to_abbrev(NPY_DATETIMEUNIT unit):
     if unit == NPY_DATETIMEUNIT.NPY_FR_ns or unit == NPY_DATETIMEUNIT.NPY_FR_GENERIC:
         # generic -> default to nanoseconds

diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py
@@ -31,6 +31,7 @@
     get_unit_from_dtype,
     ints_to_pydatetime,
     is_date_array_normalized,
+    is_supported_unit,
     is_unitless,
     normalize_i8_timestamps,
     timezones,
@@ -603,12 +604,26 @@ def astype(self, dtype, copy: bool = True):
                 return self.copy()
             return self
 
+        elif (
+            self.tz is None
+            and is_datetime64_dtype(dtype)
+            and not is_unitless(dtype)
+            and is_supported_unit(get_unit_from_dtype(dtype))
+        ):
+            # unit conversion e.g. datetime64[s]
+            res_values = astype_overflowsafe(self._ndarray, dtype, copy=True)
+            return type(self)._simple_new(res_values, dtype=res_values.dtype)
+            # TODO: preserve freq?
+
         elif is_datetime64_ns_dtype(dtype):
             return astype_dt64_to_dt64tz(self, dtype, copy, via_utc=False)
 
-        elif self.tz is None and is_datetime64_dtype(dtype) and dtype != self.dtype:
-            # unit conversion e.g. datetime64[s]
-            return self._ndarray.astype(dtype)
+        elif self.tz is not None and isinstance(dtype, DatetimeTZDtype):
+            # tzaware unit conversion e.g. datetime64[s, UTC]
+            np_dtype = np.dtype(dtype.str)
+            res_values = astype_overflowsafe(self._ndarray, np_dtype, copy=copy)
+            return type(self)._simple_new(res_values, dtype=dtype)
+            # TODO: preserve freq?
 
         elif is_period_dtype(dtype):
             return self.to_period(freq=dtype.freq)

diff --git a/pandas/core/dtypes/astype.py b/pandas/core/dtypes/astype.py
@@ -15,6 +15,7 @@
 import numpy as np
 
 from pandas._libs import lib
+from pandas._libs.tslibs import is_unitless
 from pandas._libs.tslibs.timedeltas import array_to_timedelta64
 from pandas._typing import (
     ArrayLike,
@@ -280,6 +281,20 @@ def astype_array_safe(
         # Ensure we don't end up with a PandasArray
         dtype = dtype.numpy_dtype
 
+    if (
+        is_datetime64_dtype(values.dtype)
+        # need to do np.dtype check instead of is_datetime64_dtype
+        #  otherwise pyright complains
+        and isinstance(dtype, np.dtype)
+        and dtype.kind == "M"
+        and not is_unitless(dtype)
+        and not is_dtype_equal(dtype, values.dtype)
+    ):
+        # unit conversion, we would re-cast to nanosecond, so this is
+        #  effectively just a copy (regardless of copy kwd)
+        # TODO(2.0): remove special-case
+        return values.copy()
+
     try:
         new_values = astype_array(values, dtype, copy=copy)
     except (ValueError, TypeError):

diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py
@@ -966,7 +966,9 @@ def is_datetime64_ns_dtype(arr_or_dtype) -> bool:
             tipo = get_dtype(arr_or_dtype.dtype)
         else:
             return False
-    return tipo == DT64NS_DTYPE or getattr(tipo, "base", None) == DT64NS_DTYPE
+    return tipo == DT64NS_DTYPE or (
+        isinstance(tipo, DatetimeTZDtype) and tipo._unit == "ns"
+    )
 
 
 def is_timedelta64_ns_dtype(arr_or_dtype) -> bool:

diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -1064,16 +1064,6 @@ def astype(self, dtype, copy: bool = True):
             # Ensure that self.astype(self.dtype) is self
             return self.copy() if copy else self
 
-        if (
-            self.dtype == np.dtype("M8[ns]")
-            and isinstance(dtype, np.dtype)
-            and dtype.kind == "M"
-            and dtype != np.dtype("M8[ns]")
-        ):
-            # For now DatetimeArray supports this by unwrapping ndarray,
-            #  but DatetimeIndex doesn't
-            raise TypeError(f"Cannot cast {type(self).__name__} to dtype")
-
         values = self._data
         if isinstance(values, ExtensionArray):
             with rewrite_exception(type(values).__name__, type(self).__name__):

diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py
@@ -48,6 +48,7 @@
 from pandas.core.dtypes.common import (
     is_datetime64_dtype,
     is_datetime64tz_dtype,
+    is_dtype_equal,
     is_scalar,
 )
 from pandas.core.dtypes.missing import is_valid_na_for_dtype
@@ -338,6 +339,18 @@ def __new__(
             if copy:
                 data = data.copy()
             return cls._simple_new(data, name=name)
+        elif (
+            isinstance(data, DatetimeArray)
+            and freq is lib.no_default
+            and tz is None
+            and is_dtype_equal(data.dtype, dtype)
+        ):
+            # Reached via Index.__new__ when we call .astype
+            # TODO(2.0): special casing can be removed once _from_sequence_not_strict
+            #  no longer chokes on non-nano
+            if copy:
+                data = data.copy()
+            return cls._simple_new(data, name=name)
 
         dtarr = DatetimeArray._from_sequence_not_strict(
             data,

diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py
@@ -207,6 +207,36 @@ def test_cmp_dt64_arraylike_tznaive(self, comparison_op):
 
 
 class TestDatetimeArray:
+    def test_astype_non_nano_tznaive(self):
+        dti = pd.date_range("2016-01-01", periods=3)
+
+        res = dti.astype("M8[s]")  # Index-level cast; used to raise TypeError
+        assert res.dtype == "M8[s]"
+
+        dta = dti._data
+        res = dta.astype("M8[s]")  # array-level cast
+        assert res.dtype == "M8[s]"
+        assert isinstance(res, pd.core.arrays.DatetimeArray)  # used to be ndarray
+
+    def test_astype_non_nano_tzaware(self):
+        dti = pd.date_range("2016-01-01", periods=3, tz="UTC")
+
+        res = dti.astype("M8[s, US/Pacific]")  # Index-level cast
+        assert res.dtype == "M8[s, US/Pacific]"
+
+        dta = dti._data
+        res = dta.astype("M8[s, US/Pacific]")  # array-level cast
+        assert res.dtype == "M8[s, US/Pacific]"
+
+        # from non-nano to non-nano, preserving reso
+        res2 = res.astype("M8[s, UTC]")  # default copy=True -> new buffer
+        assert res2.dtype == "M8[s, UTC]"
+        assert not tm.shares_memory(res2, res)
+
+        res3 = res.astype("M8[s, UTC]", copy=False)  # same reso -> no copy needed
+        assert res3.dtype == "M8[s, UTC]"
+        assert tm.shares_memory(res3, res)
+
     def test_astype_to_same(self):
         arr = DatetimeArray._from_sequence(
             ["2000"], dtype=DatetimeTZDtype(tz="US/Central")

diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py
@@ -474,6 +474,9 @@ def test_is_datetime64_ns_dtype():
         pd.DatetimeIndex([1, 2, 3], dtype=np.dtype("datetime64[ns]"))
     )
 
+    # non-nano dt64tz
+    assert not com.is_datetime64_ns_dtype(DatetimeTZDtype("us", "US/Eastern"))
+
 
 def test_is_timedelta64_ns_dtype():
     assert not com.is_timedelta64_ns_dtype(np.dtype("m8[ps]"))

diff --git a/pandas/tests/tslibs/test_api.py b/pandas/tests/tslibs/test_api.py
@@ -55,6 +55,7 @@ def test_namespace():
         "get_unit_from_dtype",
         "periods_per_day",
         "periods_per_second",
+        "is_supported_unit",
     ]
 
     expected = set(submodules + api)