Skip to content

Commit

Permalink
BUG: Timedelta(td64_out_of_bounds) silently overflowing (#38965)
Browse files Browse the repository at this point in the history
  • Loading branch information
jbrockmendel authored Jan 6, 2021
1 parent 0f2db73 commit 8cf3771
Show file tree
Hide file tree
Showing 7 changed files with 143 additions and 28 deletions.
4 changes: 2 additions & 2 deletions doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -198,14 +198,14 @@ Datetimelike
- Bug in :class:`DataFrame` and :class:`Series` constructors sometimes dropping nanoseconds from :class:`Timestamp` (resp. :class:`Timedelta`) ``data``, with ``dtype=datetime64[ns]`` (resp. ``timedelta64[ns]``) (:issue:`38032`)
- Bug in :meth:`DataFrame.first` and :meth:`Series.first` returning two months for offset one month when first day is last calendar day (:issue:`29623`)
- Bug in constructing a :class:`DataFrame` or :class:`Series` with mismatched ``datetime64`` data and ``timedelta64`` dtype, or vice-versa, failing to raise ``TypeError`` (:issue:`38575`, :issue:`38764`, :issue:`38792`)
- Bug in constructing a :class:`Series` or :class:`DataFrame` with a ``datetime`` object out of bounds for ``datetime64[ns]`` dtype (:issue:`38792`)
- Bug in constructing a :class:`Series` or :class:`DataFrame` with a ``datetime`` object out of bounds for ``datetime64[ns]`` dtype or a ``timedelta`` object ouf of bounds for ``timedelta64[ns]`` dtype (:issue:`38792`, :issue:`38965`)
- Bug in :meth:`DatetimeIndex.intersection`, :meth:`DatetimeIndex.symmetric_difference`, :meth:`PeriodIndex.intersection`, :meth:`PeriodIndex.symmetric_difference` always returning object-dtype when operating with :class:`CategoricalIndex` (:issue:`38741`)
- Bug in :meth:`Series.where` incorrectly casting ``datetime64`` values to ``int64`` (:issue:`37682`)
-

Timedelta
^^^^^^^^^

- Bug in constructing :class:`Timedelta` from ``np.timedelta64`` objects with non-nanosecond units that are out of bounds for ``timedelta64[ns]`` (:issue:`38965`)
-
-

Expand Down
1 change: 1 addition & 0 deletions pandas/_libs/tslibs/np_datetime.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ cdef extern from "numpy/ndarraytypes.h":
NPY_FR_ps
NPY_FR_fs
NPY_FR_as
NPY_FR_GENERIC

cdef extern from "src/datetime/np_datetime.h":
ctypedef struct pandas_timedeltastruct:
Expand Down
89 changes: 79 additions & 10 deletions pandas/_libs/tslibs/timedeltas.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -24,15 +24,18 @@ PyDateTime_IMPORT

cimport pandas._libs.tslibs.util as util
from pandas._libs.tslibs.base cimport ABCTimestamp
from pandas._libs.tslibs.conversion cimport cast_from_unit
from pandas._libs.tslibs.conversion cimport cast_from_unit, precision_from_unit
from pandas._libs.tslibs.nattype cimport (
NPY_NAT,
c_NaT as NaT,
c_nat_strings as nat_strings,
checknull_with_nat,
)
from pandas._libs.tslibs.np_datetime cimport (
NPY_DATETIMEUNIT,
cmp_scalar,
get_datetime64_unit,
get_timedelta64_value,
pandas_timedeltastruct,
td64_to_tdstruct,
)
Expand Down Expand Up @@ -156,7 +159,7 @@ cpdef int64_t delta_to_nanoseconds(delta) except? -1:
if isinstance(delta, _Timedelta):
delta = delta.value
if is_timedelta64_object(delta):
return delta.astype("timedelta64[ns]").item()
return get_timedelta64_value(ensure_td64ns(delta))
if is_integer_object(delta):
return delta
if PyDelta_Check(delta):
Expand All @@ -169,6 +172,72 @@ cpdef int64_t delta_to_nanoseconds(delta) except? -1:
raise TypeError(type(delta))


cdef str npy_unit_to_abbrev(NPY_DATETIMEUNIT unit):
if unit == NPY_DATETIMEUNIT.NPY_FR_ns or unit == NPY_DATETIMEUNIT.NPY_FR_GENERIC:
# generic -> default to nanoseconds
return "ns"
elif unit == NPY_DATETIMEUNIT.NPY_FR_us:
return "us"
elif unit == NPY_DATETIMEUNIT.NPY_FR_ms:
return "ms"
elif unit == NPY_DATETIMEUNIT.NPY_FR_s:
return "s"
elif unit == NPY_DATETIMEUNIT.NPY_FR_m:
return "m"
elif unit == NPY_DATETIMEUNIT.NPY_FR_h:
return "h"
elif unit == NPY_DATETIMEUNIT.NPY_FR_D:
return "D"
elif unit == NPY_DATETIMEUNIT.NPY_FR_W:
return "W"
elif unit == NPY_DATETIMEUNIT.NPY_FR_M:
return "M"
elif unit == NPY_DATETIMEUNIT.NPY_FR_Y:
return "Y"
else:
raise NotImplementedError(unit)


@cython.overflowcheck(True)
cdef object ensure_td64ns(object ts):
"""
Overflow-safe implementation of td64.astype("m8[ns]")
Parameters
----------
ts : np.timedelta64
Returns
-------
np.timedelta64[ns]
"""
cdef:
NPY_DATETIMEUNIT td64_unit
int64_t td64_value, mult
str unitstr

td64_unit = get_datetime64_unit(ts)
if (
td64_unit != NPY_DATETIMEUNIT.NPY_FR_ns
and td64_unit != NPY_DATETIMEUNIT.NPY_FR_GENERIC
):
unitstr = npy_unit_to_abbrev(td64_unit)

td64_value = get_timedelta64_value(ts)

mult = precision_from_unit(unitstr)[0]
try:
# NB: cython#1381 this cannot be *=
td64_value = td64_value * mult
except OverflowError as err:
from pandas._libs.tslibs.conversion import OutOfBoundsTimedelta
raise OutOfBoundsTimedelta(ts)

return np.timedelta64(td64_value, "ns")

return ts


cdef convert_to_timedelta64(object ts, str unit):
"""
Convert an incoming object to a timedelta64 if possible.
Expand All @@ -184,37 +253,37 @@ cdef convert_to_timedelta64(object ts, str unit):
Return an ns based int64
"""
if checknull_with_nat(ts):
return np.timedelta64(NPY_NAT)
return np.timedelta64(NPY_NAT, "ns")
elif isinstance(ts, _Timedelta):
# already in the proper format
ts = np.timedelta64(ts.value)
ts = np.timedelta64(ts.value, "ns")
elif is_datetime64_object(ts):
# only accept a NaT here
if ts.astype('int64') == NPY_NAT:
return np.timedelta64(NPY_NAT)
elif is_timedelta64_object(ts):
ts = ts.astype(f"m8[{unit.lower()}]")
ts = ensure_td64ns(ts)
elif is_integer_object(ts):
if ts == NPY_NAT:
return np.timedelta64(NPY_NAT)
return np.timedelta64(NPY_NAT, "ns")
else:
if unit in ['Y', 'M', 'W']:
ts = np.timedelta64(ts, unit)
else:
ts = cast_from_unit(ts, unit)
ts = np.timedelta64(ts)
ts = np.timedelta64(ts, "ns")
elif is_float_object(ts):
if unit in ['Y', 'M', 'W']:
ts = np.timedelta64(int(ts), unit)
else:
ts = cast_from_unit(ts, unit)
ts = np.timedelta64(ts)
ts = np.timedelta64(ts, "ns")
elif isinstance(ts, str):
if len(ts) > 0 and ts[0] == 'P':
ts = parse_iso_format_string(ts)
else:
ts = parse_timedelta_string(ts)
ts = np.timedelta64(ts)
ts = np.timedelta64(ts, "ns")
elif is_tick_object(ts):
ts = np.timedelta64(ts.nanos, 'ns')

Expand Down Expand Up @@ -1196,7 +1265,7 @@ class Timedelta(_Timedelta):
elif is_timedelta64_object(value):
if unit is not None:
value = value.astype(f'timedelta64[{unit}]')
value = value.astype('timedelta64[ns]')
value = ensure_td64ns(value)
elif is_tick_object(value):
value = np.timedelta64(value.nanos, 'ns')
elif is_integer_object(value) or is_float_object(value):
Expand Down
20 changes: 16 additions & 4 deletions pandas/core/dtypes/cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
from pandas._libs.tslibs import (
NaT,
OutOfBoundsDatetime,
OutOfBoundsTimedelta,
Period,
Timedelta,
Timestamp,
Expand Down Expand Up @@ -743,8 +744,12 @@ def infer_dtype_from_scalar(val, pandas_dtype: bool = False) -> Tuple[DtypeObj,
val = val.value

elif isinstance(val, (np.timedelta64, timedelta)):
val = Timedelta(val).value
dtype = np.dtype("m8[ns]")
try:
val = Timedelta(val).value
except (OutOfBoundsTimedelta, OverflowError):
dtype = np.dtype(object)
else:
dtype = np.dtype("m8[ns]")

elif is_bool(val):
dtype = np.dtype(np.bool_)
Expand Down Expand Up @@ -1386,7 +1391,7 @@ def try_timedelta(v):

try:
td_values = to_timedelta(v)
except ValueError:
except (ValueError, OverflowError):
return v.reshape(shape)
else:
return np.asarray(td_values).reshape(shape)
Expand Down Expand Up @@ -1618,8 +1623,16 @@ def construct_2d_arraylike_from_scalar(
value: Scalar, length: int, width: int, dtype: np.dtype, copy: bool
) -> np.ndarray:

shape = (length, width)

if dtype.kind in ["m", "M"]:
value = maybe_unbox_datetimelike(value, dtype)
elif dtype == object:
if isinstance(value, (np.timedelta64, np.datetime64)):
# calling np.array below would cast to pytimedelta/pydatetime
out = np.empty(shape, dtype=object)
out.fill(value)
return out

# Attempt to coerce to a numpy array
try:
Expand All @@ -1632,7 +1645,6 @@ def construct_2d_arraylike_from_scalar(
if arr.ndim != 0:
raise ValueError("DataFrame constructor not properly called!")

shape = (length, width)
return np.full(shape, arr)


Expand Down
28 changes: 17 additions & 11 deletions pandas/tests/frame/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
import pytest
import pytz

from pandas.compat.numpy import _np_version_under1p19, _np_version_under1p20
from pandas.compat.numpy import _np_version_under1p19

from pandas.core.dtypes.common import is_integer_dtype
from pandas.core.dtypes.dtypes import DatetimeTZDtype, IntervalDtype, PeriodDtype
Expand Down Expand Up @@ -2371,16 +2371,10 @@ def test_from_timedelta_scalar_preserves_nanos(self, constructor):
def test_from_timestamp_scalar_preserves_nanos(self, constructor):
ts = Timestamp.now() + Timedelta(1)

obj = Series(ts, index=range(1), dtype="M8[ns]")
obj = constructor(ts, dtype="M8[ns]")
assert get1(obj) == ts

def test_from_timedelta64_scalar_object(self, constructor, request):
if getattr(constructor, "func", None) is DataFrame and _np_version_under1p20:
# getattr check means we only xfail when box is None
mark = pytest.mark.xfail(
reason="np.array(td64, dtype=object) converts to int"
)
request.node.add_marker(mark)
def test_from_timedelta64_scalar_object(self, constructor):

td = Timedelta(1)
td64 = td.to_timedelta64()
Expand All @@ -2407,8 +2401,20 @@ def test_from_scalar_datetimelike_mismatched(self, constructor, cls, request):
with pytest.raises(TypeError, match="Cannot cast"):
constructor(scalar, dtype=dtype)

def test_from_out_of_bounds_datetime(self, constructor):
@pytest.mark.parametrize("cls", [datetime, np.datetime64])
def test_from_out_of_bounds_datetime(self, constructor, cls):
scalar = datetime(9999, 1, 1)
if cls is np.datetime64:
scalar = np.datetime64(scalar, "D")
result = constructor(scalar)

assert type(get1(result)) is cls

@pytest.mark.parametrize("cls", [timedelta, np.timedelta64])
def test_from_out_of_bounds_timedelta(self, constructor, cls):
scalar = datetime(9999, 1, 1) - datetime(1970, 1, 1)
if cls is np.timedelta64:
scalar = np.timedelta64(scalar, "D")
result = constructor(scalar)

assert type(get1(result)) is datetime
assert type(get1(result)) is cls
27 changes: 27 additions & 0 deletions pandas/tests/scalar/timedelta/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
import numpy as np
import pytest

from pandas._libs.tslibs import OutOfBoundsTimedelta

from pandas import Timedelta, offsets, to_timedelta


Expand Down Expand Up @@ -197,6 +199,31 @@ def test_overflow_on_construction():
Timedelta(timedelta(days=13 * 19999))


def test_construction_out_of_bounds_td64():
# TODO: parametrize over units just above/below the implementation bounds
# once GH#38964 is resolved

# Timedelta.max is just under 106752 days
td64 = np.timedelta64(106752, "D")
assert td64.astype("m8[ns]").view("i8") < 0 # i.e. naive astype will be wrong

msg = "106752 days"
with pytest.raises(OutOfBoundsTimedelta, match=msg):
Timedelta(td64)

# But just back in bounds and we are OK
assert Timedelta(td64 - 1) == td64 - 1

td64 *= -1
assert td64.astype("m8[ns]").view("i8") > 0 # i.e. naive astype will be wrong

with pytest.raises(OutOfBoundsTimedelta, match=msg):
Timedelta(td64)

# But just back in bounds and we are OK
assert Timedelta(td64 + 1) == td64 + 1


@pytest.mark.parametrize(
"fmt,exp",
[
Expand Down
2 changes: 1 addition & 1 deletion pandas/util/_exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ def rewrite_exception(old_name: str, new_name: str):
try:
yield
except Exception as err:
msg = err.args[0]
msg = str(err.args[0])
msg = msg.replace(old_name, new_name)
args: Tuple[str, ...] = (msg,)
if len(err.args) > 1:
Expand Down

0 comments on commit 8cf3771

Please sign in to comment.