Skip to content

ENH: Added stride/offset aliases in to_datetime #26631

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 9 commits into from
21 changes: 5 additions & 16 deletions pandas/_libs/tslibs/frequencies.pyx
Original file line number Diff line number Diff line change
@@ -1,20 +1,10 @@
import re

cimport numpy as cnp
cnp.import_array()

from pandas._libs.tslibs.util cimport is_integer_object

from pandas._libs.tslibs.ccalendar import MONTH_NUMBERS

# ----------------------------------------------------------------------
# Constants

# hack to handle WOM-1MON
opattern = re.compile(
r'([+\-]?\d*|[+\-]?\d*\.\d*)\s*([A-Za-z]+([\-][\dA-Za-z\-]+)?)'
)

INVALID_FREQ_ERR_MSG = "Invalid frequency: {0}"

# ---------------------------------------------------------------------
Expand Down Expand Up @@ -194,20 +184,19 @@ cpdef _base_and_stride(str freqstr):
--------
_freq_and_stride('5Min') -> 'Min', 5
"""
groups = opattern.match(freqstr)
base = freqstr.lstrip('+-. 0123456789')
stride = freqstr[:freqstr.index(base)]

if not groups:
if not base:
raise ValueError("Could not evaluate {freq}".format(freq=freqstr))

stride = groups.group(1)

# Possible for stride to be float at this point. Should it fail or floor?
# Right now it fails.
if len(stride):
stride = int(stride)
else:
stride = 1

base = groups.group(2)

return base, stride


Expand Down
74 changes: 47 additions & 27 deletions pandas/_libs/tslibs/timedeltas.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ from pandas._libs.tslibs.util cimport (

from pandas._libs.tslibs.c_timestamp cimport _Timestamp

from pandas._libs.tslibs.ccalendar import DAY_SECONDS
from pandas._libs.tslibs.frequencies import _base_and_stride

from pandas._libs.tslibs.np_datetime cimport (
cmp_scalar, reverse_ops, td64_to_tdstruct, pandas_timedeltastruct)
Expand Down Expand Up @@ -235,7 +235,7 @@ def array_to_timedelta64(object[:] values, unit='ns', errors='raise'):
else:
result[i] = parse_timedelta_string(values[i])
except (TypeError, ValueError):
unit = parse_timedelta_unit(unit)
unit, stride = _base_and_stride(unit)
for i in range(n):
try:
result[i] = convert_to_timedelta64(values[i], unit)
Expand All @@ -257,36 +257,52 @@ cpdef inline object precision_from_unit(object unit):
int64_t m
int p

if unit == 'Y':
m = 1000000000L * 31556952
p = 9
elif unit == 'M':
m = 1000000000L * 2629746
p = 9
elif unit == 'W':
m = 1000000000L * DAY_SECONDS * 7
p = 9
elif unit == 'D' or unit == 'd':
m = 1000000000L * DAY_SECONDS
if unit is None:
m = 1L
p = 0
return m, p

unit, stride = _base_and_stride(unit)

if unit in ['Y', 'A', 'M', 'W']:
warnings.warn("Y, A, M, and W units are deprecated and "
"will be removed in a future version.",
FutureWarning)

# The multipliers below are written as precomputed literals; the previous
# implementation spelled out the product shown in each in-line comment.
# Using the literal saves an operation.
if unit in ['D', 'd']:
m = stride * 86400000000000L # 1000000000L * DAY_SECONDS
p = 9
elif unit == 'h':
m = 1000000000L * 3600
elif unit in ['H', 'h']:
m = stride * 3600000000000L # 1000000000L * 3600
p = 9
elif unit == 'm':
m = 1000000000L * 60
elif unit in ['T', 'min', 'm']:
m = stride * 60000000000L # 1000000000L * 60
p = 9
elif unit == 's':
m = 1000000000L
elif unit in ['S', 's']:
m = stride * 1000000000L
p = 9
elif unit == 'ms':
m = 1000000L
elif unit in ['L', 'ms']:
m = stride * 1000000L
p = 6
elif unit == 'us':
m = 1000L
elif unit in ['U', 'us']:
m = stride * 1000L
p = 3
elif unit == 'ns' or unit is None:
m = 1L
elif unit in ['N', 'ns']:
m = stride * 1L
p = 0
# deprecated units at end because rarely evaluated
elif unit in ['Y', 'A']:
m = stride * 1000000000L * 31556952
p = 9
elif unit == 'M':
m = stride * 1000000000L * 2629746
p = 9
elif unit == 'W':
m = stride * 1000000000L * 86400 * 7
p = 9
else:
raise ValueError("cannot cast unit {unit}".format(unit=unit))
return m, p
Expand All @@ -305,7 +321,7 @@ cdef inline int64_t cast_from_unit(object ts, object unit) except? -1:
if ts is None:
return m

# cast the unit, multiply base/frace separately
# cast the unit, multiply base/frac separately
# to avoid precision issues from float -> int
base = <int64_t>ts
frac = ts - base
Expand Down Expand Up @@ -505,8 +521,9 @@ cdef inline timedelta_from_spec(object number, object frac, object unit):

try:
unit = ''.join(unit)
unit, stride = _base_and_stride(unit)
if unit == 'M':
# To parse ISO 8601 string, 'M' should be treated as minute,
# To parse ISO 8601 string, 'm' should be treated as minute,
# not month
unit = 'm'
unit = parse_timedelta_unit(unit)
Expand Down Expand Up @@ -1255,6 +1272,9 @@ class Timedelta(_Timedelta):
"[weeks, days, hours, minutes, seconds, "
"milliseconds, microseconds, nanoseconds]")

if unit is not None:
unit, stride = _base_and_stride(unit)

if unit in {'Y', 'y', 'M'}:
warnings.warn("M and Y units are deprecated and "
"will be removed in a future version.",
Expand Down
60 changes: 45 additions & 15 deletions pandas/core/tools/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

from pandas._libs import tslib, tslibs
from pandas._libs.tslibs import Timestamp, conversion, parsing
from pandas._libs.tslibs.frequencies import _base_and_stride
from pandas._libs.tslibs.parsing import ( # noqa
DateParseError,
_format_is_iso,
Expand Down Expand Up @@ -378,7 +379,6 @@ def _convert_listlike_datetimes(

arg = ensure_object(arg)
require_iso8601 = False

if infer_datetime_format and format is None:
format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst)

Expand Down Expand Up @@ -490,15 +490,18 @@ def _adjust_to_origin(arg, origin, unit):
origin : 'julian' or Timestamp
origin offset for the arg
unit : string
passed unit from to_datetime, must be 'D'
passed unit from to_datetime, must be 'D' if origin is 'julian'

Returns
-------
ndarray or scalar of adjusted date(s)
"""
from pandas import DatetimeIndex

if origin == "julian":
original = arg
j0 = Timestamp(0).to_julian_date()
unit, stride = _base_and_stride(unit)
if unit != "D":
raise ValueError("unit must be 'D' for origin='julian'")
try:
Expand Down Expand Up @@ -527,7 +530,8 @@ def _adjust_to_origin(arg, origin, unit):
)
)

# we are going to offset back to unix / epoch time
# validate that the origin is within the valid range and has no time
# zone
try:
offset = Timestamp(origin)
except tslibs.OutOfBoundsDatetime:
Expand All @@ -542,18 +546,28 @@ def _adjust_to_origin(arg, origin, unit):

if offset.tz is not None:
raise ValueError("origin offset {} must be tz-naive".format(offset))
offset -= Timestamp(0)

# convert the offset to the unit of the arg
# this should be lossless in terms of precision
offset = offset // tslibs.Timedelta(1, unit=unit)
unit, stride = _base_and_stride(unit)

delta = tslibs.Timedelta(stride, unit=unit)

# scalars & ndarray-like can handle the addition
# scalars & ndarray-like can handle the multiplication and addition
if is_list_like(arg) and not isinstance(
arg, (ABCSeries, ABCIndexClass, np.ndarray)
):
arg = np.asarray(arg)
arg = arg + offset

if stride == 1 and (offset - offset.floor("D")) == tslibs.Timedelta(0):
arg = arg + (offset.value // delta.value)
else:
# convert any integer type to int64 to prevent overflow
if is_integer_dtype(arg):
arg = arg.astype("int64", copy=False)
try:
arg = DatetimeIndex((arg * delta.value) + offset.value)
except TypeError:
arg = Timestamp((arg * delta.value) + offset.value)

return arg


Expand Down Expand Up @@ -627,11 +641,27 @@ def to_datetime(
- If True, require an exact format match.
- If False, allow the format to match anywhere in the target string.

unit : string, default 'ns'
unit of the arg (D,s,ms,us,ns) denote the unit, which is an
integer or float number. This will be based off the origin.
Example, with unit='ms' and origin='unix' (the default), this
would calculate the number of milliseconds to the unix epoch start.
unit : string, default is 'N'
The unit code for the value(s) in `arg`. Used when `arg` is
a numeric value or ordered collection of numeric values.
The unit code is a subset of pandas offset aliases, ISO 8601
codes, and legacy codes.

- 'D', for daily
- 'H' or 'h' for hourly
- 'T', 'm', or 'min' for minutely
- 'S' or 's' for seconds
- 'L' or 'ms' for milliseconds
- 'U' or 'us' for microseconds
- 'N' or 'ns' for nanoseconds

The result is computed relative to `origin`.
For example, with unit='L' and origin='unix' (the default), the
values in `arg` represent the number of milliseconds since the
start of the Unix epoch.

The unit code can be prefixed with a stride. For example,
unit='24H' and unit='D' produce equivalent results.
infer_datetime_format : boolean, default False
If True and no `format` is given, attempt to infer the format of the
datetime strings, and if it can be inferred, switch to a faster
Expand Down Expand Up @@ -885,7 +915,7 @@ def coerce(values):
# allow coercion to numeric if `errors` permits it
values = to_numeric(values, errors=errors)

# prevent overflow in case of int8 or int16
# convert any integer type to int64 to prevent overflow
if is_integer_dtype(values):
values = values.astype("int64", copy=False)
return values
Expand Down
92 changes: 92 additions & 0 deletions pandas/tests/indexes/datetimes/test_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -1358,6 +1358,98 @@ def test_to_datetime_errors_ignore_utc_true(self):
expected = DatetimeIndex(["1970-01-01 00:00:01"], tz="UTC")
tm.assert_index_equal(result, expected)

@pytest.mark.parametrize(
    "unitl,unitr",
    [
        ("D", "24h"),
        ("2D", "48H"),
        ("3D", "72H"),
        ("4D", "96H"),
        ("7D", "168H"),
        ("86400000000000N", "D"),
        ("60T", "H"),
        ("120m", "2H"),
        ("60min", "H"),
        ("3600S", "h"),
        ("3600000ms", "H"),
        ("3600000L", "H"),
        ("3600000000U", "H"),
        ("60s", "m"),
        ("60S", "T"),
        ("60S", "min"),
        ("1000ms", "S"),
        ("1000L", "S"),
        ("1000000U", "S"),
        ("1000000000N", "S"),
    ],
)
def test_to_datetime_stride(self, unitl, unitr):
    """Check that equivalent unit codes convert to the same datetimes.

    Each ``(unitl, unitr)`` pair names the same length of time with two
    different unit codes. ``pd.to_datetime`` is compared for both a list
    input and a scalar input.
    """
    # list input: the two resulting indexes must match
    multiples = [1, 2, 3, 5]
    from_left = pd.to_datetime(multiples, unit=unitl)
    from_right = pd.to_datetime(multiples, unit=unitr)
    tm.assert_index_equal(from_left, from_right)

    # scalar input: the two resulting values must compare equal
    scalar_left = pd.to_datetime(2, unit=unitl)
    scalar_right = pd.to_datetime(2, unit=unitr)
    assert scalar_left == scalar_right

# 'm' (minutes) and 'M' (months) are left out of the following test because
# the expected values come from pd.date_range, which treats both 'm' and 'M'
# as months.
@pytest.mark.parametrize(
    "unit,epoch",
    [
        ("D", "1980-01-02"),
        ("D", "2018-05-18"),
        ("D", "2018-05-18T11"),
        ("D", "2018-05-18T11:04"),
        ("D", "2018-05-18T11:04:52"),
        ("2D", "1970-01-01"),
        ("2D", "1970-01-01T21:12:43"),
        ("2D", "2019-05-03"),
        ("2D", "2019-05-03T12:11"),
        ("3D", "1970-01-01"),
        ("3D", "2019-05-03T14"),
        ("4D", "1970-05-03"),
        ("4D", "2019-05-03T11"),
        ("5D", "2019-05-03T11"),
        ("6D", "2019-05-03T11"),
        ("7D", "2019-05-03T11"),
        ("14D", "2019-05-03T11"),
        ("H", "2018-05-18"),
        ("H", "2018-05-18T11"),
        ("H", "2018-05-18T11:04"),
        ("H", "2018-05-18T11:04:52"),
        ("12h", "1990-05-03T12:00:00"),
        ("24H", "1980-12-31"),
        ("48h", "1980-12-31"),
        ("96h", "1980-12-31"),
        ("2H", "2019-12-31T11:59"),
        ("24h", "2001-08-15"),
        ("5T", "2001-08-15"),
        ("5min", "2001-08-15"),
        ("10T", "2001-08-15"),
        ("5S", "1970-01-01T01:10"),
        ("60S", "1970-01-01T01:10:12"),
        ("5T", "1980-12-31"),
        ("1000T", "1980-12-31"),
        ("100N", "1980-12-31"),
        ("N", "1980-12-31"),
    ],
)
def test_to_datetime_stride_epoch(self, unit, epoch):
    """Check that counts from a custom origin line up with ``pd.date_range``.

    Converting ``0..99`` with the given ``unit`` and ``origin=epoch`` is
    compared against a 100-period ``pd.date_range`` that starts at ``epoch``
    with ``freq=unit``. The scalar ``2`` is compared against the element at
    position 2 of that same range.
    """
    n_periods = 100

    # list input vs. the full reference range
    converted = pd.to_datetime(list(range(n_periods)), unit=unit, origin=epoch)
    reference = pd.date_range(start=epoch, freq=unit, periods=n_periods)
    tm.assert_index_equal(converted, reference)

    # scalar input vs. the matching entry of the reference range
    position = 2
    converted_scalar = pd.to_datetime(position, unit=unit, origin=epoch)
    assert converted_scalar == reference[position]

@pytest.mark.parametrize("unit", ["Y", "A", "M", "W"])
def test_to_datetime_unit_code_deprecated(self, unit):
    """Check that the 'Y', 'A', 'M' and 'W' unit codes emit a FutureWarning."""
    values = list(range(100))
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        # only the emitted warning matters; the converted result is discarded
        pd.to_datetime(values, unit=unit, origin="unix")


class TestToDatetimeMisc:
def test_to_datetime_barely_out_of_bounds(self):
Expand Down
Loading