From 9c67f70eaa90f6286cebb56fc1b827451ffa51bc Mon Sep 17 00:00:00 2001 From: Tim Cera Date: Mon, 3 Jun 2019 09:52:08 -0400 Subject: [PATCH 1/5] ENH: Added stride/offset aliases in to_datetime Added support for stride and offset aliases to the 'unit' keyword in 'to_datetime'. Documented support for additional 'unit' intervals that were available. --- pandas/_libs/tslibs/timedeltas.pyx | 59 ++++++++++++++++++++---------- pandas/core/tools/datetimes.py | 25 ++++++++++--- 2 files changed, 60 insertions(+), 24 deletions(-) diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 16093ddd77667..81641348ea159 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -26,6 +26,8 @@ from pandas._libs.tslibs.c_timestamp cimport _Timestamp from pandas._libs.tslibs.ccalendar import DAY_SECONDS +from pandas._libs.tslibs.frequencies import _base_and_stride + from pandas._libs.tslibs.np_datetime cimport ( cmp_scalar, reverse_ops, td64_to_tdstruct, pandas_timedeltastruct) @@ -252,35 +254,54 @@ cpdef inline object precision_from_unit(object unit): int64_t m int p - if unit == 'Y': - m = 1000000000L * 31556952 + if unit is None: + m = 1L + p = 0 + return m, p + + unit, stride = _base_and_stride(unit) + + # Normalize old or lowercase codes to standard offset aliases. 
+ if unit in ['min', 'm']: + unit = 'T' + elif unit == 'ns': + unit = 'N' + elif unit == 'ms': + unit = 'L' + elif unit == 'us': + unit = 'U' + + unit = unit.upper() + + if unit in ['Y', 'A']: + m = stride * 1000000000L * 31556952 p = 9 elif unit == 'M': - m = 1000000000L * 2629746 + m = stride * 1000000000L * 2629746 p = 9 elif unit == 'W': - m = 1000000000L * DAY_SECONDS * 7 + m = stride * 1000000000L * DAY_SECONDS * 7 p = 9 - elif unit == 'D' or unit == 'd': - m = 1000000000L * DAY_SECONDS + elif unit == 'D': + m = stride * 1000000000L * DAY_SECONDS p = 9 - elif unit == 'h': - m = 1000000000L * 3600 + elif unit == 'H': + m = stride * 1000000000L * 3600 p = 9 - elif unit == 'm': - m = 1000000000L * 60 + elif unit == 'T': + m = stride * 1000000000L * 60 p = 9 - elif unit == 's': - m = 1000000000L + elif unit == 'S': + m = stride * 1000000000L p = 9 - elif unit == 'ms': - m = 1000000L + elif unit == 'L': + m = stride * 1000000L p = 6 - elif unit == 'us': - m = 1000L + elif unit == 'U': + m = stride * 1000L p = 3 - elif unit == 'ns' or unit is None: - m = 1L + elif unit == 'N': + m = stride * 1L p = 0 else: raise ValueError("cannot cast unit {unit}".format(unit=unit)) @@ -300,7 +321,7 @@ cdef inline int64_t cast_from_unit(object ts, object unit) except? -1: if ts is None: return m - # cast the unit, multiply base/frace separately + # cast the unit, multiply base/frac separately # to avoid precision issues from float -> int base = ts frac = ts - base diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 817d539d4ad6f..17f5fdfb5c96c 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -470,11 +470,26 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, - If True, require an exact format match. - If False, allow the format to match anywhere in the target string. - unit : string, default 'ns' - unit of the arg (D,s,ms,us,ns) denote the unit, which is an - integer or float number. 
This will be based off the origin. - Example, with unit='ms' and origin='unix' (the default), this - would calculate the number of milliseconds to the unix epoch start. + unit : string, default is 'N' + The unit of the arg. Uses a subset of the pandas offset aliases. + + - 'Y', 'A' for yearly (long term average of 365.2425 days) + - 'M' for monthly (long term average of 30.436875 days) + - 'W' for weekly + - 'D' for daily + - 'H' for hourly + - 'T', 'min' for minutely + - 'S' for seconds + - 'L', 'ms' for milliseconds + - 'U', 'us' for microseconds + - 'N' for nanoseconds + + This will be based off the origin. Example, with unit='L' and + origin='unix' (the default), this would calculate the number of + milliseconds to the unix epoch start. + + The offset alias can be prefixed with a stride. For example, results + would be equivalent between unit='7D' and unit='W'. infer_datetime_format : boolean, default False If True and no `format` is given, attempt to infer the format of the datetime strings, and if it can be inferred, switch to a faster From bd3d10fcaf495d6c91855e946e5960e93e5f9aad Mon Sep 17 00:00:00 2001 From: Tim Cera Date: Mon, 3 Jun 2019 09:52:08 -0400 Subject: [PATCH 2/5] ENH: Added stride/offset aliases in to_datetime Added support for stride and offset aliases to the 'unit' keyword in 'to_datetime'. Documented support for additional 'unit' intervals that were available. 
--- pandas/_libs/tslibs/timedeltas.pyx | 59 ++++++++++++++++++++---------- pandas/core/tools/datetimes.py | 25 ++++++++++--- 2 files changed, 60 insertions(+), 24 deletions(-) diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 6a32553fe2d38..f1cfe8d6eb297 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -26,6 +26,8 @@ from pandas._libs.tslibs.c_timestamp cimport _Timestamp from pandas._libs.tslibs.ccalendar import DAY_SECONDS +from pandas._libs.tslibs.frequencies import _base_and_stride + from pandas._libs.tslibs.np_datetime cimport ( cmp_scalar, reverse_ops, td64_to_tdstruct, pandas_timedeltastruct) @@ -252,35 +254,54 @@ cpdef inline object precision_from_unit(object unit): int64_t m int p - if unit == 'Y': - m = 1000000000L * 31556952 + if unit is None: + m = 1L + p = 0 + return m, p + + unit, stride = _base_and_stride(unit) + + # Normalize old or lowercase codes to standard offset aliases. + if unit in ['min', 'm']: + unit = 'T' + elif unit == 'ns': + unit = 'N' + elif unit == 'ms': + unit = 'L' + elif unit == 'us': + unit = 'U' + + unit = unit.upper() + + if unit in ['Y', 'A']: + m = stride * 1000000000L * 31556952 p = 9 elif unit == 'M': - m = 1000000000L * 2629746 + m = stride * 1000000000L * 2629746 p = 9 elif unit == 'W': - m = 1000000000L * DAY_SECONDS * 7 + m = stride * 1000000000L * DAY_SECONDS * 7 p = 9 - elif unit == 'D' or unit == 'd': - m = 1000000000L * DAY_SECONDS + elif unit == 'D': + m = stride * 1000000000L * DAY_SECONDS p = 9 - elif unit == 'h': - m = 1000000000L * 3600 + elif unit == 'H': + m = stride * 1000000000L * 3600 p = 9 - elif unit == 'm': - m = 1000000000L * 60 + elif unit == 'T': + m = stride * 1000000000L * 60 p = 9 - elif unit == 's': - m = 1000000000L + elif unit == 'S': + m = stride * 1000000000L p = 9 - elif unit == 'ms': - m = 1000000L + elif unit == 'L': + m = stride * 1000000L p = 6 - elif unit == 'us': - m = 1000L + elif unit == 'U': + m = 
stride * 1000L p = 3 - elif unit == 'ns' or unit is None: - m = 1L + elif unit == 'N': + m = stride * 1L p = 0 else: raise ValueError("cannot cast unit {unit}".format(unit=unit)) @@ -300,7 +321,7 @@ cdef inline int64_t cast_from_unit(object ts, object unit) except? -1: if ts is None: return m - # cast the unit, multiply base/frace separately + # cast the unit, multiply base/frac separately # to avoid precision issues from float -> int base = ts frac = ts - base diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 20c4b9422459c..b706b39214c27 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -626,11 +626,26 @@ def to_datetime( - If True, require an exact format match. - If False, allow the format to match anywhere in the target string. - unit : string, default 'ns' - unit of the arg (D,s,ms,us,ns) denote the unit, which is an - integer or float number. This will be based off the origin. - Example, with unit='ms' and origin='unix' (the default), this - would calculate the number of milliseconds to the unix epoch start. + unit : string, default is 'N' + The unit of the arg. Uses a subset of the pandas offset aliases. + + - 'Y', 'A' for yearly (long term average of 365.2425 days) + - 'M' for monthly (long term average of 30.436875 days) + - 'W' for weekly + - 'D' for daily + - 'H' for hourly + - 'T', 'min' for minutely + - 'S' for seconds + - 'L', 'ms' for milliseconds + - 'U', 'us' for microseconds + - 'N' for nanoseconds + + This will be based off the origin. Example, with unit='L' and + origin='unix' (the default), this would calculate the number of + milliseconds to the unix epoch start. + + The offset alias can be prefixed with a stride. For example, results + would be equivalent between unit='7D' and unit='W'. 
infer_datetime_format : boolean, default False If True and no `format` is given, attempt to infer the format of the datetime strings, and if it can be inferred, switch to a faster From 0104f1763571d66cc27c6a02ea7a04dde1839acf Mon Sep 17 00:00:00 2001 From: Tim Cera Date: Thu, 13 Jun 2019 17:31:19 -0400 Subject: [PATCH 3/5] FEAT: Continue adding stride to unit codes in to_datetime() The 'origin' in to_datetime can be any resolution. New tests for origin resolution and strided unit codes. Deprecated warning for to_datetime units: 'A', 'Y', 'M', 'W'. Edited docstring to reflect strided unit codes and origin resolution. Replaced re test with simple lstrip. Moved pattern to only other file where it is used. Added 'd' to represent daily, but default should be 'D'. Changed 'd' to 'D' in tests. --- pandas/_libs/tslibs/frequencies.pyx | 21 +---- pandas/_libs/tslibs/timedeltas.pyx | 66 ++++++------- pandas/core/tools/datetimes.py | 73 +++++++++------ pandas/tests/indexes/datetimes/test_tools.py | 92 +++++++++++++++++++ .../indexes/timedeltas/test_arithmetic.py | 4 +- .../indexes/timedeltas/test_construction.py | 2 +- .../indexes/timedeltas/test_timedelta.py | 2 +- pandas/tests/io/sas/test_sas7bdat.py | 4 +- .../tests/scalar/timedelta/test_arithmetic.py | 40 ++++---- .../scalar/timedelta/test_construction.py | 4 +- pandas/tests/scalar/timedelta/test_formats.py | 2 +- .../tests/scalar/timedelta/test_timedelta.py | 18 ++-- pandas/tests/series/test_analytics.py | 2 +- pandas/tests/series/test_dtypes.py | 2 +- pandas/tseries/frequencies.py | 7 +- 15 files changed, 220 insertions(+), 119 deletions(-) diff --git a/pandas/_libs/tslibs/frequencies.pyx b/pandas/_libs/tslibs/frequencies.pyx index f2dcd37b191ed..c03c525aac596 100644 --- a/pandas/_libs/tslibs/frequencies.pyx +++ b/pandas/_libs/tslibs/frequencies.pyx @@ -1,5 +1,3 @@ -import re - cimport numpy as cnp cnp.import_array() @@ -7,14 +5,6 @@ from pandas._libs.tslibs.util cimport is_integer_object from 
pandas._libs.tslibs.ccalendar import MONTH_NUMBERS -# ---------------------------------------------------------------------- -# Constants - -# hack to handle WOM-1MON -opattern = re.compile( - r'([+\-]?\d*|[+\-]?\d*\.\d*)\s*([A-Za-z]+([\-][\dA-Za-z\-]+)?)' -) - INVALID_FREQ_ERR_MSG = "Invalid frequency: {0}" # --------------------------------------------------------------------- @@ -190,20 +180,19 @@ cpdef _base_and_stride(freqstr): -------- _freq_and_stride('5Min') -> 'Min', 5 """ - groups = opattern.match(freqstr) + base = freqstr.lstrip('+-. 0123456789') + stride = freqstr[:freqstr.index(base)] - if not groups: + if not base: raise ValueError("Could not evaluate {freq}".format(freq=freqstr)) - stride = groups.group(1) - + # Possible for stride to be float at this point. Should it fail or floor? + # Right now it fails. if len(stride): stride = int(stride) else: stride = 1 - base = groups.group(2) - return base, stride diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index f1cfe8d6eb297..0faaf591305c6 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -24,8 +24,6 @@ from pandas._libs.tslibs.util cimport ( from pandas._libs.tslibs.c_timestamp cimport _Timestamp -from pandas._libs.tslibs.ccalendar import DAY_SECONDS - from pandas._libs.tslibs.frequencies import _base_and_stride from pandas._libs.tslibs.np_datetime cimport ( @@ -232,6 +230,7 @@ def array_to_timedelta64(object[:] values, unit='ns', errors='raise'): for i in range(n): result[i] = parse_timedelta_string(values[i]) except: + unit, stride = _base_and_stride(unit) unit = parse_timedelta_unit(unit) for i in range(n): try: @@ -261,48 +260,45 @@ cpdef inline object precision_from_unit(object unit): unit, stride = _base_and_stride(unit) - # Normalize old or lowercase codes to standard offset aliases. 
- if unit in ['min', 'm']: - unit = 'T' - elif unit == 'ns': - unit = 'N' - elif unit == 'ms': - unit = 'L' - elif unit == 'us': - unit = 'U' - - unit = unit.upper() + if unit in ['Y', 'A', 'M', 'W']: + warnings.warn("Y, A, M, and W units are deprecated and " + "will be removed in a future version.", + FutureWarning) - if unit in ['Y', 'A']: - m = stride * 1000000000L * 31556952 - p = 9 - elif unit == 'M': - m = stride * 1000000000L * 2629746 - p = 9 - elif unit == 'W': - m = stride * 1000000000L * DAY_SECONDS * 7 + # Don't know why previous implementation used the multiplication in the + # in-line comment instead of the value. Current approach saves an + # operation. + if unit in ['D', 'd']: + m = stride * 86400000000000L # 1000000000L * DAY_SECONDS p = 9 - elif unit == 'D': - m = stride * 1000000000L * DAY_SECONDS + elif unit in ['H', 'h']: + m = stride * 3600000000000L # 1000000000L * 3600 p = 9 - elif unit == 'H': - m = stride * 1000000000L * 3600 + elif unit in ['T', 'min', 'm']: + m = stride * 60000000000L # 1000000000L * 60 p = 9 - elif unit == 'T': - m = stride * 1000000000L * 60 - p = 9 - elif unit == 'S': + elif unit in ['S', 's']: m = stride * 1000000000L p = 9 - elif unit == 'L': + elif unit in ['L', 'ms']: m = stride * 1000000L p = 6 - elif unit == 'U': + elif unit in ['U', 'us']: m = stride * 1000L p = 3 - elif unit == 'N': + elif unit in ['N', 'ns']: m = stride * 1L p = 0 + # deprecated units at end because rarely evaluated + elif unit in ['Y', 'A']: + m = stride * 1000000000L * 31556952 + p = 9 + elif unit == 'M': + m = stride * 1000000000L * 2629746 + p = 9 + elif unit == 'W': + m = stride * 1000000000L * 86400 * 7 + p = 9 else: raise ValueError("cannot cast unit {unit}".format(unit=unit)) return m, p @@ -521,8 +517,9 @@ cdef inline timedelta_from_spec(object number, object frac, object unit): try: unit = ''.join(unit) + unit, stride = _base_and_stride(unit) if unit == 'M': - # To parse ISO 8601 string, 'M' should be treated as minute, + # To parse 
ISO 8601 string, 'M' should be treated as minute, # not month unit = 'm' unit = parse_timedelta_unit(unit) @@ -1271,6 +1268,9 @@ class Timedelta(_Timedelta): "[weeks, days, hours, minutes, seconds, " "milliseconds, microseconds, nanoseconds]") + if unit is not None: + unit, stride = _base_and_stride(unit) + if unit in {'Y', 'y', 'M'}: warnings.warn("M and Y units are deprecated and " "will be removed in a future version.", diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index b706b39214c27..4b0f36f811afa 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -7,6 +7,7 @@ from pandas._libs import tslib, tslibs from pandas._libs.tslibs import Timestamp, conversion, parsing +from pandas._libs.tslibs.frequencies import _base_and_stride from pandas._libs.tslibs.parsing import ( # noqa DateParseError, _format_is_iso, @@ -375,7 +376,6 @@ def _convert_listlike_datetimes( arg = ensure_object(arg) require_iso8601 = False - if infer_datetime_format and format is None: format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst) @@ -489,15 +489,18 @@ def _adjust_to_origin(arg, origin, unit): origin : 'julian' or Timestamp origin offset for the arg unit : string - passed unit from to_datetime, must be 'D' + passed unit from to_datetime, must be 'D' if origin is 'julian' Returns ------- ndarray or scalar of adjusted date(s) """ + from pandas import DatetimeIndex + if origin == "julian": original = arg j0 = Timestamp(0).to_julian_date() + unit, stride = _base_and_stride(unit) if unit != "D": raise ValueError("unit must be 'D' for origin='julian'") try: @@ -526,7 +529,8 @@ def _adjust_to_origin(arg, origin, unit): ) ) - # we are going to offset back to unix / epoch time + # test the origin to make sure within valid range and no time + # zone try: offset = Timestamp(origin) except tslibs.OutOfBoundsDatetime: @@ -541,18 +545,28 @@ def _adjust_to_origin(arg, origin, unit): if offset.tz is not None: raise 
ValueError("origin offset {} must be tz-naive".format(offset)) - offset -= Timestamp(0) - # convert the offset to the unit of the arg - # this should be lossless in terms of precision - offset = offset // tslibs.Timedelta(1, unit=unit) + unit, stride = _base_and_stride(unit) + + delta = tslibs.Timedelta(stride, unit=unit) - # scalars & ndarray-like can handle the addition + # scalars & ndarray-like can handle the multiplication and addition if is_list_like(arg) and not isinstance( arg, (ABCSeries, ABCIndexClass, np.ndarray) ): arg = np.asarray(arg) - arg = arg + offset + + if stride == 1 and (offset - offset.floor("D")) == tslibs.Timedelta(0): + arg = arg + (offset.value // delta.value) + else: + # convert any integer type to int64 to prevent overflow + if is_integer_dtype(arg): + arg = arg.astype("int64", copy=False) + try: + arg = DatetimeIndex((arg * delta.value) + offset.value) + except TypeError: + arg = Timestamp((arg * delta.value) + offset.value) + return arg @@ -627,25 +641,26 @@ def to_datetime( - If False, allow the format to match anywhere in the target string. unit : string, default is 'N' - The unit of the arg. Uses a subset of the pandas offset aliases. - - - 'Y', 'A' for yearly (long term average of 365.2425 days) - - 'M' for monthly (long term average of 30.436875 days) - - 'W' for weekly - - 'D' for daily - - 'H' for hourly - - 'T', 'min' for minutely - - 'S' for seconds - - 'L', 'ms' for milliseconds - - 'U', 'us' for microseconds - - 'N' for nanoseconds - - This will be based off the origin. Example, with unit='L' and - origin='unix' (the default), this would calculate the number of - milliseconds to the unix epoch start. - - The offset alias can be prefixed with a stride. For example, results - would be equivalent between unit='7D' and unit='W'. + The unit code for the value(s) in `arg`. Used when `arg` is + a numeric value or ordered collection of numeric values. 
+ The unit code is a subset of pandas offset aliases, ISO 8601 + codes, and legacy codes. + + - 'D', for daily + - 'H' or 'h' for hourly + - 'T', 'm', or 'min' for minutely + - 'S' or 's' for seconds + - 'L' or 'ms' for milliseconds + - 'U' or 'us' for microseconds + - 'N' or 'ns' for nanoseconds + + The resulting DatetimeIndex will be based off the `origin`. + For example, with unit='L' and origin='unix' (the default) then + the values in `arg` would represent the number of milliseconds + from the unix epoch start. + + The unit code can be prefixed with a stride. For example, + results would be equivalent between unit='24H' and unit='D'. infer_datetime_format : boolean, default False If True and no `format` is given, attempt to infer the format of the datetime strings, and if it can be inferred, switch to a faster @@ -899,7 +914,7 @@ def coerce(values): # we allow coercion to if errors allows values = to_numeric(values, errors=errors) - # prevent overflow in case of int8 or int16 + # convert any integer type to int64 to prevent overflow if is_integer_dtype(values): values = values.astype("int64", copy=False) return values diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index 10d422e8aa52c..a54489ad0fdee 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -1361,6 +1361,98 @@ def test_to_datetime_errors_ignore_utc_true(self): expected = DatetimeIndex(["1970-01-01 00:00:01"], tz="UTC") tm.assert_index_equal(result, expected) + @pytest.mark.parametrize( + "unitl,unitr", + [ + ("D", "24h"), + ("2D", "48H"), + ("3D", "72H"), + ("4D", "96H"), + ("7D", "168H"), + ("86400000000000N", "D"), + ("60T", "H"), + ("120m", "2H"), + ("60min", "H"), + ("3600S", "h"), + ("3600000ms", "H"), + ("3600000L", "H"), + ("3600000000U", "H"), + ("60s", "m"), + ("60S", "T"), + ("60S", "min"), + ("1000ms", "S"), + ("1000L", "S"), + ("1000000U", "S"), + ("1000000000N", "S"), + ], 
+ ) + def test_to_datetime_stride(self, unitl, unitr): + result = pd.to_datetime([1, 2, 3, 5], unit=unitl) + expected = pd.to_datetime([1, 2, 3, 5], unit=unitr) + tm.assert_index_equal(result, expected) + + result = pd.to_datetime(2, unit=unitl) + expected = pd.to_datetime(2, unit=unitr) + assert result == expected + + # Can't use 'm' for minutes and 'M' for months in the following test + # since tested against pd.date_range which sees both 'm' and 'M' as + # months. + @pytest.mark.parametrize( + "unit,epoch", + [ + ("D", "1980-01-02"), + ("D", "2018-05-18"), + ("D", "2018-05-18T11"), + ("D", "2018-05-18T11:04"), + ("D", "2018-05-18T11:04:52"), + ("2D", "1970-01-01"), + ("2D", "1970-01-01T21:12:43"), + ("2D", "2019-05-03"), + ("2D", "2019-05-03T12:11"), + ("3D", "1970-01-01"), + ("3D", "2019-05-03T14"), + ("4D", "1970-05-03"), + ("4D", "2019-05-03T11"), + ("5D", "2019-05-03T11"), + ("6D", "2019-05-03T11"), + ("7D", "2019-05-03T11"), + ("14D", "2019-05-03T11"), + ("H", "2018-05-18"), + ("H", "2018-05-18T11"), + ("H", "2018-05-18T11:04"), + ("H", "2018-05-18T11:04:52"), + ("12h", "1990-05-03T12:00:00"), + ("24H", "1980-12-31"), + ("48h", "1980-12-31"), + ("96h", "1980-12-31"), + ("2H", "2019-12-31T11:59"), + ("24h", "2001-08-15"), + ("5T", "2001-08-15"), + ("5min", "2001-08-15"), + ("10T", "2001-08-15"), + ("5S", "1970-01-01T01:10"), + ("60S", "1970-01-01T01:10:12"), + ("5T", "1980-12-31"), + ("1000T", "1980-12-31"), + ("100N", "1980-12-31"), + ("N", "1980-12-31"), + ], + ) + def test_to_datetime_stride_epoch(self, unit, epoch): + result = pd.to_datetime(list(range(100)), unit=unit, origin=epoch) + expected = pd.date_range(start=epoch, freq=unit, periods=100) + tm.assert_index_equal(result, expected) + + result = pd.to_datetime(2, unit=unit, origin=epoch) + expected = pd.date_range(start=epoch, freq=unit, periods=100)[2] + assert result == expected + + @pytest.mark.parametrize("unit", ["Y", "A", "M", "W"]) + def test_to_datetime_unit_code_deprecated(self, unit): + 
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + _ = pd.to_datetime(list(range(100)), unit=unit, origin="unix") + class TestToDatetimeMisc: def test_to_datetime_barely_out_of_bounds(self): diff --git a/pandas/tests/indexes/timedeltas/test_arithmetic.py b/pandas/tests/indexes/timedeltas/test_arithmetic.py index 4544657f79af7..ff13208e4cfb1 100644 --- a/pandas/tests/indexes/timedeltas/test_arithmetic.py +++ b/pandas/tests/indexes/timedeltas/test_arithmetic.py @@ -61,7 +61,7 @@ def test_tdi_shift_minutes(self): def test_tdi_shift_int(self): # GH#8083 - trange = pd.to_timedelta(range(5), unit="d") + pd.offsets.Hour(1) + trange = pd.to_timedelta(range(5), unit="D") + pd.offsets.Hour(1) result = trange.shift(1) expected = TimedeltaIndex( [ @@ -77,7 +77,7 @@ def test_tdi_shift_int(self): def test_tdi_shift_nonstandard_freq(self): # GH#8083 - trange = pd.to_timedelta(range(5), unit="d") + pd.offsets.Hour(1) + trange = pd.to_timedelta(range(5), unit="D") + pd.offsets.Hour(1) result = trange.shift(3, freq="2D 1s") expected = TimedeltaIndex( [ diff --git a/pandas/tests/indexes/timedeltas/test_construction.py b/pandas/tests/indexes/timedeltas/test_construction.py index 2e00d558958e1..b76bbe374af4d 100644 --- a/pandas/tests/indexes/timedeltas/test_construction.py +++ b/pandas/tests/indexes/timedeltas/test_construction.py @@ -182,7 +182,7 @@ def test_constructor_coverage(self): # NumPy string array strings = np.array(["1 days", "2 days", "3 days"]) result = TimedeltaIndex(strings) - expected = to_timedelta([1, 2, 3], unit="d") + expected = to_timedelta([1, 2, 3], unit="D") tm.assert_index_equal(result, expected) from_ints = TimedeltaIndex(expected.asi8) diff --git a/pandas/tests/indexes/timedeltas/test_timedelta.py b/pandas/tests/indexes/timedeltas/test_timedelta.py index e790a913fcac2..9a7a70d99957d 100644 --- a/pandas/tests/indexes/timedeltas/test_timedelta.py +++ b/pandas/tests/indexes/timedeltas/test_timedelta.py @@ -35,7 +35,7 @@ def 
setup_method(self, method): self.setup_indices() def create_index(self): - return pd.to_timedelta(range(5), unit="d") + pd.offsets.Hour(1) + return pd.to_timedelta(range(5), unit="D") + pd.offsets.Hour(1) def test_numeric_compat(self): # Dummy method to override super's version; this test is now done diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py index e37561c865c7a..7b9231ce1809a 100644 --- a/pandas/tests/io/sas/test_sas7bdat.py +++ b/pandas/tests/io/sas/test_sas7bdat.py @@ -23,9 +23,9 @@ def setup_method(self, datapath): fname = os.path.join(self.dirpath, "test_sas7bdat_{j}.csv".format(j=j)) df = pd.read_csv(fname) epoch = pd.datetime(1960, 1, 1) - t1 = pd.to_timedelta(df["Column4"], unit="d") + t1 = pd.to_timedelta(df["Column4"], unit="D") df["Column4"] = epoch + t1 - t2 = pd.to_timedelta(df["Column12"], unit="d") + t2 = pd.to_timedelta(df["Column12"], unit="D") df["Column12"] = epoch + t2 for k in range(df.shape[1]): col = df.iloc[:, k] diff --git a/pandas/tests/scalar/timedelta/test_arithmetic.py b/pandas/tests/scalar/timedelta/test_arithmetic.py index 52f32d41a02ff..82982c9d68268 100644 --- a/pandas/tests/scalar/timedelta/test_arithmetic.py +++ b/pandas/tests/scalar/timedelta/test_arithmetic.py @@ -68,7 +68,7 @@ def test_td_add_sub_one_day_ten_seconds(self, one_day_ten_secs): @pytest.mark.parametrize("op", [operator.add, ops.radd]) def test_td_add_datetimelike_scalar(self, op): # GH#19738 - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = op(td, datetime(2016, 1, 1)) if op is operator.add: @@ -90,7 +90,7 @@ def test_td_add_datetimelike_scalar(self, op): @pytest.mark.parametrize("op", [operator.add, ops.radd]) def test_td_add_td(self, op): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = op(td, Timedelta(days=10)) assert isinstance(result, Timedelta) @@ -98,35 +98,35 @@ def test_td_add_td(self, op): @pytest.mark.parametrize("op", [operator.add, ops.radd]) def 
test_td_add_pytimedelta(self, op): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = op(td, timedelta(days=9)) assert isinstance(result, Timedelta) assert result == Timedelta(days=19) @pytest.mark.parametrize("op", [operator.add, ops.radd]) def test_td_add_timedelta64(self, op): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = op(td, np.timedelta64(-4, "D")) assert isinstance(result, Timedelta) assert result == Timedelta(days=6) @pytest.mark.parametrize("op", [operator.add, ops.radd]) def test_td_add_offset(self, op): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = op(td, pd.offsets.Hour(6)) assert isinstance(result, Timedelta) assert result == Timedelta(days=10, hours=6) def test_td_sub_td(self): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") expected = Timedelta(0, unit="ns") result = td - td assert isinstance(result, Timedelta) assert result == expected def test_td_sub_pytimedelta(self): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") expected = Timedelta(0, unit="ns") result = td - td.to_pytimedelta() @@ -138,7 +138,7 @@ def test_td_sub_pytimedelta(self): assert result == expected def test_td_sub_timedelta64(self): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") expected = Timedelta(0, unit="ns") result = td - td.to_timedelta64() @@ -151,12 +151,12 @@ def test_td_sub_timedelta64(self): def test_td_sub_nat(self): # In this context pd.NaT is treated as timedelta-like - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = td - NaT assert result is NaT def test_td_sub_td64_nat(self): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") td_nat = np.timedelta64("NaT") result = td - td_nat @@ -166,13 +166,13 @@ def test_td_sub_td64_nat(self): assert result is NaT def test_td_sub_offset(self): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = td - pd.offsets.Hour(1) assert isinstance(result, Timedelta) assert result 
== Timedelta(239, unit="h") def test_td_add_sub_numeric_raises(self): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") for other in [2, 2.0, np.int64(2), np.float64(2)]: with pytest.raises(TypeError): td + other @@ -184,7 +184,7 @@ def test_td_add_sub_numeric_raises(self): other - td def test_td_rsub_nat(self): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = NaT - td assert result is NaT @@ -192,7 +192,7 @@ def test_td_rsub_nat(self): assert result is NaT def test_td_rsub_offset(self): - result = pd.offsets.Hour(1) - Timedelta(10, unit="d") + result = pd.offsets.Hour(1) - Timedelta(10, unit="D") assert isinstance(result, Timedelta) assert result == Timedelta(-239, unit="h") @@ -263,7 +263,7 @@ class TestTimedeltaMultiplicationDivision: @pytest.mark.parametrize("op", [operator.mul, ops.rmul]) def test_td_mul_nat(self, op, td_nat): # GH#19819 - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") with pytest.raises(TypeError): op(td, td_nat) @@ -271,7 +271,7 @@ def test_td_mul_nat(self, op, td_nat): @pytest.mark.parametrize("op", [operator.mul, ops.rmul]) def test_td_mul_nan(self, op, nan): # np.float64('NaN') has a 'dtype' attr, avoid treating as array - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = op(td, nan) assert result is NaT @@ -304,7 +304,7 @@ def test_td_mul_scalar(self, op): def test_td_div_timedeltalike_scalar(self): # GH#19738 - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = td / pd.offsets.Hour(1) assert result == 240 @@ -316,7 +316,7 @@ def test_td_div_timedeltalike_scalar(self): def test_td_div_numeric_scalar(self): # GH#19738 - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = td / 2 assert isinstance(result, Timedelta) @@ -329,7 +329,7 @@ def test_td_div_numeric_scalar(self): @pytest.mark.parametrize("nan", [np.nan, np.float64("NaN"), float("nan")]) def test_td_div_nan(self, nan): # np.float64('NaN') has a 'dtype' attr, avoid treating as array 
- td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = td / nan assert result is NaT @@ -341,7 +341,7 @@ def test_td_div_nan(self, nan): def test_td_rdiv_timedeltalike_scalar(self): # GH#19738 - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = pd.offsets.Hour(1) / td assert result == 1 / 240.0 diff --git a/pandas/tests/scalar/timedelta/test_construction.py b/pandas/tests/scalar/timedelta/test_construction.py index ae1e84576c092..e373e83f0e93d 100644 --- a/pandas/tests/scalar/timedelta/test_construction.py +++ b/pandas/tests/scalar/timedelta/test_construction.py @@ -8,8 +8,8 @@ def test_construction(): expected = np.timedelta64(10, "D").astype("m8[ns]").view("i8") - assert Timedelta(10, unit="d").value == expected - assert Timedelta(10.0, unit="d").value == expected + assert Timedelta(10, unit="D").value == expected + assert Timedelta(10.0, unit="D").value == expected assert Timedelta("10 days").value == expected assert Timedelta(days=10).value == expected assert Timedelta(days=10.0).value == expected diff --git a/pandas/tests/scalar/timedelta/test_formats.py b/pandas/tests/scalar/timedelta/test_formats.py index 753186ee4b738..0130ff977390d 100644 --- a/pandas/tests/scalar/timedelta/test_formats.py +++ b/pandas/tests/scalar/timedelta/test_formats.py @@ -6,7 +6,7 @@ @pytest.mark.parametrize( "td, expected_repr", [ - (Timedelta(10, unit="d"), "Timedelta('10 days 00:00:00')"), + (Timedelta(10, unit="D"), "Timedelta('10 days 00:00:00')"), (Timedelta(10, unit="s"), "Timedelta('0 days 00:00:10')"), (Timedelta(10, unit="ms"), "Timedelta('0 days 00:00:00.010000')"), (Timedelta(-10, unit="ms"), "Timedelta('-1 days +23:59:59.990000')"), diff --git a/pandas/tests/scalar/timedelta/test_timedelta.py b/pandas/tests/scalar/timedelta/test_timedelta.py index e4980be49d35f..e0a0fc0225085 100644 --- a/pandas/tests/scalar/timedelta/test_timedelta.py +++ b/pandas/tests/scalar/timedelta/test_timedelta.py @@ -62,17 +62,17 @@ class Other: assert 
td.__floordiv__(other) is NotImplemented def test_unary_ops(self): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") # __neg__, __pos__ - assert -td == Timedelta(-10, unit="d") - assert -td == Timedelta("-10d") - assert +td == Timedelta(10, unit="d") + assert -td == Timedelta(-10, unit="D") + assert -td == Timedelta("-10D") + assert +td == Timedelta(10, unit="D") # __abs__, __abs__(__neg__) assert abs(td) == td assert abs(-td) == td - assert abs(-td) == Timedelta("10d") + assert abs(-td) == Timedelta("10D") class TestTimedeltaComparison: @@ -198,7 +198,7 @@ def test_total_seconds_scalar(self): def test_conversion(self): - for td in [Timedelta(10, unit="d"), Timedelta("1 days, 10:11:12.012345")]: + for td in [Timedelta(10, unit="D"), Timedelta("1 days, 10:11:12.012345")]: pydt = td.to_pytimedelta() assert td == Timedelta(pydt) assert td == pydt @@ -454,7 +454,7 @@ def test_numeric_conversions(self): assert Timedelta(10, unit="us") == np.timedelta64(10, "us") assert Timedelta(10, unit="ms") == np.timedelta64(10, "ms") assert Timedelta(10, unit="s") == np.timedelta64(10, "s") - assert Timedelta(10, unit="d") == np.timedelta64(10, "D") + assert Timedelta(10, unit="D") == np.timedelta64(10, "D") def test_timedelta_conversions(self): assert Timedelta(timedelta(seconds=1)) == np.timedelta64(1, "s").astype( @@ -574,7 +574,7 @@ def test_round(self): def test_contains(self): # Checking for any NaT-like objects # GH 13603 - td = to_timedelta(range(5), unit="d") + pd.offsets.Hour(1) + td = to_timedelta(range(5), unit="D") + pd.offsets.Hour(1) for v in [pd.NaT, None, float("nan"), np.nan]: assert not (v in td) @@ -584,7 +584,7 @@ def test_contains(self): def test_identity(self): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") assert isinstance(td, Timedelta) assert isinstance(td, timedelta) diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 32d32a5d14fb2..d596a0576fe09 100644 --- 
a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -780,7 +780,7 @@ def test_isin_with_i8(self): assert_series_equal(result, expected) # timedelta64[ns] - s = Series(pd.to_timedelta(range(5), unit="d")) + s = Series(pd.to_timedelta(range(5), unit="D")) result = s.isin(s[0:2]) assert_series_equal(result, expected) diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index 9be79bf93ece7..f313dab4c8bd5 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -165,7 +165,7 @@ def test_astype_str_cast(self): expected = Series([str("2010-01-04 00:00:00-05:00")]) tm.assert_series_equal(s, expected) - td = Series([Timedelta(1, unit="d")]) + td = Series([Timedelta(1, unit="D")]) s = td.astype(str) expected = Series([str("1 days 00:00:00.000000000")]) diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index dfe91b514bbe1..92ce4e654c38b 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -144,8 +144,13 @@ def to_offset(freq): else: delta = None stride_sign = None + + # hack to handle WOM-1MON + opattern = re.compile( + r"([+\-]?\d*|[+\-]?\d*\.\d*)\s*([A-Za-z]+([\-][\dA-Za-z\-]+)?)" + ) try: - splitted = re.split(libfreqs.opattern, freq) + splitted = re.split(opattern, freq) if splitted[-1] != "" and not splitted[-1].isspace(): # the last element must be blank raise ValueError("last element must be blank") From 6037971adb54975b811e58cc351477190117e1cf Mon Sep 17 00:00:00 2001 From: Tim Cera Date: Mon, 3 Jun 2019 09:52:08 -0400 Subject: [PATCH 4/5] ENH: Added stride/offset aliases in to_datetime Added support for stride and offset aliases to the 'unit' keyword in 'to_datetime'. Documented support for additional 'unit' intervals that were available. 
--- pandas/_libs/tslibs/timedeltas.pyx | 59 ++++++++++++++++++++---------- pandas/core/tools/datetimes.py | 25 ++++++++++--- 2 files changed, 60 insertions(+), 24 deletions(-) diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 6a32553fe2d38..f1cfe8d6eb297 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -26,6 +26,8 @@ from pandas._libs.tslibs.c_timestamp cimport _Timestamp from pandas._libs.tslibs.ccalendar import DAY_SECONDS +from pandas._libs.tslibs.frequencies import _base_and_stride + from pandas._libs.tslibs.np_datetime cimport ( cmp_scalar, reverse_ops, td64_to_tdstruct, pandas_timedeltastruct) @@ -252,35 +254,54 @@ cpdef inline object precision_from_unit(object unit): int64_t m int p - if unit == 'Y': - m = 1000000000L * 31556952 + if unit is None: + m = 1L + p = 0 + return m, p + + unit, stride = _base_and_stride(unit) + + # Normalize old or lowercase codes to standard offset aliases. + if unit in ['min', 'm']: + unit = 'T' + elif unit == 'ns': + unit = 'N' + elif unit == 'ms': + unit = 'L' + elif unit == 'us': + unit = 'U' + + unit = unit.upper() + + if unit in ['Y', 'A']: + m = stride * 1000000000L * 31556952 p = 9 elif unit == 'M': - m = 1000000000L * 2629746 + m = stride * 1000000000L * 2629746 p = 9 elif unit == 'W': - m = 1000000000L * DAY_SECONDS * 7 + m = stride * 1000000000L * DAY_SECONDS * 7 p = 9 - elif unit == 'D' or unit == 'd': - m = 1000000000L * DAY_SECONDS + elif unit == 'D': + m = stride * 1000000000L * DAY_SECONDS p = 9 - elif unit == 'h': - m = 1000000000L * 3600 + elif unit == 'H': + m = stride * 1000000000L * 3600 p = 9 - elif unit == 'm': - m = 1000000000L * 60 + elif unit == 'T': + m = stride * 1000000000L * 60 p = 9 - elif unit == 's': - m = 1000000000L + elif unit == 'S': + m = stride * 1000000000L p = 9 - elif unit == 'ms': - m = 1000000L + elif unit == 'L': + m = stride * 1000000L p = 6 - elif unit == 'us': - m = 1000L + elif unit == 'U': + m = 
stride * 1000L p = 3 - elif unit == 'ns' or unit is None: - m = 1L + elif unit == 'N': + m = stride * 1L p = 0 else: raise ValueError("cannot cast unit {unit}".format(unit=unit)) @@ -300,7 +321,7 @@ cdef inline int64_t cast_from_unit(object ts, object unit) except? -1: if ts is None: return m - # cast the unit, multiply base/frace separately + # cast the unit, multiply base/frac separately # to avoid precision issues from float -> int base = ts frac = ts - base diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 20c4b9422459c..b706b39214c27 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -626,11 +626,26 @@ def to_datetime( - If True, require an exact format match. - If False, allow the format to match anywhere in the target string. - unit : string, default 'ns' - unit of the arg (D,s,ms,us,ns) denote the unit, which is an - integer or float number. This will be based off the origin. - Example, with unit='ms' and origin='unix' (the default), this - would calculate the number of milliseconds to the unix epoch start. + unit : string, default is 'N' + The unit of the arg. Uses a subset of the pandas offset aliases. + + - 'Y', 'A' for yearly (long term average of 365.2425 days) + - 'M' for monthly (long term average of 30.436875 days) + - 'W' for weekly + - 'D' for daily + - 'H' for hourly + - 'T', 'min' for minutely + - 'S' for seconds + - 'L', 'ms' for milliseconds + - 'U', 'us' for microseconds + - 'N' for nanoseconds + + This will be based off the origin. Example, with unit='L' and + origin='unix' (the default), this would calculate the number of + milliseconds to the unix epoch start. + + The offset alias can be prefixed with a stride. For example, results + would be equivalent between unit='7D' and unit='W'. 
infer_datetime_format : boolean, default False If True and no `format` is given, attempt to infer the format of the datetime strings, and if it can be inferred, switch to a faster From 87c9aeee1911bfa75fb17cda8bbc9be818bbd1c3 Mon Sep 17 00:00:00 2001 From: Tim Cera Date: Thu, 13 Jun 2019 17:31:19 -0400 Subject: [PATCH 5/5] FEAT: Continue adding stride to unit codes in to_datetime() The 'origin' in to_datetime can be any resolution. New tests for origin resolution and strided unit codes. Deprecated warning for to_datetime units: 'A', 'Y', 'M', 'W'. Edited docstring to reflect strided unit codes and origin resolution. Replaced re test with simple lstrip. Moved pattern to only other file where it is used. Added 'd' to represent daily, but default should be 'D'. Changed 'd' to 'D' in tests. --- pandas/_libs/tslibs/frequencies.pyx | 21 +---- pandas/_libs/tslibs/timedeltas.pyx | 66 ++++++------- pandas/core/tools/datetimes.py | 73 +++++++++------ pandas/tests/indexes/datetimes/test_tools.py | 92 +++++++++++++++++++ .../indexes/timedeltas/test_arithmetic.py | 4 +- .../indexes/timedeltas/test_construction.py | 2 +- .../indexes/timedeltas/test_timedelta.py | 2 +- pandas/tests/io/sas/test_sas7bdat.py | 4 +- .../tests/scalar/timedelta/test_arithmetic.py | 40 ++++---- .../scalar/timedelta/test_construction.py | 4 +- pandas/tests/scalar/timedelta/test_formats.py | 2 +- .../tests/scalar/timedelta/test_timedelta.py | 18 ++-- pandas/tests/series/test_analytics.py | 2 +- pandas/tests/series/test_dtypes.py | 2 +- pandas/tseries/frequencies.py | 7 +- 15 files changed, 220 insertions(+), 119 deletions(-) diff --git a/pandas/_libs/tslibs/frequencies.pyx b/pandas/_libs/tslibs/frequencies.pyx index f2dcd37b191ed..c03c525aac596 100644 --- a/pandas/_libs/tslibs/frequencies.pyx +++ b/pandas/_libs/tslibs/frequencies.pyx @@ -1,5 +1,3 @@ -import re - cimport numpy as cnp cnp.import_array() @@ -7,14 +5,6 @@ from pandas._libs.tslibs.util cimport is_integer_object from 
pandas._libs.tslibs.ccalendar import MONTH_NUMBERS -# ---------------------------------------------------------------------- -# Constants - -# hack to handle WOM-1MON -opattern = re.compile( - r'([+\-]?\d*|[+\-]?\d*\.\d*)\s*([A-Za-z]+([\-][\dA-Za-z\-]+)?)' -) - INVALID_FREQ_ERR_MSG = "Invalid frequency: {0}" # --------------------------------------------------------------------- @@ -190,20 +180,19 @@ cpdef _base_and_stride(freqstr): -------- _freq_and_stride('5Min') -> 'Min', 5 """ - groups = opattern.match(freqstr) + base = freqstr.lstrip('+-. 0123456789') + stride = freqstr[:freqstr.index(base)] - if not groups: + if not base: raise ValueError("Could not evaluate {freq}".format(freq=freqstr)) - stride = groups.group(1) - + # Possible for stride to be float at this point. Should it fail or floor? + # Right now it fails. if len(stride): stride = int(stride) else: stride = 1 - base = groups.group(2) - return base, stride diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index f1cfe8d6eb297..0faaf591305c6 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -24,8 +24,6 @@ from pandas._libs.tslibs.util cimport ( from pandas._libs.tslibs.c_timestamp cimport _Timestamp -from pandas._libs.tslibs.ccalendar import DAY_SECONDS - from pandas._libs.tslibs.frequencies import _base_and_stride from pandas._libs.tslibs.np_datetime cimport ( @@ -232,6 +230,7 @@ def array_to_timedelta64(object[:] values, unit='ns', errors='raise'): for i in range(n): result[i] = parse_timedelta_string(values[i]) except: + unit, stride = _base_and_stride(unit) unit = parse_timedelta_unit(unit) for i in range(n): try: @@ -261,48 +260,45 @@ cpdef inline object precision_from_unit(object unit): unit, stride = _base_and_stride(unit) - # Normalize old or lowercase codes to standard offset aliases. 
- if unit in ['min', 'm']: - unit = 'T' - elif unit == 'ns': - unit = 'N' - elif unit == 'ms': - unit = 'L' - elif unit == 'us': - unit = 'U' - - unit = unit.upper() + if unit in ['Y', 'A', 'M', 'W']: + warnings.warn("Y, A, M, and W units are deprecated and " + "will be removed in a future version.", + FutureWarning) - if unit in ['Y', 'A']: - m = stride * 1000000000L * 31556952 - p = 9 - elif unit == 'M': - m = stride * 1000000000L * 2629746 - p = 9 - elif unit == 'W': - m = stride * 1000000000L * DAY_SECONDS * 7 + # Don't know why previous implementation used the multiplication in the + # in-line comment instead of the value. Current approach saves an + # operation. + if unit in ['D', 'd']: + m = stride * 86400000000000L # 1000000000L * DAY_SECONDS p = 9 - elif unit == 'D': - m = stride * 1000000000L * DAY_SECONDS + elif unit in ['H', 'h']: + m = stride * 3600000000000L # 1000000000L * 3600 p = 9 - elif unit == 'H': - m = stride * 1000000000L * 3600 + elif unit in ['T', 'min', 'm']: + m = stride * 60000000000L # 1000000000L * 60 p = 9 - elif unit == 'T': - m = stride * 1000000000L * 60 - p = 9 - elif unit == 'S': + elif unit in ['S', 's']: m = stride * 1000000000L p = 9 - elif unit == 'L': + elif unit in ['L', 'ms']: m = stride * 1000000L p = 6 - elif unit == 'U': + elif unit in ['U', 'us']: m = stride * 1000L p = 3 - elif unit == 'N': + elif unit in ['N', 'ns']: m = stride * 1L p = 0 + # deprecated units at end because rarely evaluated + elif unit in ['Y', 'A']: + m = stride * 1000000000L * 31556952 + p = 9 + elif unit == 'M': + m = stride * 1000000000L * 2629746 + p = 9 + elif unit == 'W': + m = stride * 1000000000L * 86400 * 7 + p = 9 else: raise ValueError("cannot cast unit {unit}".format(unit=unit)) return m, p @@ -521,8 +517,9 @@ cdef inline timedelta_from_spec(object number, object frac, object unit): try: unit = ''.join(unit) + unit, stride = _base_and_stride(unit) if unit == 'M': - # To parse ISO 8601 string, 'M' should be treated as minute, + # To parse 
ISO 8601 string, 'm' should be treated as minute, # not month unit = 'm' unit = parse_timedelta_unit(unit) @@ -1271,6 +1268,9 @@ class Timedelta(_Timedelta): "[weeks, days, hours, minutes, seconds, " "milliseconds, microseconds, nanoseconds]") + if unit is not None: + unit, stride = _base_and_stride(unit) + if unit in {'Y', 'y', 'M'}: warnings.warn("M and Y units are deprecated and " "will be removed in a future version.", diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index b706b39214c27..4b0f36f811afa 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -7,6 +7,7 @@ from pandas._libs import tslib, tslibs from pandas._libs.tslibs import Timestamp, conversion, parsing +from pandas._libs.tslibs.frequencies import _base_and_stride from pandas._libs.tslibs.parsing import ( # noqa DateParseError, _format_is_iso, @@ -375,7 +376,6 @@ def _convert_listlike_datetimes( arg = ensure_object(arg) require_iso8601 = False - if infer_datetime_format and format is None: format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst) @@ -489,15 +489,18 @@ def _adjust_to_origin(arg, origin, unit): origin : 'julian' or Timestamp origin offset for the arg unit : string - passed unit from to_datetime, must be 'D' + passed unit from to_datetime, must be 'D' if origin is 'julian' Returns ------- ndarray or scalar of adjusted date(s) """ + from pandas import DatetimeIndex + if origin == "julian": original = arg j0 = Timestamp(0).to_julian_date() + unit, stride = _base_and_stride(unit) if unit != "D": raise ValueError("unit must be 'D' for origin='julian'") try: @@ -526,7 +529,8 @@ def _adjust_to_origin(arg, origin, unit): ) ) - # we are going to offset back to unix / epoch time + # test the origin to make sure within valid range and no time + # zone try: offset = Timestamp(origin) except tslibs.OutOfBoundsDatetime: @@ -541,18 +545,28 @@ def _adjust_to_origin(arg, origin, unit): if offset.tz is not None: raise 
ValueError("origin offset {} must be tz-naive".format(offset)) - offset -= Timestamp(0) - # convert the offset to the unit of the arg - # this should be lossless in terms of precision - offset = offset // tslibs.Timedelta(1, unit=unit) + unit, stride = _base_and_stride(unit) + + delta = tslibs.Timedelta(stride, unit=unit) - # scalars & ndarray-like can handle the addition + # scalars & ndarray-like can handle the multiplication and addition if is_list_like(arg) and not isinstance( arg, (ABCSeries, ABCIndexClass, np.ndarray) ): arg = np.asarray(arg) - arg = arg + offset + + if stride == 1 and (offset - offset.floor("D")) == tslibs.Timedelta(0): + arg = arg + (offset.value // delta.value) + else: + # convert any integer type to int64 to prevent overflow + if is_integer_dtype(arg): + arg = arg.astype("int64", copy=False) + try: + arg = DatetimeIndex((arg * delta.value) + offset.value) + except TypeError: + arg = Timestamp((arg * delta.value) + offset.value) + return arg @@ -627,25 +641,26 @@ def to_datetime( - If False, allow the format to match anywhere in the target string. unit : string, default is 'N' - The unit of the arg. Uses a subset of the pandas offset aliases. - - - 'Y', 'A' for yearly (long term average of 365.2425 days) - - 'M' for monthly (long term average of 30.436875 days) - - 'W' for weekly - - 'D' for daily - - 'H' for hourly - - 'T', 'min' for minutely - - 'S' for seconds - - 'L', 'ms' for milliseconds - - 'U', 'us' for microseconds - - 'N' for nanoseconds - - This will be based off the origin. Example, with unit='L' and - origin='unix' (the default), this would calculate the number of - milliseconds to the unix epoch start. - - The offset alias can be prefixed with a stride. For example, results - would be equivalent between unit='7D' and unit='W'. + The unit code for the value(s) in `arg`. Used when `arg` is + a numeric value or ordered collection of numeric values. 
+ The unit code is a subset of pandas offset aliases, ISO 8601 + codes, and legacy codes. + + - 'D', for daily + - 'H' or 'h' for hourly + - 'T', 'm', or 'min' for minutely + - 'S' or 's' for seconds + - 'L' or 'ms' for milliseconds + - 'U' or 'us' for microseconds + - 'N' or 'ns' for nanoseconds + + The resulting DatetimeIndex will be based off the `origin`. + For example, with unit='L' and origin='unix' (the default) then + the values in `arg` would represent the number of milliseconds + from the unix epoch start. + + The unit code can be prefixed with a stride. For example, + results would be equivalent between unit='24H' and unit='D'. infer_datetime_format : boolean, default False If True and no `format` is given, attempt to infer the format of the datetime strings, and if it can be inferred, switch to a faster @@ -899,7 +914,7 @@ def coerce(values): # we allow coercion to if errors allows values = to_numeric(values, errors=errors) - # prevent overflow in case of int8 or int16 + # convert any integer type to int64 to prevent overflow if is_integer_dtype(values): values = values.astype("int64", copy=False) return values diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index 10d422e8aa52c..a54489ad0fdee 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -1361,6 +1361,98 @@ def test_to_datetime_errors_ignore_utc_true(self): expected = DatetimeIndex(["1970-01-01 00:00:01"], tz="UTC") tm.assert_index_equal(result, expected) + @pytest.mark.parametrize( + "unitl,unitr", + [ + ("D", "24h"), + ("2D", "48H"), + ("3D", "72H"), + ("4D", "96H"), + ("7D", "168H"), + ("86400000000000N", "D"), + ("60T", "H"), + ("120m", "2H"), + ("60min", "H"), + ("3600S", "h"), + ("3600000ms", "H"), + ("3600000L", "H"), + ("3600000000U", "H"), + ("60s", "m"), + ("60S", "T"), + ("60S", "min"), + ("1000ms", "S"), + ("1000L", "S"), + ("1000000U", "S"), + ("1000000000N", "S"), + ], 
+ ) + def test_to_datetime_stride(self, unitl, unitr): + result = pd.to_datetime([1, 2, 3, 5], unit=unitl) + expected = pd.to_datetime([1, 2, 3, 5], unit=unitr) + tm.assert_index_equal(result, expected) + + result = pd.to_datetime(2, unit=unitl) + expected = pd.to_datetime(2, unit=unitr) + assert result == expected + + # Can't use 'm' for minutes and 'M' for months in the following test + # since tested against pd.date_range which sees both 'm' and 'M' as + # months. + @pytest.mark.parametrize( + "unit,epoch", + [ + ("D", "1980-01-02"), + ("D", "2018-05-18"), + ("D", "2018-05-18T11"), + ("D", "2018-05-18T11:04"), + ("D", "2018-05-18T11:04:52"), + ("2D", "1970-01-01"), + ("2D", "1970-01-01T21:12:43"), + ("2D", "2019-05-03"), + ("2D", "2019-05-03T12:11"), + ("3D", "1970-01-01"), + ("3D", "2019-05-03T14"), + ("4D", "1970-05-03"), + ("4D", "2019-05-03T11"), + ("5D", "2019-05-03T11"), + ("6D", "2019-05-03T11"), + ("7D", "2019-05-03T11"), + ("14D", "2019-05-03T11"), + ("H", "2018-05-18"), + ("H", "2018-05-18T11"), + ("H", "2018-05-18T11:04"), + ("H", "2018-05-18T11:04:52"), + ("12h", "1990-05-03T12:00:00"), + ("24H", "1980-12-31"), + ("48h", "1980-12-31"), + ("96h", "1980-12-31"), + ("2H", "2019-12-31T11:59"), + ("24h", "2001-08-15"), + ("5T", "2001-08-15"), + ("5min", "2001-08-15"), + ("10T", "2001-08-15"), + ("5S", "1970-01-01T01:10"), + ("60S", "1970-01-01T01:10:12"), + ("5T", "1980-12-31"), + ("1000T", "1980-12-31"), + ("100N", "1980-12-31"), + ("N", "1980-12-31"), + ], + ) + def test_to_datetime_stride_epoch(self, unit, epoch): + result = pd.to_datetime(list(range(100)), unit=unit, origin=epoch) + expected = pd.date_range(start=epoch, freq=unit, periods=100) + tm.assert_index_equal(result, expected) + + result = pd.to_datetime(2, unit=unit, origin=epoch) + expected = pd.date_range(start=epoch, freq=unit, periods=100)[2] + assert result == expected + + @pytest.mark.parametrize("unit", ["Y", "A", "M", "W"]) + def test_to_datetime_unit_code_deprecated(self, unit): + 
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + _ = pd.to_datetime(list(range(100)), unit=unit, origin="unix") + class TestToDatetimeMisc: def test_to_datetime_barely_out_of_bounds(self): diff --git a/pandas/tests/indexes/timedeltas/test_arithmetic.py b/pandas/tests/indexes/timedeltas/test_arithmetic.py index 4544657f79af7..ff13208e4cfb1 100644 --- a/pandas/tests/indexes/timedeltas/test_arithmetic.py +++ b/pandas/tests/indexes/timedeltas/test_arithmetic.py @@ -61,7 +61,7 @@ def test_tdi_shift_minutes(self): def test_tdi_shift_int(self): # GH#8083 - trange = pd.to_timedelta(range(5), unit="d") + pd.offsets.Hour(1) + trange = pd.to_timedelta(range(5), unit="D") + pd.offsets.Hour(1) result = trange.shift(1) expected = TimedeltaIndex( [ @@ -77,7 +77,7 @@ def test_tdi_shift_int(self): def test_tdi_shift_nonstandard_freq(self): # GH#8083 - trange = pd.to_timedelta(range(5), unit="d") + pd.offsets.Hour(1) + trange = pd.to_timedelta(range(5), unit="D") + pd.offsets.Hour(1) result = trange.shift(3, freq="2D 1s") expected = TimedeltaIndex( [ diff --git a/pandas/tests/indexes/timedeltas/test_construction.py b/pandas/tests/indexes/timedeltas/test_construction.py index 2e00d558958e1..b76bbe374af4d 100644 --- a/pandas/tests/indexes/timedeltas/test_construction.py +++ b/pandas/tests/indexes/timedeltas/test_construction.py @@ -182,7 +182,7 @@ def test_constructor_coverage(self): # NumPy string array strings = np.array(["1 days", "2 days", "3 days"]) result = TimedeltaIndex(strings) - expected = to_timedelta([1, 2, 3], unit="d") + expected = to_timedelta([1, 2, 3], unit="D") tm.assert_index_equal(result, expected) from_ints = TimedeltaIndex(expected.asi8) diff --git a/pandas/tests/indexes/timedeltas/test_timedelta.py b/pandas/tests/indexes/timedeltas/test_timedelta.py index e790a913fcac2..9a7a70d99957d 100644 --- a/pandas/tests/indexes/timedeltas/test_timedelta.py +++ b/pandas/tests/indexes/timedeltas/test_timedelta.py @@ -35,7 +35,7 @@ def 
setup_method(self, method): self.setup_indices() def create_index(self): - return pd.to_timedelta(range(5), unit="d") + pd.offsets.Hour(1) + return pd.to_timedelta(range(5), unit="D") + pd.offsets.Hour(1) def test_numeric_compat(self): # Dummy method to override super's version; this test is now done diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py index e37561c865c7a..7b9231ce1809a 100644 --- a/pandas/tests/io/sas/test_sas7bdat.py +++ b/pandas/tests/io/sas/test_sas7bdat.py @@ -23,9 +23,9 @@ def setup_method(self, datapath): fname = os.path.join(self.dirpath, "test_sas7bdat_{j}.csv".format(j=j)) df = pd.read_csv(fname) epoch = pd.datetime(1960, 1, 1) - t1 = pd.to_timedelta(df["Column4"], unit="d") + t1 = pd.to_timedelta(df["Column4"], unit="D") df["Column4"] = epoch + t1 - t2 = pd.to_timedelta(df["Column12"], unit="d") + t2 = pd.to_timedelta(df["Column12"], unit="D") df["Column12"] = epoch + t2 for k in range(df.shape[1]): col = df.iloc[:, k] diff --git a/pandas/tests/scalar/timedelta/test_arithmetic.py b/pandas/tests/scalar/timedelta/test_arithmetic.py index 52f32d41a02ff..82982c9d68268 100644 --- a/pandas/tests/scalar/timedelta/test_arithmetic.py +++ b/pandas/tests/scalar/timedelta/test_arithmetic.py @@ -68,7 +68,7 @@ def test_td_add_sub_one_day_ten_seconds(self, one_day_ten_secs): @pytest.mark.parametrize("op", [operator.add, ops.radd]) def test_td_add_datetimelike_scalar(self, op): # GH#19738 - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = op(td, datetime(2016, 1, 1)) if op is operator.add: @@ -90,7 +90,7 @@ def test_td_add_datetimelike_scalar(self, op): @pytest.mark.parametrize("op", [operator.add, ops.radd]) def test_td_add_td(self, op): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = op(td, Timedelta(days=10)) assert isinstance(result, Timedelta) @@ -98,35 +98,35 @@ def test_td_add_td(self, op): @pytest.mark.parametrize("op", [operator.add, ops.radd]) def 
test_td_add_pytimedelta(self, op): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = op(td, timedelta(days=9)) assert isinstance(result, Timedelta) assert result == Timedelta(days=19) @pytest.mark.parametrize("op", [operator.add, ops.radd]) def test_td_add_timedelta64(self, op): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = op(td, np.timedelta64(-4, "D")) assert isinstance(result, Timedelta) assert result == Timedelta(days=6) @pytest.mark.parametrize("op", [operator.add, ops.radd]) def test_td_add_offset(self, op): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = op(td, pd.offsets.Hour(6)) assert isinstance(result, Timedelta) assert result == Timedelta(days=10, hours=6) def test_td_sub_td(self): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") expected = Timedelta(0, unit="ns") result = td - td assert isinstance(result, Timedelta) assert result == expected def test_td_sub_pytimedelta(self): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") expected = Timedelta(0, unit="ns") result = td - td.to_pytimedelta() @@ -138,7 +138,7 @@ def test_td_sub_pytimedelta(self): assert result == expected def test_td_sub_timedelta64(self): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") expected = Timedelta(0, unit="ns") result = td - td.to_timedelta64() @@ -151,12 +151,12 @@ def test_td_sub_timedelta64(self): def test_td_sub_nat(self): # In this context pd.NaT is treated as timedelta-like - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = td - NaT assert result is NaT def test_td_sub_td64_nat(self): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") td_nat = np.timedelta64("NaT") result = td - td_nat @@ -166,13 +166,13 @@ def test_td_sub_td64_nat(self): assert result is NaT def test_td_sub_offset(self): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = td - pd.offsets.Hour(1) assert isinstance(result, Timedelta) assert result 
== Timedelta(239, unit="h") def test_td_add_sub_numeric_raises(self): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") for other in [2, 2.0, np.int64(2), np.float64(2)]: with pytest.raises(TypeError): td + other @@ -184,7 +184,7 @@ def test_td_add_sub_numeric_raises(self): other - td def test_td_rsub_nat(self): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = NaT - td assert result is NaT @@ -192,7 +192,7 @@ def test_td_rsub_nat(self): assert result is NaT def test_td_rsub_offset(self): - result = pd.offsets.Hour(1) - Timedelta(10, unit="d") + result = pd.offsets.Hour(1) - Timedelta(10, unit="D") assert isinstance(result, Timedelta) assert result == Timedelta(-239, unit="h") @@ -263,7 +263,7 @@ class TestTimedeltaMultiplicationDivision: @pytest.mark.parametrize("op", [operator.mul, ops.rmul]) def test_td_mul_nat(self, op, td_nat): # GH#19819 - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") with pytest.raises(TypeError): op(td, td_nat) @@ -271,7 +271,7 @@ def test_td_mul_nat(self, op, td_nat): @pytest.mark.parametrize("op", [operator.mul, ops.rmul]) def test_td_mul_nan(self, op, nan): # np.float64('NaN') has a 'dtype' attr, avoid treating as array - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = op(td, nan) assert result is NaT @@ -304,7 +304,7 @@ def test_td_mul_scalar(self, op): def test_td_div_timedeltalike_scalar(self): # GH#19738 - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = td / pd.offsets.Hour(1) assert result == 240 @@ -316,7 +316,7 @@ def test_td_div_timedeltalike_scalar(self): def test_td_div_numeric_scalar(self): # GH#19738 - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = td / 2 assert isinstance(result, Timedelta) @@ -329,7 +329,7 @@ def test_td_div_numeric_scalar(self): @pytest.mark.parametrize("nan", [np.nan, np.float64("NaN"), float("nan")]) def test_td_div_nan(self, nan): # np.float64('NaN') has a 'dtype' attr, avoid treating as array 
- td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = td / nan assert result is NaT @@ -341,7 +341,7 @@ def test_td_div_nan(self, nan): def test_td_rdiv_timedeltalike_scalar(self): # GH#19738 - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") result = pd.offsets.Hour(1) / td assert result == 1 / 240.0 diff --git a/pandas/tests/scalar/timedelta/test_construction.py b/pandas/tests/scalar/timedelta/test_construction.py index ae1e84576c092..e373e83f0e93d 100644 --- a/pandas/tests/scalar/timedelta/test_construction.py +++ b/pandas/tests/scalar/timedelta/test_construction.py @@ -8,8 +8,8 @@ def test_construction(): expected = np.timedelta64(10, "D").astype("m8[ns]").view("i8") - assert Timedelta(10, unit="d").value == expected - assert Timedelta(10.0, unit="d").value == expected + assert Timedelta(10, unit="D").value == expected + assert Timedelta(10.0, unit="D").value == expected assert Timedelta("10 days").value == expected assert Timedelta(days=10).value == expected assert Timedelta(days=10.0).value == expected diff --git a/pandas/tests/scalar/timedelta/test_formats.py b/pandas/tests/scalar/timedelta/test_formats.py index 753186ee4b738..0130ff977390d 100644 --- a/pandas/tests/scalar/timedelta/test_formats.py +++ b/pandas/tests/scalar/timedelta/test_formats.py @@ -6,7 +6,7 @@ @pytest.mark.parametrize( "td, expected_repr", [ - (Timedelta(10, unit="d"), "Timedelta('10 days 00:00:00')"), + (Timedelta(10, unit="D"), "Timedelta('10 days 00:00:00')"), (Timedelta(10, unit="s"), "Timedelta('0 days 00:00:10')"), (Timedelta(10, unit="ms"), "Timedelta('0 days 00:00:00.010000')"), (Timedelta(-10, unit="ms"), "Timedelta('-1 days +23:59:59.990000')"), diff --git a/pandas/tests/scalar/timedelta/test_timedelta.py b/pandas/tests/scalar/timedelta/test_timedelta.py index e4980be49d35f..e0a0fc0225085 100644 --- a/pandas/tests/scalar/timedelta/test_timedelta.py +++ b/pandas/tests/scalar/timedelta/test_timedelta.py @@ -62,17 +62,17 @@ class Other: assert 
td.__floordiv__(other) is NotImplemented def test_unary_ops(self): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") # __neg__, __pos__ - assert -td == Timedelta(-10, unit="d") - assert -td == Timedelta("-10d") - assert +td == Timedelta(10, unit="d") + assert -td == Timedelta(-10, unit="D") + assert -td == Timedelta("-10D") + assert +td == Timedelta(10, unit="D") # __abs__, __abs__(__neg__) assert abs(td) == td assert abs(-td) == td - assert abs(-td) == Timedelta("10d") + assert abs(-td) == Timedelta("10D") class TestTimedeltaComparison: @@ -198,7 +198,7 @@ def test_total_seconds_scalar(self): def test_conversion(self): - for td in [Timedelta(10, unit="d"), Timedelta("1 days, 10:11:12.012345")]: + for td in [Timedelta(10, unit="D"), Timedelta("1 days, 10:11:12.012345")]: pydt = td.to_pytimedelta() assert td == Timedelta(pydt) assert td == pydt @@ -454,7 +454,7 @@ def test_numeric_conversions(self): assert Timedelta(10, unit="us") == np.timedelta64(10, "us") assert Timedelta(10, unit="ms") == np.timedelta64(10, "ms") assert Timedelta(10, unit="s") == np.timedelta64(10, "s") - assert Timedelta(10, unit="d") == np.timedelta64(10, "D") + assert Timedelta(10, unit="D") == np.timedelta64(10, "D") def test_timedelta_conversions(self): assert Timedelta(timedelta(seconds=1)) == np.timedelta64(1, "s").astype( @@ -574,7 +574,7 @@ def test_round(self): def test_contains(self): # Checking for any NaT-like objects # GH 13603 - td = to_timedelta(range(5), unit="d") + pd.offsets.Hour(1) + td = to_timedelta(range(5), unit="D") + pd.offsets.Hour(1) for v in [pd.NaT, None, float("nan"), np.nan]: assert not (v in td) @@ -584,7 +584,7 @@ def test_contains(self): def test_identity(self): - td = Timedelta(10, unit="d") + td = Timedelta(10, unit="D") assert isinstance(td, Timedelta) assert isinstance(td, timedelta) diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 32d32a5d14fb2..d596a0576fe09 100644 --- 
a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -780,7 +780,7 @@ def test_isin_with_i8(self): assert_series_equal(result, expected) # timedelta64[ns] - s = Series(pd.to_timedelta(range(5), unit="d")) + s = Series(pd.to_timedelta(range(5), unit="D")) result = s.isin(s[0:2]) assert_series_equal(result, expected) diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index 9be79bf93ece7..f313dab4c8bd5 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -165,7 +165,7 @@ def test_astype_str_cast(self): expected = Series([str("2010-01-04 00:00:00-05:00")]) tm.assert_series_equal(s, expected) - td = Series([Timedelta(1, unit="d")]) + td = Series([Timedelta(1, unit="D")]) s = td.astype(str) expected = Series([str("1 days 00:00:00.000000000")]) diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index dfe91b514bbe1..92ce4e654c38b 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -144,8 +144,13 @@ def to_offset(freq): else: delta = None stride_sign = None + + # hack to handle WOM-1MON + opattern = re.compile( + r"([+\-]?\d*|[+\-]?\d*\.\d*)\s*([A-Za-z]+([\-][\dA-Za-z\-]+)?)" + ) try: - splitted = re.split(libfreqs.opattern, freq) + splitted = re.split(opattern, freq) if splitted[-1] != "" and not splitted[-1].isspace(): # the last element must be blank raise ValueError("last element must be blank")