pandas-dev · timcera · Jun 3, 2019 · Jun 3, 2019 · Jun 13, 2019 · Jul 28, 2019
diff --git a/pandas/_libs/tslibs/frequencies.pyx b/pandas/_libs/tslibs/frequencies.pyx
@@ -1,20 +1,10 @@
-import re
-
 cimport numpy as cnp
 cnp.import_array()
 
 from pandas._libs.tslibs.util cimport is_integer_object
 
 from pandas._libs.tslibs.ccalendar import MONTH_NUMBERS
 
-# ----------------------------------------------------------------------
-# Constants
-
-# hack to handle WOM-1MON
-opattern = re.compile(
-    r'([+\-]?\d*|[+\-]?\d*\.\d*)\s*([A-Za-z]+([\-][\dA-Za-z\-]+)?)'
-)
-
 INVALID_FREQ_ERR_MSG = "Invalid frequency: {0}"
 
 # ---------------------------------------------------------------------
@@ -194,20 +184,19 @@ cpdef _base_and_stride(str freqstr):
     --------
     _freq_and_stride('5Min') -> 'Min', 5
     """
-    groups = opattern.match(freqstr)
+    base = freqstr.lstrip('+-. 0123456789')
+    stride = freqstr[:freqstr.index(base)]
 
-    if not groups:
+    if not base:
         raise ValueError("Could not evaluate {freq}".format(freq=freqstr))
 
-    stride = groups.group(1)
-
+    # `stride` may be a non-integer string here (e.g. '1.5'); int() below then
+    # raises ValueError rather than flooring.  TODO: decide fail vs. floor.
     if len(stride):
         stride = int(stride)
     else:
         stride = 1
 
-    base = groups.group(2)
-
     return base, stride
 
 

diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx
@@ -24,7 +24,7 @@ from pandas._libs.tslibs.util cimport (
 
 from pandas._libs.tslibs.c_timestamp cimport _Timestamp
 
-from pandas._libs.tslibs.ccalendar import DAY_SECONDS
+from pandas._libs.tslibs.frequencies import _base_and_stride
 
 from pandas._libs.tslibs.np_datetime cimport (
     cmp_scalar, reverse_ops, td64_to_tdstruct, pandas_timedeltastruct)
@@ -235,7 +235,7 @@ def array_to_timedelta64(object[:] values, unit='ns', errors='raise'):
             else:
                 result[i] = parse_timedelta_string(values[i])
     except (TypeError, ValueError):
-        unit = parse_timedelta_unit(unit)
+        unit, stride = _base_and_stride(unit)
         for i in range(n):
             try:
                 result[i] = convert_to_timedelta64(values[i], unit)
@@ -257,36 +257,52 @@ cpdef inline object precision_from_unit(object unit):
         int64_t m
         int p
 
-    if unit == 'Y':
-        m = 1000000000L * 31556952
-        p = 9
-    elif unit == 'M':
-        m = 1000000000L * 2629746
-        p = 9
-    elif unit == 'W':
-        m = 1000000000L * DAY_SECONDS * 7
-        p = 9
-    elif unit == 'D' or unit == 'd':
-        m = 1000000000L * DAY_SECONDS
+    if unit is None:
+        m = 1L
+        p = 0
+        return m, p
+
+    unit, stride = _base_and_stride(unit)
+
+    if unit in ['Y', 'A', 'M', 'W']:
+        warnings.warn("Y, A, M, and W units are deprecated and "
+                      "will be removed in a future version.",
+                      FutureWarning)
+
+    # Multipliers for D/H/T are written as precomputed nanosecond literals
+    # (meant to save a multiplication); the in-line comments show the
+    # product form used by the previous implementation.
+    if unit in ['D', 'd']:
+        m = stride * 86400000000000L # 1000000000L * DAY_SECONDS
         p = 9
-    elif unit == 'h':
-        m = 1000000000L * 3600
+    elif unit in ['H', 'h']:
+        m = stride * 3600000000000L # 1000000000L * 3600
         p = 9
-    elif unit == 'm':
-        m = 1000000000L * 60
+    elif unit in ['T', 'min', 'm']:
+        m = stride * 60000000000L # 1000000000L * 60
         p = 9
-    elif unit == 's':
-        m = 1000000000L
+    elif unit in ['S', 's']:
+        m = stride * 1000000000L
         p = 9
-    elif unit == 'ms':
-        m = 1000000L
+    elif unit in ['L', 'ms']:
+        m = stride * 1000000L
         p = 6
-    elif unit == 'us':
-        m = 1000L
+    elif unit in ['U', 'us']:
+        m = stride * 1000L
         p = 3
-    elif unit == 'ns' or unit is None:
-        m = 1L
+    elif unit in ['N', 'ns']:
+        m = stride * 1L
         p = 0
+    # deprecated units at end because rarely evaluated
+    elif unit in ['Y', 'A']:
+        m = stride * 1000000000L * 31556952
+        p = 9
+    elif unit == 'M':
+        m = stride * 1000000000L * 2629746
+        p = 9
+    elif unit == 'W':
+        m = stride * 1000000000L * 86400 * 7
+        p = 9
     else:
         raise ValueError("cannot cast unit {unit}".format(unit=unit))
     return m, p
@@ -305,7 +321,7 @@ cdef inline int64_t cast_from_unit(object ts, object unit) except? -1:
     if ts is None:
         return m
 
-    # cast the unit, multiply base/frace separately
+    # cast the unit, multiply base/frac separately
     # to avoid precision issues from float -> int
     base = <int64_t>ts
     frac = ts - base
@@ -505,8 +521,9 @@ cdef inline timedelta_from_spec(object number, object frac, object unit):
 
     try:
         unit = ''.join(unit)
+        unit, stride = _base_and_stride(unit)
         if unit == 'M':
-            # To parse ISO 8601 string, 'M' should be treated as minute,
+            # To parse an ISO 8601 string, 'M' should be treated as minute,
             # not month
             unit = 'm'
         unit = parse_timedelta_unit(unit)
@@ -1255,6 +1272,9 @@ class Timedelta(_Timedelta):
                                  "[weeks, days, hours, minutes, seconds, "
                                  "milliseconds, microseconds, nanoseconds]")
 
+        if unit is not None:
+            unit, stride = _base_and_stride(unit)
+
         if unit in {'Y', 'y', 'M'}:
             warnings.warn("M and Y units are deprecated and "
                           "will be removed in a future version.",

diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
@@ -7,6 +7,7 @@
 
 from pandas._libs import tslib, tslibs
 from pandas._libs.tslibs import Timestamp, conversion, parsing
+from pandas._libs.tslibs.frequencies import _base_and_stride
 from pandas._libs.tslibs.parsing import (  # noqa
     DateParseError,
     _format_is_iso,
@@ -378,7 +379,6 @@ def _convert_listlike_datetimes(
 
     arg = ensure_object(arg)
     require_iso8601 = False
-
     if infer_datetime_format and format is None:
         format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst)
 
@@ -490,15 +490,18 @@ def _adjust_to_origin(arg, origin, unit):
     origin : 'julian' or Timestamp
         origin offset for the arg
     unit : string
-        passed unit from to_datetime, must be 'D'
+        passed unit from to_datetime, must be 'D' if origin is 'julian'
 
     Returns
     -------
     ndarray or scalar of adjusted date(s)
     """
+    from pandas import DatetimeIndex
+
     if origin == "julian":
         original = arg
         j0 = Timestamp(0).to_julian_date()
+        unit, stride = _base_and_stride(unit)
         if unit != "D":
             raise ValueError("unit must be 'D' for origin='julian'")
         try:
@@ -527,7 +530,8 @@ def _adjust_to_origin(arg, origin, unit):
                 )
             )
 
-        # we are going to offset back to unix / epoch time
+        # validate that the origin is within the valid Timestamp range
+        # and is tz-naive
         try:
             offset = Timestamp(origin)
         except tslibs.OutOfBoundsDatetime:
@@ -542,18 +546,28 @@ def _adjust_to_origin(arg, origin, unit):
 
         if offset.tz is not None:
             raise ValueError("origin offset {} must be tz-naive".format(offset))
-        offset -= Timestamp(0)
 
-        # convert the offset to the unit of the arg
-        # this should be lossless in terms of precision
-        offset = offset // tslibs.Timedelta(1, unit=unit)
+        unit, stride = _base_and_stride(unit)
+
+        delta = tslibs.Timedelta(stride, unit=unit)
 
-        # scalars & ndarray-like can handle the addition
+        # scalars & ndarray-like can handle the multiplication and addition
         if is_list_like(arg) and not isinstance(
             arg, (ABCSeries, ABCIndexClass, np.ndarray)
         ):
             arg = np.asarray(arg)
-        arg = arg + offset
+
+        if stride == 1 and (offset - offset.floor("D")) == tslibs.Timedelta(0):
+            arg = arg + (offset.value // delta.value)
+        else:
+            # convert any integer type to int64 to prevent overflow
+            if is_integer_dtype(arg):
+                arg = arg.astype("int64", copy=False)
+            try:
+                arg = DatetimeIndex((arg * delta.value) + offset.value)
+            except TypeError:
+                arg = Timestamp((arg * delta.value) + offset.value)
+
     return arg
 
 
@@ -627,11 +641,27 @@ def to_datetime(
         - If True, require an exact format match.
         - If False, allow the format to match anywhere in the target string.
 
-    unit : string, default 'ns'
-        unit of the arg (D,s,ms,us,ns) denote the unit, which is an
-        integer or float number. This will be based off the origin.
-        Example, with unit='ms' and origin='unix' (the default), this
-        would calculate the number of milliseconds to the unix epoch start.
+    unit : string, default is 'N'
+        The unit code for the value(s) in `arg`.  Used when `arg` is
+        a numeric value or ordered collection of numeric values.
+        The unit code is a subset of pandas offset aliases, ISO 8601
+        codes, and legacy codes.
+
+        - 'D' for days
+        - 'H' or 'h' for hours
+        - 'T', 'm', or 'min' for minutes
+        - 'S' or 's' for seconds
+        - 'L' or 'ms' for milliseconds
+        - 'U' or 'us' for microseconds
+        - 'N' or 'ns' for nanoseconds
+
+        The resulting values are computed relative to `origin`.
+        For example, with unit='L' and origin='unix' (the default),
+        the values in `arg` represent the number of milliseconds
+        since the start of the Unix epoch.
+
+        The unit code can be prefixed with an integer stride. For example,
+        unit='24H' gives the same result as unit='D'.
     infer_datetime_format : boolean, default False
         If True and no `format` is given, attempt to infer the format of the
         datetime strings, and if it can be inferred, switch to a faster
@@ -885,7 +915,7 @@ def coerce(values):
         # we allow coercion to if errors allows
         values = to_numeric(values, errors=errors)
 
-        # prevent overflow in case of int8 or int16
+        # convert any integer type to int64 to prevent overflow
         if is_integer_dtype(values):
             values = values.astype("int64", copy=False)
         return values

diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py
@@ -1358,6 +1358,98 @@ def test_to_datetime_errors_ignore_utc_true(self):
         expected = DatetimeIndex(["1970-01-01 00:00:01"], tz="UTC")
         tm.assert_index_equal(result, expected)
 
+    @pytest.mark.parametrize(
+        "unitl,unitr",
+        [
+            ("D", "24h"),
+            ("2D", "48H"),
+            ("3D", "72H"),
+            ("4D", "96H"),
+            ("7D", "168H"),
+            ("86400000000000N", "D"),
+            ("60T", "H"),
+            ("120m", "2H"),
+            ("60min", "H"),
+            ("3600S", "h"),
+            ("3600000ms", "H"),
+            ("3600000L", "H"),
+            ("3600000000U", "H"),
+            ("60s", "m"),
+            ("60S", "T"),
+            ("60S", "min"),
+            ("1000ms", "S"),
+            ("1000L", "S"),
+            ("1000000U", "S"),
+            ("1000000000N", "S"),
+        ],
+    )
+    def test_to_datetime_stride(self, unitl, unitr):
+        result = pd.to_datetime([1, 2, 3, 5], unit=unitl)
+        expected = pd.to_datetime([1, 2, 3, 5], unit=unitr)
+        tm.assert_index_equal(result, expected)
+
+        result = pd.to_datetime(2, unit=unitl)
+        expected = pd.to_datetime(2, unit=unitr)
+        assert result == expected
+
+    # The following test cannot use 'm' (minutes) or 'M' (months) because
+    # the expected values come from pd.date_range, which interprets both
+    # 'm' and 'M' as months.
+    @pytest.mark.parametrize(
+        "unit,epoch",
+        [
+            ("D", "1980-01-02"),
+            ("D", "2018-05-18"),
+            ("D", "2018-05-18T11"),
+            ("D", "2018-05-18T11:04"),
+            ("D", "2018-05-18T11:04:52"),
+            ("2D", "1970-01-01"),
+            ("2D", "1970-01-01T21:12:43"),
+            ("2D", "2019-05-03"),
+            ("2D", "2019-05-03T12:11"),
+            ("3D", "1970-01-01"),
+            ("3D", "2019-05-03T14"),
+            ("4D", "1970-05-03"),
+            ("4D", "2019-05-03T11"),
+            ("5D", "2019-05-03T11"),
+            ("6D", "2019-05-03T11"),
+            ("7D", "2019-05-03T11"),
+            ("14D", "2019-05-03T11"),
+            ("H", "2018-05-18"),
+            ("H", "2018-05-18T11"),
+            ("H", "2018-05-18T11:04"),
+            ("H", "2018-05-18T11:04:52"),
+            ("12h", "1990-05-03T12:00:00"),
+            ("24H", "1980-12-31"),
+            ("48h", "1980-12-31"),
+            ("96h", "1980-12-31"),
+            ("2H", "2019-12-31T11:59"),
+            ("24h", "2001-08-15"),
+            ("5T", "2001-08-15"),
+            ("5min", "2001-08-15"),
+            ("10T", "2001-08-15"),
+            ("5S", "1970-01-01T01:10"),
+            ("60S", "1970-01-01T01:10:12"),
+            ("5T", "1980-12-31"),
+            ("1000T", "1980-12-31"),
+            ("100N", "1980-12-31"),
+            ("N", "1980-12-31"),
+        ],
+    )
+    def test_to_datetime_stride_epoch(self, unit, epoch):
+        result = pd.to_datetime(list(range(100)), unit=unit, origin=epoch)
+        expected = pd.date_range(start=epoch, freq=unit, periods=100)
+        tm.assert_index_equal(result, expected)
+
+        result = pd.to_datetime(2, unit=unit, origin=epoch)
+        expected = pd.date_range(start=epoch, freq=unit, periods=100)[2]
+        assert result == expected
+
+    @pytest.mark.parametrize("unit", ["Y", "A", "M", "W"])
+    def test_to_datetime_unit_code_deprecated(self, unit):
+        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+            _ = pd.to_datetime(list(range(100)), unit=unit, origin="unix")
+
 
 class TestToDatetimeMisc:
     def test_to_datetime_barely_out_of_bounds(self):