diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 46464a3ecf420..cef34a3af0136 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -780,6 +780,7 @@ Performance improvements - Performance improvement in :func:`merge` when not merging on the index - the new index will now be :class:`RangeIndex` instead of :class:`Int64Index` (:issue:`49478`) - Performance improvement in :meth:`DataFrame.to_dict` and :meth:`Series.to_dict` when using any non-object dtypes (:issue:`46470`) - Performance improvement in :func:`read_html` when there are multiple tables (:issue:`49929`) +- Performance improvement in :func:`to_datetime` when using ``'%Y%m%d'`` format (:issue:`17410`) .. --------------------------------------------------------------------------- .. _whatsnew_200.bug_fixes: @@ -809,6 +810,13 @@ Datetimelike - Bug in :meth:`Timestamp.round` when the ``freq`` argument has zero-duration (e.g. "0ns") returning incorrect results instead of raising (:issue:`49737`) - Bug in :func:`to_datetime` was not raising ``ValueError`` when invalid format was passed and ``errors`` was ``'ignore'`` or ``'coerce'`` (:issue:`50266`) - Bug in :class:`DateOffset` was throwing ``TypeError`` when constructing with milliseconds and another super-daily argument (:issue:`49897`) +- Bug in :func:`to_datetime` was not raising ``ValueError`` when parsing string with decimal date with format ``'%Y%m%d'`` (:issue:`50051`) +- Bug in :func:`to_datetime` was not converting ``None`` to ``NaT`` when parsing mixed-offset date strings with ISO8601 format (:issue:`50071`) +- Bug in :func:`to_datetime` was not returning input when parsing out-of-bounds date string with ``errors='ignore'`` and ``format='%Y%m%d'`` (:issue:`14487`) +- Bug in :func:`to_datetime` was converting timezone-naive ``datetime.datetime`` to timezone-aware when parsing with timezone-aware strings, ISO8601 format, and ``utc=False`` (:issue:`50254`) +- Bug in :func:`to_datetime` was throwing ``ValueError`` when parsing dates with ISO8601 format where some values were not zero-padded (:issue:`21422`) +- Bug in :func:`to_datetime` was giving incorrect results when using ``format='%Y%m%d'`` and ``errors='ignore'`` (:issue:`26493`) +- Bug in :func:`to_datetime` was failing to parse date strings ``'today'`` and ``'now'`` if ``format`` was not ISO8601 (:issue:`50359`) - Timedelta diff --git a/pandas/_libs/tslib.pyi b/pandas/_libs/tslib.pyi index ab94c4d59c5fc..9819b5173db56 100644 --- a/pandas/_libs/tslib.pyi +++ b/pandas/_libs/tslib.pyi @@ -23,9 +23,6 @@ def array_to_datetime( dayfirst: bool = ..., yearfirst: bool = ..., utc: bool = ..., - require_iso8601: bool = ..., - format: str | None = ..., - exact: bool = ..., ) -> tuple[np.ndarray, tzinfo | None]: ... # returned ndarray may be object dtype or datetime64[ns] diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index feae7fc4344d9..acee56a6598c2 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -38,6 +38,7 @@ from pandas._libs.tslibs.np_datetime cimport ( pydatetime_to_dt64, string_to_dts, ) +from pandas._libs.tslibs.strptime cimport parse_today_now from pandas._libs.util cimport ( is_datetime64_object, is_float_object, @@ -443,9 +444,6 @@ cpdef array_to_datetime( bint dayfirst=False, bint yearfirst=False, bint utc=False, - bint require_iso8601=False, - format: str | None=None, - bint exact=True, ): """ Converts a 1D array of date-like values to a numpy array of either: @@ -472,8 +470,6 @@ cpdef array_to_datetime( yearfirst parsing behavior when encountering datetime strings utc : bool, default False indicator whether the dates should be UTC - require_iso8601 : bool, default False - indicator whether the datetime string should be iso8601 Returns ------- @@ -544,16 +540,6 @@ cpdef array_to_datetime( iresult[i] = get_datetime64_nanos(val, NPY_FR_ns) elif is_integer_object(val) or is_float_object(val): - if require_iso8601: - if is_coerce: - iresult[i] = NPY_NAT - continue - elif is_raise: - raise ValueError( - f"time data \"{val}\" doesn't " - f"match format \"{format}\", at position {i}" - ) - return values, tz_out # these must be ns unit by-definition seen_integer = True @@ -584,25 +570,13 @@ cpdef array_to_datetime( string_to_dts_failed = string_to_dts( val, &dts, &out_bestunit, &out_local, - &out_tzoffset, False, format, exact + &out_tzoffset, False, None, False ) if string_to_dts_failed: # An error at this point is a _parsing_ error # specifically _not_ OutOfBoundsDatetime - if _parse_today_now(val, &iresult[i], utc): + if parse_today_now(val, &iresult[i], utc): continue - elif require_iso8601: - # if requiring iso8601 strings, skip trying - # other formats - if is_coerce: - iresult[i] = NPY_NAT - continue - elif is_raise: - raise ValueError( - f"time data \"{val}\" doesn't " - f"match format \"{format}\", at position {i}" - ) - return values, tz_out try: py_dt = parse_datetime_string(val, @@ -665,18 +639,6 @@ cpdef array_to_datetime( if is_coerce: iresult[i] = NPY_NAT continue - elif require_iso8601 and isinstance(val, str): - # GH#19382 for just-barely-OutOfBounds falling back to - # dateutil parser will return incorrect result because - # it will ignore nanoseconds - if is_raise: - - # Still raise OutOfBoundsDatetime, - # as error message is informative. - raise - - assert is_ignore - return values, tz_out raise except OutOfBoundsDatetime: @@ -835,26 +797,6 @@ cdef _array_to_datetime_object( return oresult, None -cdef bint _parse_today_now(str val, int64_t* iresult, bint utc): - # We delay this check for as long as possible - # because it catches relatively rare cases - - # Multiply by 1000 to convert to nanos, since these methods naturally have - # microsecond resolution - if val == "now": - if utc: - iresult[0] = Timestamp.utcnow().value * 1000 - else: - # GH#18705 make sure to_datetime("now") matches Timestamp("now") - # Note using Timestamp.now() is faster than Timestamp("now") - iresult[0] = Timestamp.now().value * 1000 - return True - elif val == "today": - iresult[0] = Timestamp.today().value * 1000 - return True - return False - - def array_to_datetime_with_tz(ndarray values, tzinfo tz): """ Vectorized analogue to pd.Timestamp(value, tz=tz) diff --git a/pandas/_libs/tslibs/parsing.pyi b/pandas/_libs/tslibs/parsing.pyi index c7244447edaf7..2e666249a76fc 100644 --- a/pandas/_libs/tslibs/parsing.pyi +++ b/pandas/_libs/tslibs/parsing.pyi @@ -40,7 +40,6 @@ def try_parse_datetime_components( minutes: npt.NDArray[np.object_], # object[:] seconds: npt.NDArray[np.object_], # object[:] ) -> npt.NDArray[np.object_]: ... -def format_is_iso(f: str) -> bool: ... def guess_datetime_format( dt_str, dayfirst: bool | None = ..., diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 26317da62c8d9..aa95febfc9721 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -818,26 +818,6 @@ class _timelex: _DATEUTIL_LEXER_SPLIT = _timelex.split -def format_is_iso(f: str) -> bint: - """ - Does format match the iso8601 set that can be handled by the C parser? - Generally of form YYYY-MM-DDTHH:MM:SS - date separator can be different - but must be consistent. Leading 0s in dates and times are optional. - """ - iso_template = "%Y{date_sep}%m{date_sep}%d{time_sep}%H:%M:%S{micro_or_tz}".format - excluded_formats = ["%Y%m%d", "%Y%m", "%Y"] - - for date_sep in [" ", "/", "\\", "-", ".", ""]: - for time_sep in [" ", "T"]: - for micro_or_tz in ["", "%z", ".%f", ".%f%z"]: - if (iso_template(date_sep=date_sep, - time_sep=time_sep, - micro_or_tz=micro_or_tz, - ).startswith(f) and f not in excluded_formats): - return True - return False - - def guess_datetime_format(dt_str: str, bint dayfirst=False) -> str | None: """ Guess the datetime format of a given datetime string. diff --git a/pandas/_libs/tslibs/strptime.pxd b/pandas/_libs/tslibs/strptime.pxd new file mode 100644 index 0000000000000..175195d4362e4 --- /dev/null +++ b/pandas/_libs/tslibs/strptime.pxd @@ -0,0 +1,4 @@ +from numpy cimport int64_t + + +cdef bint parse_today_now(str val, int64_t* iresult, bint utc) diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx index a863824d92cc7..27e99706137b6 100644 --- a/pandas/_libs/tslibs/strptime.pyx +++ b/pandas/_libs/tslibs/strptime.pyx @@ -34,12 +34,14 @@ from pandas._libs.tslibs.nattype cimport ( c_nat_strings as nat_strings, ) from pandas._libs.tslibs.np_datetime cimport ( + NPY_DATETIMEUNIT, NPY_FR_ns, check_dts_bounds, npy_datetimestruct, npy_datetimestruct_to_datetime, pydate_to_dt64, pydatetime_to_dt64, + string_to_dts, ) from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime from pandas._libs.tslibs.timestamps cimport _Timestamp @@ -48,9 +50,50 @@ from pandas._libs.util cimport ( is_float_object, is_integer_object, ) +from pandas._libs.tslibs.timestamps import Timestamp cnp.import_array() +cdef bint format_is_iso(f: str): + """ + Does format match the iso8601 set that can be handled by the C parser? + Generally of form YYYY-MM-DDTHH:MM:SS - date separator can be different + but must be consistent. Leading 0s in dates and times are optional. + """ + excluded_formats = ["%Y%m"] + + for date_sep in [" ", "/", "\\", "-", ".", ""]: + for time_sep in [" ", "T"]: + for micro_or_tz in ["", "%z", ".%f", ".%f%z"]: + iso_fmt = f"%Y{date_sep}%m{date_sep}%d{time_sep}%H:%M:%S{micro_or_tz}" + if iso_fmt.startswith(f) and f not in excluded_formats: + return True + return False + + +def _test_format_is_iso(f: str) -> bool: + """Only used in testing.""" + return format_is_iso(f) + + +cdef bint parse_today_now(str val, int64_t* iresult, bint utc): + # We delay this check for as long as possible + # because it catches relatively rare cases + + # Multiply by 1000 to convert to nanos, since these methods naturally have + # microsecond resolution + if val == "now": + if utc: + iresult[0] = Timestamp.utcnow().value * 1000 + else: + # GH#18705 make sure to_datetime("now") matches Timestamp("now") + # Note using Timestamp.now() is faster than Timestamp("now") + iresult[0] = Timestamp.now().value * 1000 + return True + elif val == "today": + iresult[0] = Timestamp.today().value * 1000 + return True + return False cdef dict _parse_code_table = {"y": 0, "Y": 1, @@ -111,6 +154,9 @@ def array_strptime( bint found_naive = False bint found_tz = False tzinfo tz_out = None + bint iso_format = fmt is not None and format_is_iso(fmt) + NPY_DATETIMEUNIT out_bestunit + int out_local = 0, out_tzoffset = 0 assert is_raise or is_ignore or is_coerce @@ -232,6 +278,34 @@ def array_strptime( else: val = str(val) + if iso_format: + string_to_dts_failed = string_to_dts( + val, &dts, &out_bestunit, &out_local, + &out_tzoffset, False, fmt, exact + ) + if not string_to_dts_failed: + # No error reported by string_to_dts, pick back up + # where we left off + value = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts) + if out_local == 1: + # Store the out_tzoffset in seconds + # since we store the total_seconds of + # dateutil.tz.tzoffset objects + tz = timezone(timedelta(minutes=out_tzoffset)) + result_timezone[i] = tz + out_local = 0 + out_tzoffset = 0 + iresult[i] = value + check_dts_bounds(&dts) + continue + + if parse_today_now(val, &iresult[i], utc): + continue + + # Some ISO formats can't be parsed by string_to_dts + # For example, 6-digit YYYYMD. So, if there's an error, + # try the string-matching code below. + # exact matching if exact: found = format_regex.match(val) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 0162f54bf5225..608b38765621b 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -2118,10 +2118,7 @@ def objects_to_datetime64ns( yearfirst, utc: bool = False, errors: DateTimeErrorChoices = "raise", - require_iso8601: bool = False, allow_object: bool = False, - format: str | None = None, - exact: bool = True, ): """ Convert data to array of timestamps. @@ -2134,7 +2131,6 @@ def objects_to_datetime64ns( utc : bool, default False Whether to convert/localize timestamps to UTC. errors : {'raise', 'ignore', 'coerce'} - require_iso8601 : bool, default False allow_object : bool Whether to return an object-dtype ndarray instead of raising if the data contains more than one timezone. @@ -2165,9 +2161,6 @@ def objects_to_datetime64ns( utc=utc, dayfirst=dayfirst, yearfirst=yearfirst, - require_iso8601=require_iso8601, - format=format, - exact=exact, ) result = result.reshape(data.shape, order=order) except OverflowError as err: diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index a97a866a8406e..49661bd86b7cc 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -36,7 +36,6 @@ from pandas._libs.tslibs.conversion import precision_from_unit from pandas._libs.tslibs.parsing import ( DateParseError, - format_is_iso, guess_datetime_format, ) from pandas._libs.tslibs.strptime import array_strptime @@ -419,7 +418,6 @@ def _convert_listlike_datetimes( # warn if passing timedelta64, raise for PeriodDtype # NB: this must come after unit transformation - orig_arg = arg try: arg, _ = maybe_convert_dtype(arg, copy=False, tz=libtimezones.maybe_get_tz(tz)) except TypeError: @@ -432,24 +430,12 @@ def _convert_listlike_datetimes( raise arg = ensure_object(arg) - require_iso8601 = False if format is None: format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst) - # There is a special fast-path for iso8601 formatted datetime strings - require_iso8601 = format is not None and format_is_iso(format) - - if format is not None and not require_iso8601: - return _to_datetime_with_format( - arg, - orig_arg, - name, - utc, - format, - exact, - errors, - ) + if format is not None: + return _array_strptime_with_fallback(arg, name, utc, format, exact, errors) result, tz_parsed = objects_to_datetime64ns( arg, @@ -457,10 +443,7 @@ def _convert_listlike_datetimes( yearfirst=yearfirst, utc=utc, errors=errors, - require_iso8601=require_iso8601, allow_object=True, - format=format, - exact=exact, ) if tz_parsed is not None: @@ -490,40 +473,6 @@ def _array_strptime_with_fallback( return _box_as_indexlike(result, utc=utc, name=name) -def _to_datetime_with_format( - arg, - orig_arg, - name, - utc: bool, - fmt: str, - exact: bool, - errors: str, -) -> Index: - """ - Try parsing with the given format. - """ - result = None - - # shortcut formatting here - if fmt == "%Y%m%d": - # pass orig_arg as float-dtype may have been converted to - # datetime64[ns] - orig_arg = ensure_object(orig_arg) - try: - # may return None without raising - result = _attempt_YYYYMMDD(orig_arg, errors=errors) - except (ValueError, TypeError, OutOfBoundsDatetime) as err: - raise ValueError( - "cannot convert the input to '%Y%m%d' date format" - ) from err - if result is not None: - return _box_as_indexlike(result, utc=utc, name=name) - - # fallback - res = _array_strptime_with_fallback(arg, name, utc, fmt, exact, errors) - return res - - def _to_datetime_with_unit(arg, unit, name, utc: bool, errors: str) -> Index: """ to_datetime specalized to the case where a 'unit' is passed. @@ -978,7 +927,7 @@ def to_datetime( in addition to forcing non-dates (or non-parseable dates) to :const:`NaT`. >>> pd.to_datetime('13000101', format='%Y%m%d', errors='ignore') - datetime.datetime(1300, 1, 1, 0, 0) + '13000101' >>> pd.to_datetime('13000101', format='%Y%m%d', errors='coerce') NaT @@ -1011,14 +960,12 @@ def to_datetime( Index([2020-10-25 02:00:00+02:00, 2020-10-25 04:00:00+01:00], dtype='object') - - A mix of timezone-aware and timezone-naive inputs is converted to - a timezone-aware :class:`DatetimeIndex` if the offsets of the timezone-aware - are constant: + - A mix of timezone-aware and timezone-naive inputs is also converted to + a simple :class:`Index` containing :class:`datetime.datetime` objects: >>> from datetime import datetime >>> pd.to_datetime(["2020-01-01 01:00:00-01:00", datetime(2020, 1, 1, 3, 0)]) - DatetimeIndex(['2020-01-01 01:00:00-01:00', '2020-01-01 02:00:00-01:00'], - dtype='datetime64[ns, UTC-01:00]', freq=None) + Index([2020-01-01 01:00:00-01:00, 2020-01-01 03:00:00], dtype='object') | diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 8d001935c97f9..a0667899840f1 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -131,8 +131,11 @@ def test_to_datetime_format_YYYYMMDD_with_nat(self, cache): # string with NaT ser2 = ser.apply(str) ser2[2] = "nat" - result = to_datetime(ser2, format="%Y%m%d", cache=cache) - tm.assert_series_equal(result, expected) + with pytest.raises( + ValueError, match='unconverted data remains: ".0", at position 0' + ): + # https://github.com/pandas-dev/pandas/issues/50051 + to_datetime(ser2, format="%Y%m%d", cache=cache) def test_to_datetime_format_YYYYMM_with_nat(self, cache): # https://github.com/pandas-dev/pandas/issues/50237 @@ -147,15 +150,26 @@ def test_to_datetime_format_YYYYMM_with_nat(self, cache): def test_to_datetime_format_YYYYMMDD_ignore(self, cache): # coercion - # GH 7930 + # GH 7930, GH 14487 ser = Series([20121231, 20141231, 99991231]) result = to_datetime(ser, format="%Y%m%d", errors="ignore", cache=cache) expected = Series( - [datetime(2012, 12, 31), datetime(2014, 12, 31), datetime(9999, 12, 31)], + [20121231, 20141231, 99991231], dtype=object, ) tm.assert_series_equal(result, expected) + def test_to_datetime_format_YYYYMMDD_ignore_with_outofbounds(self, cache): + # https://github.com/pandas-dev/pandas/issues/26493 + result = to_datetime( + ["15010101", "20150101", np.nan], + format="%Y%m%d", + errors="ignore", + cache=cache, + ) + expected = Index(["15010101", "20150101", np.nan]) + tm.assert_index_equal(result, expected) + def test_to_datetime_format_YYYYMMDD_coercion(self, cache): # coercion # GH 7930 @@ -549,6 +563,26 @@ def test_to_datetime_mixed_date_and_string(self, format): ), id="all tz-aware, mixed offsets, with utc", ), + pytest.param( + False, + ["2000-01-01 01:00:00", "2000-01-01 02:00:00+00:00"], + Index( + [ + Timestamp("2000-01-01 01:00:00"), + Timestamp("2000-01-01 02:00:00+0000", tz="UTC"), + ], + ), + id="tz-aware string, naive pydatetime, without utc", + ), + pytest.param( + True, + ["2000-01-01 01:00:00", "2000-01-01 02:00:00+00:00"], + DatetimeIndex( + ["2000-01-01 01:00:00+00:00", "2000-01-01 02:00:00+00:00"], + dtype="datetime64[ns, UTC]", + ), + id="tz-aware string, naive pydatetime, with utc", + ), ], ) @pytest.mark.parametrize( @@ -559,6 +593,7 @@ def test_to_datetime_mixed_datetime_and_string_with_format( self, fmt, utc, args, expected, constructor ): # https://github.com/pandas-dev/pandas/issues/49298 + # https://github.com/pandas-dev/pandas/issues/50254 # note: ISO8601 formats go down a fastpath, so we need to check both # a ISO8601 format and a non-ISO8601 one ts1 = constructor(args[0]) @@ -566,6 +601,62 @@ def test_to_datetime_mixed_datetime_and_string_with_format( result = to_datetime([ts1, ts2], format=fmt, utc=utc) tm.assert_index_equal(result, expected) + @pytest.mark.parametrize( + "fmt, utc, expected", + [ + pytest.param( + "%Y-%m-%d %H:%M:%S%z", + True, + DatetimeIndex( + ["2000-01-01 08:00:00+00:00", "2000-01-02 00:00:00+00:00", "NaT"], + dtype="datetime64[ns, UTC]", + ), + id="ISO8601, UTC", + ), + pytest.param( + "%Y-%m-%d %H:%M:%S%z", + False, + Index( + [ + Timestamp("2000-01-01 09:00:00+0100", tz="UTC+01:00"), + Timestamp("2000-01-02 02:00:00+0200", tz="UTC+02:00"), + NaT, + ] + ), + id="ISO8601, non-UTC", + ), + pytest.param( + "%Y-%d-%m %H:%M:%S%z", + True, + DatetimeIndex( + ["2000-01-01 08:00:00+00:00", "2000-02-01 00:00:00+00:00", "NaT"], + dtype="datetime64[ns, UTC]", + ), + id="non-ISO8601, UTC", + ), + pytest.param( + "%Y-%d-%m %H:%M:%S%z", + False, + Index( + [ + Timestamp("2000-01-01 09:00:00+0100", tz="UTC+01:00"), + Timestamp("2000-02-01 02:00:00+0200", tz="UTC+02:00"), + NaT, + ] + ), + id="non-ISO8601, non-UTC", + ), + ], + ) + def test_to_datetime_mixed_offsets_with_none(self, fmt, utc, expected): + # https://github.com/pandas-dev/pandas/issues/50071 + result = to_datetime( + ["2000-01-01 09:00:00+01:00", "2000-01-02 02:00:00+02:00", None], + format=fmt, + utc=utc, + ) + tm.assert_index_equal(result, expected) + @pytest.mark.parametrize( "fmt", ["%Y-%d-%m %H:%M:%S%z", "%Y-%m-%d %H:%M:%S%z"], @@ -835,6 +926,29 @@ def test_to_datetime_today(self, tz): def test_to_datetime_today_now_unicode_bytes(self, arg): to_datetime([arg]) + @pytest.mark.parametrize( + "format, expected_ds", + [ + ("%Y-%m-%d %H:%M:%S%z", "2020-01-03"), + ("%Y-%d-%m %H:%M:%S%z", "2020-03-01"), + (None, "2020-01-03"), + ], + ) + @pytest.mark.parametrize( + "string, attribute", + [ + ("now", "utcnow"), + ("today", "today"), + ], + ) + def test_to_datetime_now_with_format(self, format, expected_ds, string, attribute): + # https://github.com/pandas-dev/pandas/issues/50359 + result = to_datetime(["2020-01-03 00:00:00Z", string], format=format, utc=True) + expected = DatetimeIndex( + [expected_ds, getattr(Timestamp, attribute)()], dtype="datetime64[ns, UTC]" + ) + assert (expected - result).max().total_seconds() < 1 + @pytest.mark.parametrize( "dt", [np.datetime64("2000-01-01"), np.datetime64("2000-01-02")] ) @@ -2004,12 +2118,15 @@ def test_to_datetime_iso8601_fails(self, input, format, exact): def test_to_datetime_iso8601_exact_fails(self, input, format): # https://github.com/pandas-dev/pandas/issues/12649 # `format` is shorter than the date string, so only fails with `exact=True` + msg = "|".join( + [ + '^unconverted data remains: ".*", at position 0$', + 'time data ".*" doesn\'t match format ".*", at position 0', + ] + ) with pytest.raises( ValueError, - match=( - rf"time data \"{input}\" doesn't match format " - rf"\"{format}\", at position 0" - ), + match=(msg), ): to_datetime(input, format=format) @@ -2074,6 +2191,28 @@ def test_to_datetime_iso8601_valid(self, input, format): result = to_datetime(input, format=format) assert result == expected + @pytest.mark.parametrize( + "input, format", + [ + ("2020-1", "%Y-%m"), + ("2020-1-1", "%Y-%m-%d"), + ("2020-1-1 0", "%Y-%m-%d %H"), + ("2020-1-1T0", "%Y-%m-%dT%H"), + ("2020-1-1 0:0", "%Y-%m-%d %H:%M"), + ("2020-1-1T0:0", "%Y-%m-%dT%H:%M"), + ("2020-1-1 0:0:0", "%Y-%m-%d %H:%M:%S"), + ("2020-1-1T0:0:0", "%Y-%m-%dT%H:%M:%S"), + ("2020-1-1T0:0:0.000", "%Y-%m-%dT%H:%M:%S.%f"), + ("2020-1-1T0:0:0.000000", "%Y-%m-%dT%H:%M:%S.%f"), + ("2020-1-1T0:0:0.000000000", "%Y-%m-%dT%H:%M:%S.%f"), + ], + ) + def test_to_datetime_iso8601_non_padded(self, input, format): + # https://github.com/pandas-dev/pandas/issues/21422 + expected = Timestamp(2020, 1, 1) + result = to_datetime(input, format=format) + assert result == expected + @pytest.mark.parametrize( "input, format", [ @@ -2587,8 +2726,7 @@ def test_day_not_in_month_raise(self, cache): ( "2015-02-29", "%Y-%m-%d", - '^time data "2015-02-29" doesn\'t match format "%Y-%m-%d", ' - "at position 0$", + "^day is out of range for month, at position 0$", ), ( "2015-29-02", @@ -2598,8 +2736,7 @@ def test_day_not_in_month_raise(self, cache): ( "2015-02-32", "%Y-%m-%d", - '^time data "2015-02-32" doesn\'t match format "%Y-%m-%d", ' - "at position 0$", + '^unconverted data remains: "2", at position 0$', ), ( "2015-32-02", @@ -2610,8 +2747,7 @@ def test_day_not_in_month_raise(self, cache): ( "2015-04-31", "%Y-%m-%d", - '^time data "2015-04-31" doesn\'t match format "%Y-%m-%d", ' - "at position 0$", + "^day is out of range for month, at position 0$", ), ( "2015-31-04", diff --git a/pandas/tests/tslibs/test_parsing.py b/pandas/tests/tslibs/test_parsing.py index 7675305e27d22..558c802fd70f6 100644 --- a/pandas/tests/tslibs/test_parsing.py +++ b/pandas/tests/tslibs/test_parsing.py @@ -8,7 +8,10 @@ import numpy as np import pytest -from pandas._libs.tslibs import parsing +from pandas._libs.tslibs import ( + parsing, + strptime, +) from pandas._libs.tslibs.parsing import parse_time_string import pandas.util._test_decorators as td @@ -154,27 +157,25 @@ def test_parsers_month_freq(date_str, expected): ("2011-12-30T00:00:00", "%Y-%m-%dT%H:%M:%S"), ("2011-12-30T00:00:00UTC", "%Y-%m-%dT%H:%M:%S%Z"), ("2011-12-30T00:00:00Z", "%Y-%m-%dT%H:%M:%S%z"), - # The +9 format for offsets is supported by dateutil, - # but don't round-trip, see https://github.com/pandas-dev/pandas/issues/48921 - ("2011-12-30T00:00:00+9", None), - ("2011-12-30T00:00:00+09", None), + ("2011-12-30T00:00:00+9", "%Y-%m-%dT%H:%M:%S%z"), + ("2011-12-30T00:00:00+09", "%Y-%m-%dT%H:%M:%S%z"), ("2011-12-30T00:00:00+090", None), ("2011-12-30T00:00:00+0900", "%Y-%m-%dT%H:%M:%S%z"), ("2011-12-30T00:00:00-0900", "%Y-%m-%dT%H:%M:%S%z"), ("2011-12-30T00:00:00+09:00", "%Y-%m-%dT%H:%M:%S%z"), ("2011-12-30T00:00:00+09:000", None), - ("2011-12-30T00:00:00+9:0", None), + ("2011-12-30T00:00:00+9:0", "%Y-%m-%dT%H:%M:%S%z"), ("2011-12-30T00:00:00+09:", None), ("2011-12-30T00:00:00.000000UTC", "%Y-%m-%dT%H:%M:%S.%f%Z"), ("2011-12-30T00:00:00.000000Z", "%Y-%m-%dT%H:%M:%S.%f%z"), - ("2011-12-30T00:00:00.000000+9", None), - ("2011-12-30T00:00:00.000000+09", None), + ("2011-12-30T00:00:00.000000+9", "%Y-%m-%dT%H:%M:%S.%f%z"), + ("2011-12-30T00:00:00.000000+09", "%Y-%m-%dT%H:%M:%S.%f%z"), ("2011-12-30T00:00:00.000000+090", None), ("2011-12-30T00:00:00.000000+0900", "%Y-%m-%dT%H:%M:%S.%f%z"), ("2011-12-30T00:00:00.000000-0900", "%Y-%m-%dT%H:%M:%S.%f%z"), ("2011-12-30T00:00:00.000000+09:00", "%Y-%m-%dT%H:%M:%S.%f%z"), ("2011-12-30T00:00:00.000000+09:000", None), - ("2011-12-30T00:00:00.000000+9:0", None), + ("2011-12-30T00:00:00.000000+9:0", "%Y-%m-%dT%H:%M:%S.%f%z"), ("2011-12-30T00:00:00.000000+09:", None), ("2011-12-30 00:00:00.000000", "%Y-%m-%d %H:%M:%S.%f"), ("Tue 24 Aug 2021 01:30:48 AM", "%a %d %b %Y %H:%M:%S %p"), @@ -305,16 +306,16 @@ def test_parse_time_string_check_instance_type_raise_exception(): ("%Y-%m-%dT%H:%M:%S.%f", True), ("%Y-%m-%dT%H:%M:%S.%f%z", True), ("%Y-%m-%dT%H:%M:%S.%f%Z", False), - ("%Y%m%d", False), + ("%Y%m%d", True), ("%Y%m", False), - ("%Y", False), + ("%Y", True), ("%Y-%m-%d", True), ("%Y-%m", True), ], ) def test_is_iso_format(fmt, expected): # see gh-41047 - result = parsing.format_is_iso(fmt) + result = strptime._test_format_is_iso(fmt) assert result == expected