Skip to content

WIP warn if parsing with du_parse #47828

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 20 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/user_guide/timedeltas.rst
Original file line number Diff line number Diff line change
Expand Up @@ -424,6 +424,7 @@ Similarly to other of the datetime-like indices, ``DatetimeIndex`` and ``PeriodI
Selections work similarly, with coercion on string-likes and slices:

.. ipython:: python
:okwarning:

s["1 day":"2 day"]
s["1 day 01:00:00"]
Expand All @@ -432,6 +433,7 @@ Selections work similarly, with coercion on string-likes and slices:
Furthermore you can use partial string selection and the range will be inferred:

.. ipython:: python
:okwarning:

s["1 day":"1 day 5 hours"]

Expand Down
46 changes: 38 additions & 8 deletions pandas/_libs/tslibs/parsing.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -263,6 +263,19 @@ cdef inline bint does_string_look_like_time(str parse_string):

return 0 <= hour <= 23 and 0 <= minute <= 59

from pandas.util._exceptions import find_stack_level
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is there an option that avoids importing from outside libs here?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yup, I could set the stack level explicitly at each call site - I'll do that once I've sorted out the tests.

import inspect


def du_parse_with_warning(*args, **kwargs):
parsed = du_parse(*args, **kwargs)
warnings.warn(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do we take a perf hit here?

"Parsing datetime strings without a format specified, "
"please specify a format to avoid unexpected results",
stacklevel=find_stack_level(inspect.currentframe()),
)
return parsed


def parse_datetime_string(
# NB: This will break with np.str_ (GH#32264) even though
Expand Down Expand Up @@ -290,8 +303,12 @@ def parse_datetime_string(

if does_string_look_like_time(date_string):
# use current datetime as default, not pass _DEFAULT_DATETIME
dt = du_parse(date_string, dayfirst=dayfirst,
yearfirst=yearfirst, **kwargs)
dt = du_parse_with_warning(
date_string,
dayfirst=dayfirst,
yearfirst=yearfirst,
**kwargs,
)
return dt

dt, _ = _parse_delimited_date(date_string, dayfirst)
Expand All @@ -307,8 +324,13 @@ def parse_datetime_string(
pass

try:
dt = du_parse(date_string, default=_DEFAULT_DATETIME,
dayfirst=dayfirst, yearfirst=yearfirst, **kwargs)
dt = du_parse_with_warning(
date_string,
default=_DEFAULT_DATETIME,
dayfirst=dayfirst,
yearfirst=yearfirst,
**kwargs,
)
except TypeError:
# following may be raised from dateutil
# TypeError: 'NoneType' object is not iterable
Expand Down Expand Up @@ -706,7 +728,11 @@ def try_parse_dates(
date = datetime.now()
default = datetime(date.year, date.month, 1)

parse_date = lambda x: du_parse(x, dayfirst=dayfirst, default=default)
parse_date = lambda x: du_parse_with_warning(
x,
dayfirst=dayfirst,
default=default,
)

# EAFP here
try:
Expand Down Expand Up @@ -753,13 +779,17 @@ def try_parse_date_and_time(
date = datetime.now()
default = datetime(date.year, date.month, 1)

parse_date = lambda x: du_parse(x, dayfirst=dayfirst, default=default)
parse_date = lambda x: du_parse_with_warning(
x,
dayfirst=dayfirst,
default=default,
)

else:
parse_date = date_parser

if time_parser is None:
parse_time = lambda x: du_parse(x)
parse_time = lambda x: du_parse_with_warning(x)

else:
parse_time = time_parser
Expand Down Expand Up @@ -980,7 +1010,7 @@ def guess_datetime_format(dt_str, bint dayfirst=False):
datetime_attrs_to_format.insert(0, day_attribute_and_format)

try:
parsed_datetime = du_parse(dt_str, dayfirst=dayfirst)
parsed_datetime = du_parse_with_warning(dt_str, dayfirst=dayfirst)
except (ValueError, OverflowError):
# In case the datetime can't be parsed, its format cannot be guessed
return None
Expand Down
3 changes: 2 additions & 1 deletion pandas/tests/apply/test_frame_apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -836,7 +836,8 @@ def test_with_dictlike_columns_with_datetime():
df["author"] = ["X", "Y", "Z"]
df["publisher"] = ["BBC", "NBC", "N24"]
df["date"] = pd.to_datetime(
["17-10-2010 07:15:30", "13-05-2011 08:20:35", "15-01-2013 09:09:09"]
["17-10-2010 07:15:30", "13-05-2011 08:20:35", "15-01-2013 09:09:09"],
format="%d-%m-%Y %H:%M:%S",
)
result = df.apply(lambda x: {}, axis=1)
expected = Series([{}, {}, {}])
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/arithmetic/test_datetime64.py
Original file line number Diff line number Diff line change
Expand Up @@ -1620,7 +1620,7 @@ def test_dti_add_sub_nonzero_mth_offset(
):
# GH 26258
tz = tz_aware_fixture
date = date_range(start="01 Jan 2014", end="01 Jan 2017", freq="AS", tz=tz)
date = date_range(start="2014-01-01", end="2017-01-01", freq="AS", tz=tz)
date = tm.box_expected(date, box_with_array, False)
mth = getattr(date, op)
result = mth(offset)
Expand Down
23 changes: 17 additions & 6 deletions pandas/tests/arrays/test_datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -316,13 +316,19 @@ def test_searchsorted_castable_strings(self, arr1d, box, request, string_storage
arr = pd.Series(arr)

# scalar
result = arr.searchsorted(str(arr[1]))
with tm.assert_produces_warning(None, raise_on_extra_warnings=False):
# wip
result = arr.searchsorted(str(arr[1]))
assert result == 1

result = arr.searchsorted(str(arr[2]), side="right")
with tm.assert_produces_warning(None, raise_on_extra_warnings=False):
# wip
result = arr.searchsorted(str(arr[2]), side="right")
assert result == 3

result = arr.searchsorted([str(x) for x in arr[1:3]])
with tm.assert_produces_warning(None, raise_on_extra_warnings=False):
# wip
result = arr.searchsorted([str(x) for x in arr[1:3]])
expected = np.array([1, 2], dtype=np.intp)
tm.assert_numpy_array_equal(result, expected)

Expand All @@ -345,7 +351,10 @@ def test_searchsorted_castable_strings(self, arr1d, box, request, string_storage
f"or array of those. Got '{arr_type}' instead."
),
):
arr.searchsorted([str(arr[1]), "baz"])
with tm.assert_produces_warning(
UserWarning, match="without a format specified"
):
arr.searchsorted([str(arr[1]), "baz"])

def test_getitem_near_implementation_bounds(self):
# We only check tz-naive for DTA bc the bounds are slightly different
Expand Down Expand Up @@ -480,15 +489,17 @@ def test_setitem_strs(self, arr1d, request):
expected[[0, 1]] = arr1d[-2:]

result = arr1d.copy()
result[:2] = [str(x) for x in arr1d[-2:]]
with tm.assert_produces_warning(None, raise_on_extra_warnings=False):
result[:2] = [str(x) for x in arr1d[-2:]]
tm.assert_equal(result, expected)

# Same thing but now for just a scalar str
expected = arr1d.copy()
expected[0] = arr1d[-1]

result = arr1d.copy()
result[0] = str(arr1d[-1])
with tm.assert_produces_warning(None, raise_on_extra_warnings=False):
result[0] = str(arr1d[-1])
tm.assert_equal(result, expected)

@pytest.mark.parametrize("as_index", [True, False])
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/dtypes/test_generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ class TestABCClasses:
multi_index = pd.MultiIndex.from_arrays(tuples, names=("number", "color"))
datetime_index = pd.to_datetime(["2000/1/1", "2010/1/1"])
timedelta_index = pd.to_timedelta(np.arange(5), unit="s")
period_index = pd.period_range("2000/1/1", "2010/1/1/", freq="M")
period_index = pd.period_range("1/1/2000", "1/1/2010", freq="M")
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

pd.to_datetime(["2000/1/1", "2010/1/1"]) doesn't raise the warning, but pd.period_range("2000/1/1", "2010/1/1/", freq="M") does.

I don't know why yet; this needs investigating.

categorical = pd.Categorical([1, 2, 3], categories=[2, 3, 1])
categorical_df = pd.DataFrame({"values": [1, 2, 3]}, index=categorical)
df = pd.DataFrame({"names": ["a", "b", "c"]}, index=multi_index)
Expand Down
12 changes: 6 additions & 6 deletions pandas/tests/frame/methods/test_drop.py
Original file line number Diff line number Diff line change
Expand Up @@ -405,17 +405,17 @@ def test_drop_level_nonunique_datetime(self):
idx = Index([2, 3, 4, 4, 5], name="id")
idxdt = pd.to_datetime(
[
"201603231400",
"201603231500",
"201603231600",
"201603231600",
"201603231700",
"2016-03-23 14:00",
"2016-03-23 15:00",
"2016-03-23 16:00",
"2016-03-23 16:00",
"2016-03-23 17:00",
]
)
df = DataFrame(np.arange(10).reshape(5, 2), columns=list("ab"), index=idx)
df["tstamp"] = idxdt
df = df.set_index("tstamp", append=True)
ts = Timestamp("201603231600")
ts = Timestamp("2016-03-23 16:00")
assert df.index.is_unique is False

result = df.drop(ts, level="tstamp")
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/frame/methods/test_quantile.py
Original file line number Diff line number Diff line change
Expand Up @@ -590,7 +590,7 @@ def test_quantile_empty_no_rows_dt64(self):

def test_quantile_empty_no_columns(self):
# GH#23925 _get_numeric_data may drop all columns
df = DataFrame(pd.date_range("1/1/18", periods=5))
df = DataFrame(pd.date_range("1/1/2018", periods=5))
df.columns.name = "captain tightpants"
result = df.quantile(0.5, numeric_only=True)
expected = Series([], index=[], name=0.5, dtype=np.float64)
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/frame/methods/test_reindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -304,7 +304,7 @@ def test_reindex_nearest_tz_empty_frame(self):
tm.assert_frame_equal(result, expected)

def test_reindex_frame_add_nat(self):
rng = date_range("1/1/2000 00:00:00", periods=10, freq="10s")
rng = date_range("2000-01-01 00:00:00", periods=10, freq="10s")
df = DataFrame({"A": np.random.randn(len(rng)), "B": rng})

result = df.reindex(range(15))
Expand Down
4 changes: 3 additions & 1 deletion pandas/tests/frame/methods/test_to_timestamp.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,9 @@ def test_to_timestamp_hourly(self, frame_or_series):
if frame_or_series is not Series:
obj = obj.to_frame()

exp_index = date_range("1/1/2001 00:59:59", end="1/2/2001 00:59:59", freq="H")
exp_index = date_range(
"2001-01-01 00:59:59", end="2001-01-02 00:59:59", freq="H"
)
result = obj.to_timestamp(how="end")
exp_index = exp_index + Timedelta(1, "s") - Timedelta(1, "ns")
tm.assert_index_equal(result.index, exp_index)
Expand Down
6 changes: 5 additions & 1 deletion pandas/tests/groupby/test_apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -603,7 +603,11 @@ def test_apply_numeric_coercion_when_datetime():

# GH 15421
df = DataFrame(
{"A": [10, 20, 30], "B": ["foo", "3", "4"], "T": [pd.Timestamp("12:31:22")] * 3}
{
"A": [10, 20, 30],
"B": ["foo", "3", "4"],
"T": [pd.Timestamp("2000-01-01 12:31:22")] * 3,
}
)

def get_B(g):
Expand Down
3 changes: 2 additions & 1 deletion pandas/tests/groupby/test_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -688,7 +688,8 @@ def test_max_nan_bug():
-05-06,2013-05-06 00:00:00,,log.log
-05-07,2013-05-07 00:00:00,OE,xlsx"""

df = pd.read_csv(StringIO(raw), parse_dates=[0])
with tm.assert_produces_warning(UserWarning, match="without a format specified"):
df = pd.read_csv(StringIO(raw), parse_dates=[0])
gb = df.groupby("Date")
r = gb[["File"]].max()
e = gb["File"].max().to_frame()
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/groupby/test_quantile.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@
([np.nan, 4.0, np.nan, 2.0, np.nan], [np.nan, 4.0, np.nan, 2.0, np.nan]),
# Timestamps
(
list(pd.date_range("1/1/18", freq="D", periods=5)),
list(pd.date_range("1/1/18", freq="D", periods=5))[::-1],
list(pd.date_range("1/1/2018", freq="D", periods=5)),
list(pd.date_range("1/1/2018", freq="D", periods=5))[::-1],
),
# All NA
([np.nan] * 5, [np.nan] * 5),
Expand Down
8 changes: 5 additions & 3 deletions pandas/tests/indexing/test_loc.py
Original file line number Diff line number Diff line change
Expand Up @@ -632,12 +632,14 @@ def test_loc_setitem_consistency_slice_column_len(self):
]
df = DataFrame(values, index=mi, columns=cols)

msg = "will attempt to set the values inplace instead"
with tm.assert_produces_warning(FutureWarning, match=msg):
msg = (
"will attempt to set the values inplace instead|without a format specified"
)
with tm.assert_produces_warning((FutureWarning, UserWarning), match=msg):
df.loc[:, ("Respondent", "StartDate")] = to_datetime(
df.loc[:, ("Respondent", "StartDate")]
)
with tm.assert_produces_warning(FutureWarning, match=msg):
with tm.assert_produces_warning((FutureWarning, UserWarning), match=msg):
df.loc[:, ("Respondent", "EndDate")] = to_datetime(
df.loc[:, ("Respondent", "EndDate")]
)
Expand Down
Loading