Skip to content

BUG: Fix some parse dates tests for the Arrow CSV reader #43312

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Sep 3, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions pandas/io/parsers/base_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -1090,7 +1090,9 @@ def _isindex(colspec):
colspec = orig_names[colspec]
if _isindex(colspec):
continue
data_dict[colspec] = converter(data_dict[colspec])
# The pyarrow engine returns a Series, which we need to convert to a
# numpy array before calling the converter; it's a no-op for other parsers
data_dict[colspec] = converter(np.asarray(data_dict[colspec]))
else:
new_name, col, old_names = _try_convert_dates(
converter, colspec, data_dict, orig_names
Expand Down Expand Up @@ -1139,7 +1141,7 @@ def _try_convert_dates(parser: Callable, colspec, data_dict, columns):
colnames.append(c)

new_name = "_".join([str(x) for x in colnames])
to_parse = [data_dict[c] for c in colnames if c in data_dict]
to_parse = [np.asarray(data_dict[c]) for c in colnames if c in data_dict]

new_col = parser(*to_parse)
return new_name, new_col, colnames
Expand Down
36 changes: 35 additions & 1 deletion pandas/tests/io/parser/test_parse_dates.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@
import pandas.io.date_converters as conv
from pandas.io.parsers import read_csv

pytestmark = pytest.mark.usefixtures("pyarrow_skip")
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")

# constant
_DEFAULT_DATETIME = datetime(1, 1, 1)
Expand All @@ -54,6 +54,7 @@
date_strategy = st.datetimes()


@xfail_pyarrow
def test_read_csv_with_custom_date_parser(all_parsers):
# GH36111
def __custom_date_parser(time):
Expand Down Expand Up @@ -91,6 +92,7 @@ def __custom_date_parser(time):
tm.assert_frame_equal(result, expected)


@xfail_pyarrow
def test_separator_date_conflict(all_parsers):
# Regression test for gh-4678
#
Expand All @@ -112,6 +114,7 @@ def test_separator_date_conflict(all_parsers):
tm.assert_frame_equal(df, expected)


@xfail_pyarrow
@pytest.mark.parametrize("keep_date_col", [True, False])
def test_multiple_date_col_custom(all_parsers, keep_date_col):
data = """\
Expand Down Expand Up @@ -267,6 +270,7 @@ def test_concat_date_col_fail(container, dim):
parsing.concat_date_cols(date_cols)


@xfail_pyarrow
@pytest.mark.parametrize("keep_date_col", [True, False])
def test_multiple_date_col(all_parsers, keep_date_col):
data = """\
Expand Down Expand Up @@ -426,6 +430,7 @@ def test_date_col_as_index_col(all_parsers):
tm.assert_frame_equal(result, expected)


@xfail_pyarrow
@pytest.mark.parametrize(
"date_parser, warning",
([conv.parse_date_time, FutureWarning], [pd.to_datetime, None]),
Expand Down Expand Up @@ -490,6 +495,7 @@ def test_multiple_date_cols_int_cast(all_parsers, date_parser, warning):
tm.assert_frame_equal(result, expected)


@xfail_pyarrow
def test_multiple_date_col_timestamp_parse(all_parsers):
parser = all_parsers
data = """05/31/2012,15:30:00.029,1306.25,1,E,0,,1306.25
Expand Down Expand Up @@ -524,6 +530,7 @@ def test_multiple_date_col_timestamp_parse(all_parsers):
tm.assert_frame_equal(result, expected)


@xfail_pyarrow
def test_multiple_date_cols_with_header(all_parsers):
parser = all_parsers
data = """\
Expand Down Expand Up @@ -693,6 +700,7 @@ def test_date_parser_int_bug(all_parsers):
tm.assert_frame_equal(result, expected)


@xfail_pyarrow
def test_nat_parse(all_parsers):
# see gh-3062
parser = all_parsers
Expand All @@ -708,6 +716,7 @@ def test_nat_parse(all_parsers):
tm.assert_frame_equal(result, df)


@xfail_pyarrow
def test_csv_custom_parser(all_parsers):
data = """A,B,C
20090101,a,1,2
Expand All @@ -722,6 +731,7 @@ def test_csv_custom_parser(all_parsers):
tm.assert_frame_equal(result, expected)


@xfail_pyarrow
def test_parse_dates_implicit_first_col(all_parsers):
data = """A,B,C
20090101,a,1,2
Expand All @@ -735,6 +745,7 @@ def test_parse_dates_implicit_first_col(all_parsers):
tm.assert_frame_equal(result, expected)


@xfail_pyarrow
def test_parse_dates_string(all_parsers):
data = """date,A,B,C
20090101,a,1,2
Expand Down Expand Up @@ -779,6 +790,7 @@ def test_yy_format_with_year_first(all_parsers, parse_dates):
tm.assert_frame_equal(result, expected)


@xfail_pyarrow
@pytest.mark.parametrize("parse_dates", [[0, 2], ["a", "c"]])
def test_parse_dates_column_list(all_parsers, parse_dates):
data = "a,b,c\n01/01/2010,1,15/02/2010"
Expand All @@ -795,6 +807,7 @@ def test_parse_dates_column_list(all_parsers, parse_dates):
tm.assert_frame_equal(result, expected)


@xfail_pyarrow
@pytest.mark.parametrize("index_col", [[0, 1], [1, 0]])
def test_multi_index_parse_dates(all_parsers, index_col):
data = """index1,index2,A,B,C
Expand Down Expand Up @@ -840,6 +853,7 @@ def test_multi_index_parse_dates(all_parsers, index_col):
tm.assert_frame_equal(result, expected)


@xfail_pyarrow
@pytest.mark.parametrize("kwargs", [{"dayfirst": True}, {"day_first": True}])
def test_parse_dates_custom_euro_format(all_parsers, kwargs):
parser = all_parsers
Expand Down Expand Up @@ -884,6 +898,7 @@ def test_parse_dates_custom_euro_format(all_parsers, kwargs):
)


@xfail_pyarrow
def test_parse_tz_aware(all_parsers):
# See gh-1693
parser = all_parsers
Expand All @@ -897,6 +912,7 @@ def test_parse_tz_aware(all_parsers):
assert result.index.tz is pytz.utc


@xfail_pyarrow
@pytest.mark.parametrize(
"parse_dates,index_col",
[({"nominal": [1, 2]}, "nominal"), ({"nominal": [1, 2]}, 0), ([[1, 2]], 0)],
Expand Down Expand Up @@ -997,6 +1013,7 @@ def test_multiple_date_cols_index(all_parsers, parse_dates, index_col):
tm.assert_frame_equal(result, expected)


@xfail_pyarrow
def test_multiple_date_cols_chunked(all_parsers):
parser = all_parsers
data = """\
Expand Down Expand Up @@ -1089,6 +1106,7 @@ def test_multiple_date_cols_chunked(all_parsers):
tm.assert_frame_equal(chunks[2], expected[4:])


@xfail_pyarrow
def test_multiple_date_col_named_index_compat(all_parsers):
parser = all_parsers
data = """\
Expand All @@ -1112,6 +1130,7 @@ def test_multiple_date_col_named_index_compat(all_parsers):
tm.assert_frame_equal(with_indices, with_names)


@xfail_pyarrow
def test_multiple_date_col_multiple_index_compat(all_parsers):
parser = all_parsers
data = """\
Expand Down Expand Up @@ -1179,6 +1198,7 @@ def test_bad_date_parse(all_parsers, cache_dates, value):
)


@xfail_pyarrow
def test_parse_dates_empty_string(all_parsers):
# see gh-2263
parser = all_parsers
Expand All @@ -1191,6 +1211,7 @@ def test_parse_dates_empty_string(all_parsers):
tm.assert_frame_equal(result, expected)


@xfail_pyarrow
@pytest.mark.parametrize(
"data,kwargs,expected",
[
Expand Down Expand Up @@ -1230,6 +1251,7 @@ def test_parse_dates_no_convert_thousands(all_parsers, data, kwargs, expected):
tm.assert_frame_equal(result, expected)


@xfail_pyarrow
@pytest.mark.parametrize(
"date_parser, warning",
([conv.parse_date_time, FutureWarning], [pd.to_datetime, None]),
Expand Down Expand Up @@ -1258,6 +1280,7 @@ def test_parse_date_time_multi_level_column_name(all_parsers, date_parser, warni
tm.assert_frame_equal(result, expected)


@xfail_pyarrow
@pytest.mark.parametrize(
"date_parser, warning",
([conv.parse_date_time, FutureWarning], [pd.to_datetime, None]),
Expand Down Expand Up @@ -1346,6 +1369,7 @@ def test_parse_date_time(all_parsers, data, kwargs, expected, date_parser, warni
tm.assert_frame_equal(result, expected)


@xfail_pyarrow
@pytest.mark.parametrize(
"date_parser, warning",
([conv.parse_date_fields, FutureWarning], [pd.to_datetime, None]),
Expand All @@ -1368,6 +1392,7 @@ def test_parse_date_fields(all_parsers, date_parser, warning):
tm.assert_frame_equal(result, expected)


@xfail_pyarrow
@pytest.mark.parametrize(
"date_parser, warning",
(
Expand Down Expand Up @@ -1399,6 +1424,7 @@ def test_parse_date_all_fields(all_parsers, date_parser, warning):
tm.assert_frame_equal(result, expected)


@xfail_pyarrow
@pytest.mark.parametrize(
"date_parser, warning",
(
Expand Down Expand Up @@ -1430,6 +1456,7 @@ def test_datetime_fractional_seconds(all_parsers, date_parser, warning):
tm.assert_frame_equal(result, expected)


@xfail_pyarrow
def test_generic(all_parsers):
parser = all_parsers
data = "year,month,day,a\n2001,01,10,10.\n2001,02,1,11."
Expand All @@ -1448,6 +1475,7 @@ def test_generic(all_parsers):
tm.assert_frame_equal(result, expected)


@xfail_pyarrow
def test_date_parser_resolution_if_not_ns(all_parsers):
# see gh-10245
parser = all_parsers
Expand Down Expand Up @@ -1545,6 +1573,7 @@ def test_parse_timezone(all_parsers):
tm.assert_frame_equal(result, expected)


@xfail_pyarrow
@pytest.mark.parametrize(
"date_string",
["32/32/2019", "02/30/2019", "13/13/2019", "13/2019", "a3/11/2018", "10/11/2o17"],
Expand All @@ -1556,6 +1585,7 @@ def test_invalid_parse_delimited_date(all_parsers, date_string):
tm.assert_frame_equal(result, expected)


@xfail_pyarrow
@pytest.mark.parametrize(
"date_string,dayfirst,expected",
[
Expand All @@ -1578,6 +1608,7 @@ def test_parse_delimited_date_swap_no_warning(
tm.assert_frame_equal(result, expected)


@xfail_pyarrow
@pytest.mark.parametrize(
"date_string,dayfirst,expected",
[
Expand Down Expand Up @@ -1647,6 +1678,7 @@ def test_hypothesis_delimited_date(date_format, dayfirst, delimiter, test_dateti
assert result == expected


@xfail_pyarrow
@pytest.mark.parametrize(
"names, usecols, parse_dates, missing_cols",
[
Expand Down Expand Up @@ -1679,6 +1711,7 @@ def test_missing_parse_dates_column_raises(
)


@xfail_pyarrow
def test_date_parser_and_names(all_parsers):
# GH#33699
parser = all_parsers
Expand All @@ -1688,6 +1721,7 @@ def test_date_parser_and_names(all_parsers):
tm.assert_frame_equal(result, expected)


@xfail_pyarrow
def test_date_parser_usecols_thousands(all_parsers):
# GH#39365
data = """A,B,C
Expand Down