Skip to content

BUG: read_csv not converting to float for python engine with decimal sep, usecols and parse_dates #38334

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 21 commits into from
Jan 3, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -262,6 +262,7 @@ I/O
- Bug in :func:`read_csv` interpreting ``NA`` value as comment, when ``NA`` does contain the comment string fixed for ``engine="python"`` (:issue:`34002`)
- Bug in :func:`read_csv` raising ``IndexError`` with multiple header columns and ``index_col`` specified when file has no data rows (:issue:`38292`)
- Bug in :func:`read_csv` not accepting ``usecols`` with different length than ``names`` for ``engine="python"`` (:issue:`16469`)
- Bug in :func:`read_csv` returning object dtype when ``decimal=","`` with ``usecols`` and ``parse_dates`` specified for ``engine="python"`` (:issue:`35873`)
- Bug in :func:`read_csv` raising ``TypeError`` when ``names`` and ``parse_dates`` is specified for ``engine="c"`` (:issue:`33699`)
- Bug in :func:`read_clipboard`, :func:`DataFrame.to_clipboard` not working in WSL (:issue:`38527`)
- Allow custom error values for parse_dates argument of :func:`read_sql`, :func:`read_sql_query` and :func:`read_sql_table` (:issue:`35185`)
Expand Down
26 changes: 13 additions & 13 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2293,7 +2293,7 @@ def __init__(self, f: Union[FilePathOrBuffer, List], **kwds):

# Get columns in two steps: infer from data, then
# infer column indices from self.usecols if it is specified.
self._col_indices = None
self._col_indices: Optional[List[int]] = None
try:
(
self.columns,
Expand Down Expand Up @@ -2335,6 +2335,9 @@ def __init__(self, f: Union[FilePathOrBuffer, List], **kwds):
if self.index_names is None:
self.index_names = index_names

if self._col_indices is None:
self._col_indices = list(range(len(self.columns)))

self._validate_parse_dates_presence(self.columns)
if self.parse_dates:
self._no_thousands_columns = self._set_no_thousands_columns()
Expand All @@ -2358,7 +2361,9 @@ def _set(x):
if is_integer(x):
noconvert_columns.add(x)
else:
noconvert_columns.add(self.columns.index(x))
assert self._col_indices is not None
col_indices = self._col_indices
noconvert_columns.add(col_indices[self.columns.index(x)])

if isinstance(self.parse_dates, list):
for val in self.parse_dates:
Expand Down Expand Up @@ -2700,7 +2705,6 @@ def _infer_columns(self):
# overwritten.
self._handle_usecols(columns, names)
else:
self._col_indices = None
num_original_columns = len(names)
columns = [names]
else:
Expand Down Expand Up @@ -2782,7 +2786,7 @@ def _handle_usecols(self, columns, usecols_key):
[n for i, n in enumerate(column) if i in col_indices]
for column in columns
]
self._col_indices = col_indices
self._col_indices = sorted(col_indices)
return columns

def _buffered_line(self):
Expand Down Expand Up @@ -3180,25 +3184,21 @@ def _rows_to_cols(self, content):
zipped_content = list(lib.to_object_array(content, min_width=col_len).T)

if self.usecols:
assert self._col_indices is not None
col_indices = self._col_indices

if self._implicit_index:
zipped_content = [
a
for i, a in enumerate(zipped_content)
if (
i < len(self.index_col)
# pandas\io\parsers.py:3159: error: Unsupported right
# operand type for in ("Optional[Any]") [operator]
or i - len(self.index_col) # type: ignore[operator]
in self._col_indices
or i - len(self.index_col) in col_indices
)
]
else:
zipped_content = [
# pandas\io\parsers.py:3164: error: Unsupported right
# operand type for in ("Optional[Any]") [operator]
a
for i, a in enumerate(zipped_content)
if i in self._col_indices # type: ignore[operator]
a for i, a in enumerate(zipped_content) if i in col_indices
]
return zipped_content

Expand Down
Empty file.
18 changes: 17 additions & 1 deletion pandas/tests/io/parser/dtypes/test_dtypes_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from pandas.errors import ParserWarning

import pandas as pd
from pandas import DataFrame
from pandas import DataFrame, Timestamp
import pandas._testing as tm


Expand Down Expand Up @@ -165,3 +165,19 @@ def test_boolean_dtype(all_parsers):
)

tm.assert_frame_equal(result, expected)


def test_delimiter_with_usecols_and_parse_dates(all_parsers):
    """Check ``decimal=","`` combined with ``usecols`` and ``parse_dates``.

    The selected numeric columns must come back as floats and the date
    column as a timestamp, matching the frame built below.
    """
    # GH#35873
    parser = all_parsers
    raw_csv = '"dump","-9,1","-9,1",20101010'

    # Only the last three of the four named columns are kept.
    read_options = {
        "engine": "python",
        "names": ["col", "col1", "col2", "col3"],
        "usecols": ["col1", "col2", "col3"],
        "parse_dates": ["col3"],
        "decimal": ",",
    }
    result = parser.read_csv(StringIO(raw_csv), **read_options)

    expected_columns = {
        "col1": [-9.1],
        "col2": [-9.1],
        "col3": [Timestamp("2010-10-10")],
    }
    expected = DataFrame(expected_columns)

    tm.assert_frame_equal(result, expected)
Empty file.