diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 9d1b3eaebdf8b..133872d1d2f78 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -262,6 +262,7 @@ I/O - Bug in :func:`read_csv` interpreting ``NA`` value as comment, when ``NA`` does contain the comment string fixed for ``engine="python"`` (:issue:`34002`) - Bug in :func:`read_csv` raising ``IndexError`` with multiple header columns and ``index_col`` specified when file has no data rows (:issue:`38292`) - Bug in :func:`read_csv` not accepting ``usecols`` with different length than ``names`` for ``engine="python"`` (:issue:`16469`) +- Bug in :func:`read_csv` returning object dtype when ``delimiter=","`` with ``usecols`` and ``parse_dates`` specified for ``engine="python"`` (:issue:`35873`) - Bug in :func:`read_csv` raising ``TypeError`` when ``names`` and ``parse_dates`` is specified for ``engine="c"`` (:issue:`33699`) - Bug in :func:`read_clipboard`, :func:`DataFrame.to_clipboard` not working in WSL (:issue:`38527`) - Allow custom error values for parse_dates argument of :func:`read_sql`, :func:`read_sql_query` and :func:`read_sql_table` (:issue:`35185`) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 68c0bbf0787e6..670da07869b72 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2293,7 +2293,7 @@ def __init__(self, f: Union[FilePathOrBuffer, List], **kwds): # Get columns in two steps: infer from data, then # infer column indices from self.usecols if it is specified. 
- self._col_indices = None + self._col_indices: Optional[List[int]] = None try: ( self.columns, @@ -2335,6 +2335,9 @@ def __init__(self, f: Union[FilePathOrBuffer, List], **kwds): if self.index_names is None: self.index_names = index_names + if self._col_indices is None: + self._col_indices = list(range(len(self.columns))) + self._validate_parse_dates_presence(self.columns) if self.parse_dates: self._no_thousands_columns = self._set_no_thousands_columns() @@ -2358,7 +2361,9 @@ def _set(x): if is_integer(x): noconvert_columns.add(x) else: - noconvert_columns.add(self.columns.index(x)) + assert self._col_indices is not None + col_indices = self._col_indices + noconvert_columns.add(col_indices[self.columns.index(x)]) if isinstance(self.parse_dates, list): for val in self.parse_dates: @@ -2700,7 +2705,6 @@ def _infer_columns(self): # overwritten. self._handle_usecols(columns, names) else: - self._col_indices = None num_original_columns = len(names) columns = [names] else: @@ -2782,7 +2786,7 @@ def _handle_usecols(self, columns, usecols_key): [n for i, n in enumerate(column) if i in col_indices] for column in columns ] - self._col_indices = col_indices + self._col_indices = sorted(col_indices) return columns def _buffered_line(self): @@ -3180,25 +3184,21 @@ def _rows_to_cols(self, content): zipped_content = list(lib.to_object_array(content, min_width=col_len).T) if self.usecols: + assert self._col_indices is not None + col_indices = self._col_indices + if self._implicit_index: zipped_content = [ a for i, a in enumerate(zipped_content) if ( i < len(self.index_col) - # pandas\io\parsers.py:3159: error: Unsupported right - # operand type for in ("Optional[Any]") [operator] - or i - len(self.index_col) # type: ignore[operator] - in self._col_indices + or i - len(self.index_col) in col_indices ) ] else: zipped_content = [ - # pandas\io\parsers.py:3164: error: Unsupported right - # operand type for in ("Optional[Any]") [operator] - a - for i, a in enumerate(zipped_content) - 
if i in self._col_indices # type: ignore[operator] + a for i, a in enumerate(zipped_content) if i in col_indices ] return zipped_content diff --git a/pandas/tests/io/parser/dtypes/__init__.py b/pandas/tests/io/parser/dtypes/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index e416d8dcdd905..fc34d65fdad52 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -10,7 +10,7 @@ from pandas.errors import ParserWarning import pandas as pd -from pandas import DataFrame +from pandas import DataFrame, Timestamp import pandas._testing as tm @@ -165,3 +165,19 @@ def test_boolean_dtype(all_parsers): ) tm.assert_frame_equal(result, expected) + + +def test_delimiter_with_usecols_and_parse_dates(all_parsers): + # GH#35873 + result = all_parsers.read_csv( + StringIO('"dump","-9,1","-9,1",20101010'), + engine="python", + names=["col", "col1", "col2", "col3"], + usecols=["col1", "col2", "col3"], + parse_dates=["col3"], + decimal=",", + ) + expected = DataFrame( + {"col1": [-9.1], "col2": [-9.1], "col3": [Timestamp("2010-10-10")]} + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/usecols/__init__.py b/pandas/tests/io/parser/usecols/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d