diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 75b4c5c0fe14d..f44b63bd55563 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -1106,6 +1106,7 @@ I/O - Bug in :meth:`HDFStore.get` was failing to save data of dtype datetime64[s] correctly (:issue:`59004`) - Bug in :meth:`HDFStore.select` causing queries on categorical string columns to return unexpected results (:issue:`57608`) - Bug in :meth:`MultiIndex.factorize` incorrectly raising on length-0 indexes (:issue:`57517`) +- Bug in :meth:`read_csv` with ``engine="python"`` where lines selected by ``skiprows`` were not skipped before being passed to the CSV reader, raising ``EmptyDataError`` (:issue:`62739`) - Bug in :meth:`read_csv` causing segmentation fault when ``encoding_errors`` is not a string. (:issue:`59059`) - Bug in :meth:`read_csv` for the ``c`` and ``python`` engines where parsing numbers with large exponents caused overflows. Now, numbers with large positive exponents are parsed as ``inf`` or ``-inf`` depending on the sign of the mantissa, while those with large negative exponents are parsed as ``0.0`` (:issue:`62617`, :issue:`38794`, :issue:`62740`) - Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index dc7a21c859a33..b08e013d7acb8 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -218,6 +218,14 @@ class MyDialect(csv.Dialect): if sep is not None: dia.delimiter = sep + # Skip rows at file level before csv.reader sees them + # prevents CSV parsing errors on lines that will be discarded + if self.skiprows is not None: + while self.skipfunc(self.pos): + line = f.readline() + if not line: + break + self.pos += 1 else: # attempt to sniff the delimiter from the first valid line, # i.e. 
no comment line and not in skiprows diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index 0de65ab889be8..ee6484709be63 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -599,3 +599,45 @@ def fixer(bad_line): ) tm.assert_frame_equal(result, expected) + + +def test_read_csv_leading_quote_skip(python_parser_only): + # GH 62739 + tbl = """\ + " +a b +1 3 +""" + parser = python_parser_only + result = parser.read_csv( + StringIO(tbl), + delimiter=" ", + skiprows=1, + ) + expected = DataFrame({"a": [1], "b": [3]}) + tm.assert_frame_equal(result, expected) + + +def test_read_csv_unclosed_double_quote_in_data_still_errors(python_parser_only): + # GH 62739 + tbl = """\ +a b +" +1 3 +""" + parser = python_parser_only + with pytest.raises(ParserError, match="unexpected end of data"): + parser.read_csv(StringIO(tbl), delimiter=" ", skiprows=1) + + +def test_read_csv_skiprows_zero(python_parser_only): + # GH 62739 + tbl = """\ +" +a b +1 3 +""" + parser = python_parser_only + # don't skip anything + with pytest.raises(ParserError, match="unexpected end of data"): + parser.read_csv(StringIO(tbl), delimiter=" ", skiprows=0, engine="python")