From 982757503002cad7035fb4ba1ec255ae1f4f3e99 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= Date: Mon, 19 Sep 2022 21:43:08 -0400 Subject: [PATCH 1/5] REGR: TextIOWrapper raising an error in read_csv --- doc/source/whatsnew/v1.5.1.rst | 2 +- pandas/io/parsers/c_parser_wrapper.py | 3 +++ pandas/tests/io/parser/common/test_common_basic.py | 14 ++++++++++++++ 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.5.1.rst b/doc/source/whatsnew/v1.5.1.rst index f8069b5476d9e..9d40d9118db32 100644 --- a/doc/source/whatsnew/v1.5.1.rst +++ b/doc/source/whatsnew/v1.5.1.rst @@ -14,7 +14,7 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ -- +- Regression in :func:`.read_csv` causing an ``EmptyDataError`` when using a UTF-8 file handle that was already read from (:issue:`48646`) - .. --------------------------------------------------------------------------- diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index 99051ec661413..69a315a2fa8fd 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -74,6 +74,9 @@ def __init__(self, src: ReadCsvBuffer[str], **kwds) -> None: and src.encoding == "utf-8" and (src.errors or "strict") == kwds["encoding_errors"] ): + # the internal buffer TextIOWrapper.buffer might have read ahead, make sure + # to first go back where TextIOWrapper is + src.seek(src.tell()) # error: Incompatible types in assignment (expression has type "BinaryIO", # variable has type "ReadCsvBuffer[str]") src = src.buffer # type: ignore[assignment] diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py index a7cdc3c1a84d2..a7ef18ef228da 100644 --- a/pandas/tests/io/parser/common/test_common_basic.py +++ b/pandas/tests/io/parser/common/test_common_basic.py @@ -928,3 +928,17 @@ def test_read_table_posargs_deprecation(all_parsers): "except for the 
argument 'filepath_or_buffer' will be keyword-only" ) parser.read_table_check_warnings(FutureWarning, msg, data, " ") + + +def test_read_seek(all_parsers): + # GH48646 + parser = all_parsers + prefix = "### DATA\n" + content = "nkey,value\ntables,rectangular\n" + with tm.ensure_clean() as path: + Path(path).write_text(prefix + content) + with open(path, mode="r") as file: + file.readline() + actual = parser.read_csv(file) + expected = parser.read_csv(StringIO(content)) + tm.assert_frame_equal(actual, expected) From cfe3446e93711d7f17cffbb8a05c539e835dc5fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= Date: Mon, 19 Sep 2022 22:08:49 -0400 Subject: [PATCH 2/5] pyupgrade --- pandas/tests/io/parser/common/test_common_basic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py index a7ef18ef228da..359b059252556 100644 --- a/pandas/tests/io/parser/common/test_common_basic.py +++ b/pandas/tests/io/parser/common/test_common_basic.py @@ -937,7 +937,7 @@ def test_read_seek(all_parsers): content = "nkey,value\ntables,rectangular\n" with tm.ensure_clean() as path: Path(path).write_text(prefix + content) - with open(path, mode="r") as file: + with open(path, encoding="utf-8") as file: file.readline() actual = parser.read_csv(file) expected = parser.read_csv(StringIO(content)) From b081cc8eec3c50234942d44c61a0be01d9210140 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= Date: Tue, 20 Sep 2022 07:39:52 -0400 Subject: [PATCH 3/5] do not try to seek on unseekable buffers --- pandas/io/parsers/c_parser_wrapper.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index 69a315a2fa8fd..9dddf863ec83e 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -73,10 +73,8 @@ def __init__(self, src: 
ReadCsvBuffer[str], **kwds) -> None: isinstance(src, TextIOWrapper) and src.encoding == "utf-8" and (src.errors or "strict") == kwds["encoding_errors"] + and (not src.seekable() or src.tell() == src.buffer.tell()) ): - # the internal buffer TextIOWrapper.buffer might have read ahead, make sure - # to first go back where TextIOWrapper is - src.seek(src.tell()) # error: Incompatible types in assignment (expression has type "BinaryIO", # variable has type "ReadCsvBuffer[str]") src = src.buffer # type: ignore[assignment] From 747300a69e21b0ea4019f1ba14c097bf8bf2ca09 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= Date: Tue, 20 Sep 2022 07:43:37 -0400 Subject: [PATCH 4/5] unseekable buffer might also have read ahead --- pandas/io/parsers/c_parser_wrapper.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index 9dddf863ec83e..141ff353dfe75 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -73,7 +73,8 @@ def __init__(self, src: ReadCsvBuffer[str], **kwds) -> None: isinstance(src, TextIOWrapper) and src.encoding == "utf-8" and (src.errors or "strict") == kwds["encoding_errors"] - and (not src.seekable() or src.tell() == src.buffer.tell()) + and src.seekable() + and src.tell() == src.buffer.tell() ): # error: Incompatible types in assignment (expression has type "BinaryIO", # variable has type "ReadCsvBuffer[str]") From f5f7fab2d85d65f249693f9e1a085c415a98c67a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= Date: Tue, 20 Sep 2022 09:54:37 -0400 Subject: [PATCH 5/5] safer alternative: do not mess with internal/private(?) 
buffer of TextIOWrapper (effectively applies the shortcut only to files pandas opens) --- pandas/io/parsers/c_parser_wrapper.py | 14 -------------- pandas/io/parsers/readers.py | 11 +++++++++++ 2 files changed, 11 insertions(+), 14 deletions(-) diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index 141ff353dfe75..6e4ea85548230 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -2,7 +2,6 @@ from collections import defaultdict import inspect -from io import TextIOWrapper from typing import ( TYPE_CHECKING, Hashable, @@ -67,19 +66,6 @@ def __init__(self, src: ReadCsvBuffer[str], **kwds) -> None: # Have to pass int, would break tests using TextReader directly otherwise :( kwds["on_bad_lines"] = self.on_bad_lines.value - # c-engine can cope with utf-8 bytes. Remove TextIOWrapper when its errors - # policy is the same as the one given to read_csv - if ( - isinstance(src, TextIOWrapper) - and src.encoding == "utf-8" - and (src.errors or "strict") == kwds["encoding_errors"] - and src.seekable() - and src.tell() == src.buffer.tell() - ): - # error: Incompatible types in assignment (expression has type "BinaryIO", - # variable has type "ReadCsvBuffer[str]") - src = src.buffer # type: ignore[assignment] - for key in ( "storage_options", "encoding", diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 20122d69748aa..eaec4c6bd5991 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -60,6 +60,7 @@ from pandas.io.common import ( IOHandles, get_handle, + stringify_path, validate_header_arg, ) from pandas.io.parsers.arrow_parser_wrapper import ArrowParserWrapper @@ -1727,6 +1728,16 @@ def _make_engine( if engine == "pyarrow": is_text = False mode = "rb" + elif ( + engine == "c" + and self.options.get("encoding", "utf-8") == "utf-8" + and isinstance(stringify_path(f), str) + ): + # c engine can decode utf-8 bytes, adding TextIOWrapper makes + 
# the c-engine especially for memory_map=True far slower + is_text = False + if "b" not in mode: + mode += "b" self.handles = get_handle( f, mode,