Skip to content

Commit b86eb99

Browse files
authored
BUG: #57954 encoding ignored for filelike (#57968)
* add exception when encodings exist and do not match
* add exception when encodings exist and do not match
* add test for mismatching encodings warning
* add test for mismatching encodings warning
* add encoding for python 3.10+
* move to _check_file; invert var and condition
1 parent 6c301c6 commit b86eb99

File tree

3 files changed

+19
-1
lines changed

3 files changed

+19
-1
lines changed

Diff for: pandas/io/parsers/readers.py

+11
Original file line numberDiff line numberDiff line change
@@ -1310,6 +1310,16 @@ def _check_file_or_buffer(self, f, engine: CSVEngine) -> None:
13101310
raise ValueError(
13111311
"The 'python' engine cannot iterate through this file buffer."
13121312
)
1313+
if hasattr(f, "encoding"):
1314+
file_encoding = f.encoding
1315+
orig_reader_enc = self.orig_options.get("encoding", None)
1316+
any_none = file_encoding is None or orig_reader_enc is None
1317+
if file_encoding != orig_reader_enc and not any_none:
1318+
file_path = getattr(f, "name", None)
1319+
raise ValueError(
1320+
f"The specified reader encoding {orig_reader_enc} is different "
1321+
f"from the encoding {file_encoding} of file {file_path}."
1322+
)
13131323

13141324
def _clean_options(
13151325
self, options: dict[str, Any], engine: CSVEngine
@@ -1485,6 +1495,7 @@ def _make_engine(
14851495
"pyarrow": ArrowParserWrapper,
14861496
"python-fwf": FixedWidthFieldParser,
14871497
}
1498+
14881499
if engine not in mapping:
14891500
raise ValueError(
14901501
f"Unknown engine: {engine} (valid options are {mapping.keys()})"

Diff for: pandas/tests/io/parser/test_c_parser_only.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -511,7 +511,7 @@ def __next__(self):
511511
def test_buffer_rd_bytes_bad_unicode(c_parser_only):
512512
# see gh-22748
513513
t = BytesIO(b"\xb0")
514-
t = TextIOWrapper(t, encoding="ascii", errors="surrogateescape")
514+
t = TextIOWrapper(t, encoding="UTF-8", errors="surrogateescape")
515515
msg = "'utf-8' codec can't encode character"
516516
with pytest.raises(UnicodeError, match=msg):
517517
c_parser_only.read_csv(t, encoding="UTF-8")

Diff for: pandas/tests/io/parser/test_textreader.py

+7
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,13 @@ def test_StringIO(self, csv_path):
4848
reader = TextReader(src, header=None)
4949
reader.read()
5050

51+
def test_encoding_mismatch_warning(self, csv_path):
52+
# GH-57954
53+
with open(csv_path, encoding="UTF-8") as f:
54+
msg = "latin1 is different from the encoding"
55+
with pytest.raises(ValueError, match=msg):
56+
read_csv(f, encoding="latin1")
57+
5158
def test_string_factorize(self):
5259
# should this be optional?
5360
data = "a\nb\na\nb\na"

0 commit comments

Comments
 (0)