diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py
index b234a6b78e051..3bdfab8d2e669 100644
--- a/pandas/io/parsers/readers.py
+++ b/pandas/io/parsers/readers.py
@@ -11,6 +11,7 @@
     defaultdict,
 )
 import csv
+from io import TextIOBase
 import sys
 from textwrap import fill
 from typing import (
@@ -655,6 +656,26 @@ def _read(
     else:
         chunksize = validate_integer("chunksize", chunksize, 1)
 
+    encoding = kwds.get("encoding")
+    if encoding and isinstance(filepath_or_buffer, TextIOBase):
+        # In-memory text buffers such as StringIO report ``encoding=None``:
+        # they hold already-decoded text, so there is nothing to compare.
+        file_encoding = filepath_or_buffer.encoding
+        if file_encoding is not None:
+            import codecs
+
+            try:
+                # Compare canonical codec names so that aliases such as
+                # "utf8", "UTF-8" and "utf_8" count as the same encoding.
+                same_encoding = (
+                    codecs.lookup(file_encoding).name == codecs.lookup(encoding).name
+                )
+            except (LookupError, TypeError):
+                # Unknown codec name: fall back to a literal comparison.
+                same_encoding = file_encoding == encoding
+            if not same_encoding:
+                raise ValueError("File's encoding does not match with given encoding")
+
     nrows = kwds.get("nrows", None)
 
     # Check for duplicates in names.
diff --git a/pandas/tests/io/parser/common/test_read_errors.py b/pandas/tests/io/parser/common/test_read_errors.py
index 0827f64dccf46..39ed55930138f 100644
--- a/pandas/tests/io/parser/common/test_read_errors.py
+++ b/pandas/tests/io/parser/common/test_read_errors.py
@@ -8,6 +8,7 @@
 from io import StringIO
 import os
 from pathlib import Path
+import uuid
 
 import numpy as np
 import pytest
@@ -322,3 +323,26 @@ def test_on_bad_lines_warn_correct_formatting(all_parsers):
     ):
         result = parser.read_csv(StringIO(data), on_bad_lines="warn")
     tm.assert_frame_equal(result, expected)
+
+
+def test_filetype_encoding_miss_match_with_given_encoding(all_parsers):
+    # GH#57954
+    parser = all_parsers
+    data = "A,B\nÜ,Ä\n"
+    msg = "File's encoding does not match with given encoding"
+
+    with tm.ensure_clean(f"__{uuid.uuid4()}__.csv") as path:
+        Path(path).write_bytes(data.encode("latin1"))
+
+        # Open with an explicit encoding so the test does not depend on the
+        # platform's default locale encoding.
+        with open(path, encoding="utf-8") as f:
+            with pytest.raises(ValueError, match=msg):
+                parser.read_csv(f, encoding="latin1", on_bad_lines="warn")
+
+
+def test_text_buffer_without_encoding_accepts_encoding(all_parsers):
+    # GH#57954: StringIO.encoding is None, so passing ``encoding`` must not raise
+    parser = all_parsers
+    result = parser.read_csv(StringIO("A,B\n1,2\n"), encoding="utf-8")
+    assert list(result.columns) == ["A", "B"]