diff --git a/numpydoc/tests/test_validate.py b/numpydoc/tests/test_validate.py
index 68040add..d41e4bd0 100644
--- a/numpydoc/tests/test_validate.py
+++ b/numpydoc/tests/test_validate.py
@@ -1,7 +1,8 @@
 import pytest
 import sys
 import warnings
-from functools import cached_property
+from contextlib import nullcontext
+from functools import cached_property, partial
 from inspect import getsourcelines, getsourcefile
 
 from numpydoc import validate
@@ -85,6 +86,50 @@ def test_extract_ignore_validation_comments(tmp_path, file_contents, expected):
     assert validate.extract_ignore_validation_comments(filepath) == expected
 
 
+@pytest.mark.parametrize(
+    "assumed_encoding",
+    (
+        pytest.param("utf-8", id="utf8_codec"),
+        pytest.param("cp1252", id="cp1252_codec"),
+    ),
+)
+@pytest.mark.parametrize(
+    ("classname", "actual_encoding"),
+    (
+        pytest.param("MÿClass", "cp1252", id="cp1252_file"),
+        pytest.param("My\u0081Class", "utf-8", id="utf8_file"),
+    ),
+)
+def test_encodings(tmp_path, classname, actual_encoding, assumed_encoding):
+    """Test handling of different source file encodings."""
+    # write file as bytes with `actual_encoding`
+    filepath = tmp_path / "ignore_comments.py"
+    file_contents = f"class {classname}:\n    pass"
+    with open(filepath, "wb") as file:
+        file.write(file_contents.encode(actual_encoding))
+    # this should fail on the ÿ in MÿClass. It represents the (presumed rare) case where
+    # a user's editor saved the source file in cp1252 (or anything other than utf-8).
+    if actual_encoding == "cp1252" and assumed_encoding == "utf-8":
+        context = partial(
+            pytest.raises,
+            UnicodeDecodeError,
+            match="can't decode byte 0xff in position 7: invalid start byte",
+        )
+    # this is the more likely case: file was utf-8 encoded, but Python on Windows uses
+    # the system codepage to read the file. This case is fixed by numpy/numpydoc#510
+    elif actual_encoding == "utf-8" and assumed_encoding == "cp1252":
+        context = partial(
+            pytest.raises,
+            UnicodeDecodeError,
+            match="can't decode byte 0x81 in position 9: character maps to <undefined>",
+        )
+    else:
+        context = nullcontext
+    with context():
+        result = validate.extract_ignore_validation_comments(filepath, assumed_encoding)
+        assert result == {}
+
+
 class GoodDocStrings:
     """
     Collection of good doc strings.
diff --git a/numpydoc/validate.py b/numpydoc/validate.py
index e98d3851..7275758b 100644
--- a/numpydoc/validate.py
+++ b/numpydoc/validate.py
@@ -120,6 +120,7 @@
 @functools.lru_cache(maxsize=2000)
 def extract_ignore_validation_comments(
     filepath: Optional[os.PathLike],
+    encoding: str = "utf-8",
 ) -> Dict[int, List[str]]:
     """
     Extract inline comments indicating certain validation checks should be ignored.
@@ -136,7 +137,7 @@ def extract_ignore_validation_comments(
     """
     numpydoc_ignore_comments = {}
     try:
-        file = open(filepath)
+        file = open(filepath, encoding=encoding)
     except (OSError, TypeError):  # can be None, nonexistent, or unreadable
         return numpydoc_ignore_comments
     with file: