diff --git a/pypdf/_utils.py b/pypdf/_utils.py index 4da2663fc..fdc52b126 100644 --- a/pypdf/_utils.py +++ b/pypdf/_utils.py @@ -163,31 +163,22 @@ def skip_over_comment(stream: StreamType) -> None: tok = stream.read(1) -def read_until_regex( - stream: StreamType, regex: Pattern[bytes], ignore_eof: bool = False -) -> bytes: +def read_until_regex(stream: StreamType, regex: Pattern[bytes]) -> bytes: """ Read until the regular expression pattern matched (ignore the match). + Treats EOF on the underlying stream as the end of the token to be matched. Args: - ignore_eof: If true, ignore end-of-line and return immediately regex: re.Pattern - ignore_eof: (Default value = False) Returns: The read bytes. - - Raises: - PdfStreamError: on premature end-of-file - """ name = b"" while True: tok = stream.read(16) if not tok: - if ignore_eof: - return name - raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY) + return name m = regex.search(tok) if m is not None: name += tok[: m.start()] diff --git a/pypdf/generic/_base.py b/pypdf/generic/_base.py index b1adcc557..d973515a5 100644 --- a/pypdf/generic/_base.py +++ b/pypdf/generic/_base.py @@ -620,7 +620,7 @@ def read_from_stream(stream: StreamType, pdf: Any) -> "NameObject": # PdfReader name = stream.read(1) if name != NameObject.surfix: raise PdfReadError("name read error") - name += read_until_regex(stream, NameObject.delimiter_pattern, ignore_eof=True) + name += read_until_regex(stream, NameObject.delimiter_pattern) try: # Name objects should represent irregular characters # with a '#' followed by the symbol's hex number diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py index 2e472f51c..27160b18b 100644 --- a/pypdf/generic/_data_structures.py +++ b/pypdf/generic/_data_structures.py @@ -969,7 +969,7 @@ def __parse_content_stream(self, stream: StreamType) -> None: break stream.seek(-1, 1) if peek.isalpha() or peek in (b"'", b'"'): - operator = read_until_regex(stream, NameObject.delimiter_pattern, True) + operator = read_until_regex(stream, NameObject.delimiter_pattern) if operator == b"BI": # begin inline image - a completely different parsing # mechanism is required, of course... thanks buddy... diff --git a/tests/test_utils.py b/tests/test_utils.py index 841c8d712..d05127c5d 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -62,20 +62,11 @@ def test_skip_over_comment(stream, remainder): assert stream.read() == remainder -def test_read_until_regex_premature_ending_raise(): - import re - - stream = io.BytesIO(b"") - with pytest.raises(PdfStreamError) as exc: - read_until_regex(stream, re.compile(b".")) - assert exc.value.args[0] == "Stream has ended unexpectedly" - - def test_read_until_regex_premature_ending_name(): import re stream = io.BytesIO(b"") - assert read_until_regex(stream, re.compile(b"."), ignore_eof=True) == b"" + assert read_until_regex(stream, re.compile(b".")) == b"" @pytest.mark.parametrize(