From 8417a8358b541f10658b5776e795c66acc7792fc Mon Sep 17 00:00:00 2001 From: Ronuk Raval Date: Thu, 29 Dec 2022 11:20:43 -0500 Subject: [PATCH] ROB: ignore_eof everywhere for read_until_regex This was initially motivated by `NumberObject.read_from_stream`, which was calling `read_until_regex` with the default value of `ignore_eof=False` and thus raising exceptions like: ``` PyPDF2.errors.PdfStreamError: Stream has ended unexpectedly ``` https://github.com/py-pdf/PyPDF2/commit/431ba7092037af7d1c296f8f280aca167859ce61 demonstrates a similar fix for `NameObject.read_from_stream`. From discussion in https://github.com/py-pdf/pypdf/pull/1505, it was realized that the change to `NumberObject.read_from_stream` had now made ALL callers of `read_until_regex` pass `ignore_eof=True`. It's cleaner to remove the parameter entirely and change the default behaviour. --- pypdf/_utils.py | 15 +++------------ pypdf/generic/_base.py | 2 +- pypdf/generic/_data_structures.py | 2 +- tests/test_utils.py | 11 +---------- 4 files changed, 6 insertions(+), 24 deletions(-) diff --git a/pypdf/_utils.py b/pypdf/_utils.py index 4da2663fc..fdc52b126 100644 --- a/pypdf/_utils.py +++ b/pypdf/_utils.py @@ -163,31 +163,22 @@ def skip_over_comment(stream: StreamType) -> None: tok = stream.read(1) -def read_until_regex( - stream: StreamType, regex: Pattern[bytes], ignore_eof: bool = False -) -> bytes: +def read_until_regex(stream: StreamType, regex: Pattern[bytes]) -> bytes: """ Read until the regular expression pattern matched (ignore the match). + Treats EOF on the underlying stream as the end of the token to be matched. Args: - ignore_eof: If true, ignore end-of-line and return immediately regex: re.Pattern - ignore_eof: (Default value = False) Returns: The read bytes. - - Raises: - PdfStreamError: on premature end-of-file - """ name = b"" while True: tok = stream.read(16) if not tok: - if ignore_eof: - return name - raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY) + return name m = regex.search(tok) if m is not None: name += tok[: m.start()] diff --git a/pypdf/generic/_base.py b/pypdf/generic/_base.py index b1adcc557..d973515a5 100644 --- a/pypdf/generic/_base.py +++ b/pypdf/generic/_base.py @@ -620,7 +620,7 @@ def read_from_stream(stream: StreamType, pdf: Any) -> "NameObject": # PdfReader name = stream.read(1) if name != NameObject.surfix: raise PdfReadError("name read error") - name += read_until_regex(stream, NameObject.delimiter_pattern, ignore_eof=True) + name += read_until_regex(stream, NameObject.delimiter_pattern) try: # Name objects should represent irregular characters # with a '#' followed by the symbol's hex number diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py index 2e472f51c..27160b18b 100644 --- a/pypdf/generic/_data_structures.py +++ b/pypdf/generic/_data_structures.py @@ -969,7 +969,7 @@ def __parse_content_stream(self, stream: StreamType) -> None: break stream.seek(-1, 1) if peek.isalpha() or peek in (b"'", b'"'): - operator = read_until_regex(stream, NameObject.delimiter_pattern, True) + operator = read_until_regex(stream, NameObject.delimiter_pattern) if operator == b"BI": # begin inline image - a completely different parsing # mechanism is required, of course... thanks buddy... diff --git a/tests/test_utils.py b/tests/test_utils.py index 841c8d712..d05127c5d 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -62,20 +62,11 @@ def test_skip_over_comment(stream, remainder): assert stream.read() == remainder -def test_read_until_regex_premature_ending_raise(): - import re - - stream = io.BytesIO(b"") - with pytest.raises(PdfStreamError) as exc: - read_until_regex(stream, re.compile(b".")) - assert exc.value.args[0] == "Stream has ended unexpectedly" - - def test_read_until_regex_premature_ending_name(): import re stream = io.BytesIO(b"") - assert read_until_regex(stream, re.compile(b"."), ignore_eof=True) == b"" + assert read_until_regex(stream, re.compile(b".")) == b"" @pytest.mark.parametrize(