ROB: ignore_eof everywhere for read_until_regex

This was initially motivated by `NumberObject.read_from_stream`, which called `read_until_regex` with the default value of `ignore_eof=False` and therefore raised exceptions like:

```
PyPDF2.errors.PdfStreamError: Stream has ended unexpectedly
```

Commit 431ba70 demonstrates a similar fix for `NameObject.read_from_stream`. During the discussion in #1505, it was realized that, with the change to `NumberObject.read_from_stream`, ALL callers of `read_until_regex` now pass `ignore_eof=True`. It is therefore cleaner to remove the parameter entirely and make ignoring EOF the default behaviour.
py-pdf · Jan 9, 2023 · 8417a83 · 8417a83
1 parent e7e4ffc
commit 8417a83
Show file tree

Hide file tree

Showing 4 changed files with 6 additions and 24 deletions.
diff --git a/pypdf/_utils.py b/pypdf/_utils.py
@@ -163,31 +163,22 @@ def skip_over_comment(stream: StreamType) -> None:
  tok = stream.read(1)
 
 
-def read_until_regex(
- stream: StreamType, regex: Pattern[bytes], ignore_eof: bool = False
-) -> bytes:
+def read_until_regex(stream: StreamType, regex: Pattern[bytes]) -> bytes:
  """
  Read until the regular expression pattern matched (ignore the match).
+ Treats EOF on the underlying stream as the end of the token to be matched.
 
  Args:
- ignore_eof: If true, ignore end-of-line and return immediately
  regex: re.Pattern
- ignore_eof: (Default value = False)
 
  Returns:
  The read bytes.
-
- Raises:
- PdfStreamError: on premature end-of-file
-
  """
  name = b""
  while True:
  tok = stream.read(16)
  if not tok:
- if ignore_eof:
- return name
- raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
+ return name
  m = regex.search(tok)
  if m is not None:
  name += tok[: m.start()]

diff --git a/pypdf/generic/_base.py b/pypdf/generic/_base.py
@@ -620,7 +620,7 @@ def read_from_stream(stream: StreamType, pdf: Any) -> "NameObject": # PdfReader
  name = stream.read(1)
  if name != NameObject.surfix:
  raise PdfReadError("name read error")
- name += read_until_regex(stream, NameObject.delimiter_pattern, ignore_eof=True)
+ name += read_until_regex(stream, NameObject.delimiter_pattern)
  try:
  # Name objects should represent irregular characters
  # with a '#' followed by the symbol's hex number

diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py
@@ -969,7 +969,7 @@ def __parse_content_stream(self, stream: StreamType) -> None:
  break
  stream.seek(-1, 1)
  if peek.isalpha() or peek in (b"'", b'"'):
- operator = read_until_regex(stream, NameObject.delimiter_pattern, True)
+ operator = read_until_regex(stream, NameObject.delimiter_pattern)
  if operator == b"BI":
  # begin inline image - a completely different parsing
  # mechanism is required, of course... thanks buddy...

diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -62,20 +62,11 @@ def test_skip_over_comment(stream, remainder):
  assert stream.read() == remainder
 
 
-def test_read_until_regex_premature_ending_raise():
- import re
-
- stream = io.BytesIO(b"")
- with pytest.raises(PdfStreamError) as exc:
- read_until_regex(stream, re.compile(b"."))
- assert exc.value.args[0] == "Stream has ended unexpectedly"
-
-
 def test_read_until_regex_premature_ending_name():
  import re
 
  stream = io.BytesIO(b"")
- assert read_until_regex(stream, re.compile(b"."), ignore_eof=True) == b""
+ assert read_until_regex(stream, re.compile(b".")) == b""
 
 
 @pytest.mark.parametrize(