Skip to content

Commit

Permalink
ROB: ignore_eof everywhere for read_until_regex
Browse files Browse the repository at this point in the history
This was initially motivated by `NumberObject.read_from_stream`, which
was calling `read_until_regex` with the default value of
`ignore_eof=False` and thus raising exceptions like:

```
PyPDF2.errors.PdfStreamError: Stream has ended unexpectedly
```

Commit 431ba70 demonstrates a similar fix for
`NameObject.read_from_stream`.

During the discussion in #1505, it became clear that, with the change to
`NumberObject.read_from_stream`, ALL callers of `read_until_regex` now
pass `ignore_eof=True`. It's cleaner to remove the parameter entirely and
make treating EOF as the end of the token the default behaviour.
  • Loading branch information
rraval committed Jan 9, 2023
1 parent e7e4ffc commit 8417a83
Show file tree
Hide file tree
Showing 4 changed files with 6 additions and 24 deletions.
15 changes: 3 additions & 12 deletions pypdf/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,31 +163,22 @@ def skip_over_comment(stream: StreamType) -> None:
tok = stream.read(1)


def read_until_regex(
stream: StreamType, regex: Pattern[bytes], ignore_eof: bool = False
) -> bytes:
def read_until_regex(stream: StreamType, regex: Pattern[bytes]) -> bytes:
"""
Read until the regular expression pattern matched (ignore the match).
Treats EOF on the underlying stream as the end of the token to be matched.
Args:
ignore_eof: If true, ignore end-of-line and return immediately
regex: re.Pattern
ignore_eof: (Default value = False)
Returns:
The read bytes.
Raises:
PdfStreamError: on premature end-of-file
"""
name = b""
while True:
tok = stream.read(16)
if not tok:
if ignore_eof:
return name
raise PdfStreamError(STREAM_TRUNCATED_PREMATURELY)
return name
m = regex.search(tok)
if m is not None:
name += tok[: m.start()]
Expand Down
2 changes: 1 addition & 1 deletion pypdf/generic/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -620,7 +620,7 @@ def read_from_stream(stream: StreamType, pdf: Any) -> "NameObject": # PdfReader
name = stream.read(1)
if name != NameObject.surfix:
raise PdfReadError("name read error")
name += read_until_regex(stream, NameObject.delimiter_pattern, ignore_eof=True)
name += read_until_regex(stream, NameObject.delimiter_pattern)
try:
# Name objects should represent irregular characters
# with a '#' followed by the symbol's hex number
Expand Down
2 changes: 1 addition & 1 deletion pypdf/generic/_data_structures.py
Original file line number Diff line number Diff line change
Expand Up @@ -969,7 +969,7 @@ def __parse_content_stream(self, stream: StreamType) -> None:
break
stream.seek(-1, 1)
if peek.isalpha() or peek in (b"'", b'"'):
operator = read_until_regex(stream, NameObject.delimiter_pattern, True)
operator = read_until_regex(stream, NameObject.delimiter_pattern)
if operator == b"BI":
# begin inline image - a completely different parsing
# mechanism is required, of course... thanks buddy...
Expand Down
11 changes: 1 addition & 10 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,20 +62,11 @@ def test_skip_over_comment(stream, remainder):
assert stream.read() == remainder


def test_read_until_regex_premature_ending_raise():
import re

stream = io.BytesIO(b"")
with pytest.raises(PdfStreamError) as exc:
read_until_regex(stream, re.compile(b"."))
assert exc.value.args[0] == "Stream has ended unexpectedly"


def test_read_until_regex_premature_ending_name():
import re

stream = io.BytesIO(b"")
assert read_until_regex(stream, re.compile(b"."), ignore_eof=True) == b""
assert read_until_regex(stream, re.compile(b".")) == b""


@pytest.mark.parametrize(
Expand Down

0 comments on commit 8417a83

Please sign in to comment.