Skip to content

Commit

Permalink
Merge branch 'main' into reader-minor-sty
Browse files Browse the repository at this point in the history
  • Loading branch information
MartinThoma authored Sep 19, 2024
2 parents af2c05f + 7e4a0d6 commit 2f4365a
Show file tree
Hide file tree
Showing 4 changed files with 63 additions and 6 deletions.
16 changes: 16 additions & 0 deletions pypdf/_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -659,7 +659,23 @@ def _find_eof_marker(self, stream: StreamType) -> None:
"""
HEADER_SIZE = 8 # to parse whole file, Header is e.g. '%PDF-1.6'
line = b""
first = True
while line[:5] != b"%%EOF":
if line != b"" and first:
if any(
line.strip().endswith(tr) for tr in (b"%%EO", b"%%E", b"%%", b"%")
):
# Consider the file as truncated while
# having enough confidence to carry on.
logger_warning("EOF marker seems truncated", __name__)
break
first = False
if b"startxref" in line:
logger_warning(
"CAUTION: startxref found while searching for %%EOF. "
"The file might be truncated and some data might not be read.",
__name__,
)
if stream.tell() < HEADER_SIZE:
if self.strict:
raise PdfReadError("EOF marker not found")
Expand Down
2 changes: 2 additions & 0 deletions pypdf/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,8 @@ def skip_over_comment(stream: StreamType) -> None:
if tok == b"%":
while tok not in (b"\n", b"\r"):
tok = stream.read(1)
if tok == b"":
raise PdfStreamError("File ended unexpectedly.")


def read_until_regex(stream: StreamType, regex: Pattern[bytes]) -> bytes:
Expand Down
11 changes: 5 additions & 6 deletions pypdf/generic/_data_structures.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,10 @@ def read_from_stream(
tok = stream.read(1)
while tok.isspace():
tok = stream.read(1)
if tok == b"%":
stream.seek(-1, 1)
skip_over_comment(stream)
continue
stream.seek(-1, 1)
# check for array ending
peek_ahead = stream.read(1)
Expand Down Expand Up @@ -1341,12 +1345,7 @@ def read_object(
return NullObject.read_from_stream(stream)
elif tok == b"%":
# comment
while tok not in (b"\r", b"\n"):
tok = stream.read(1)
# Prevents an infinite loop by raising an error if the stream is at
# the EOF
if len(tok) <= 0:
raise PdfStreamError("File ended unexpectedly.")
skip_over_comment(stream)
tok = read_non_whitespace(stream)
stream.seek(-1, 1)
return read_object(stream, pdf, forced_encoding)
Expand Down
40 changes: 40 additions & 0 deletions tests/test_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
EmptyFileError,
FileNotDecryptedError,
PdfReadError,
PdfStreamError,
WrongPasswordError,
)
from pypdf.generic import (
Expand Down Expand Up @@ -1617,3 +1618,42 @@ def test_iss2817():
reader.pages[0]["/Annots"][0].get_object()["/Contents"]
== "A\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0 B"
)


@pytest.mark.enable_socket()
def test_truncated_files(caplog):
    """
    Exercise the reader on a file whose trailing EOF marker is cut short.

    Regression test for #2853.
    """
    pdf_bytes = get_data_from_url(
        "https://github.com/user-attachments/files/16796095/f5471sm-2.pdf",
        name="iss2780.pdf",  # reused
    )
    expected_startxref = 100993

    # Untouched file: nothing may be logged.
    reader = PdfReader(BytesIO(pdf_bytes))
    assert caplog.text == ""

    # Dropping the final byte (a trailing "\n") must go unnoticed as well.
    reader = PdfReader(BytesIO(pdf_bytes[:-1]))
    assert caplog.text == ""

    # Cutting 2 to 5 bytes leaves a recognisable fragment of the marker:
    # a warning is logged but the same startxref is still located.
    for cut in range(2, 6):
        caplog.clear()
        reader = PdfReader(BytesIO(pdf_bytes[:-cut]))
        assert "EOF marker seems truncated" in caplog.text
        assert reader._startxref == expected_startxref

    # With the marker removed entirely, the last section is not read:
    # the reader warns and ends up on an earlier startxref.
    caplog.clear()
    reader = PdfReader(BytesIO(pdf_bytes[:-6]))
    assert "CAUTION: startxref found while searching for %%EOF" in caplog.text
    assert reader._startxref < expected_startxref


@pytest.mark.enable_socket()
def test_comments_in_array(caplog):
    """
    Check comment handling while reading the first page of the sample file.

    Regression test for #2843.
    """
    pdf_bytes = get_data_from_url(
        "https://github.com/user-attachments/files/16992416/crash-2347912aa2a6f0fab5df4ebc8a424735d5d0d128.pdf",
        name="iss2843.pdf",  # reused
    )

    # Complete file: the first page is reachable and nothing is logged.
    full_reader = PdfReader(BytesIO(pdf_bytes))
    full_reader.pages[0]
    assert caplog.text == ""

    # Swap in a stream cut at byte 1149 after the reader has been built;
    # accessing the first page must then raise PdfStreamError.
    cut_reader = PdfReader(BytesIO(pdf_bytes))
    cut_reader.stream = BytesIO(pdf_bytes[:1149])
    with pytest.raises(PdfStreamError):
        cut_reader.pages[0]

0 comments on commit 2f4365a

Please sign in to comment.