Merge branch 'main' into reader-minor-sty

py-pdf · Sep 19, 2024 · 2f4365a · 2f4365a
2 parents af2c05f + 7e4a0d6
commit 2f4365a
Show file tree

Hide file tree

Showing 4 changed files with 63 additions and 6 deletions.
diff --git a/pypdf/_reader.py b/pypdf/_reader.py
@@ -659,7 +659,23 @@ def _find_eof_marker(self, stream: StreamType) -> None:
  """
  HEADER_SIZE = 8 # to parse whole file, Header is e.g. '%PDF-1.6'
  line = b""
+ first = True
  while line[:5] != b"%%EOF":
+ if line != b"" and first:
+ if any(
+ line.strip().endswith(tr) for tr in (b"%%EO", b"%%E", b"%%", b"%")
+ ):
+ # Only a prefix of the %%EOF marker is left: treat the file as
+ # truncated, but it is recognisable enough to carry on.
+ logger_warning("EOF marker seems truncated", __name__)
+ break
+ first = False
+ if b"startxref" in line:
+ logger_warning(
+ "CAUTION: startxref found while searching for %%EOF. "
+ "The file might be truncated and some data might not be read.",
+ __name__,
+ )
  if stream.tell() < HEADER_SIZE:
  if self.strict:
  raise PdfReadError("EOF marker not found")

diff --git a/pypdf/_utils.py b/pypdf/_utils.py
@@ -206,6 +206,8 @@ def skip_over_comment(stream: StreamType) -> None:
  if tok == b"%":
  while tok not in (b"\n", b"\r"):
  tok = stream.read(1)
+ if tok == b"":
+ raise PdfStreamError("File ended unexpectedly.")
 
 
 def read_until_regex(stream: StreamType, regex: Pattern[bytes]) -> bytes:

diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py
@@ -232,6 +232,10 @@ def read_from_stream(
  tok = stream.read(1)
  while tok.isspace():
  tok = stream.read(1)
+ if tok == b"%":
+ stream.seek(-1, 1)
+ skip_over_comment(stream)
+ continue
  stream.seek(-1, 1)
  # check for array ending
  peek_ahead = stream.read(1)
@@ -1341,12 +1345,7 @@ def read_object(
  return NullObject.read_from_stream(stream)
  elif tok == b"%":
  # comment
- while tok not in (b"\r", b"\n"):
- tok = stream.read(1)
- # Prevents an infinite loop by raising an error if the stream is at
- # the EOF
- if len(tok) <= 0:
- raise PdfStreamError("File ended unexpectedly.")
+ skip_over_comment(stream)
  tok = read_non_whitespace(stream)
  stream.seek(-1, 1)
  return read_object(stream, pdf, forced_encoding)

diff --git a/tests/test_reader.py b/tests/test_reader.py
@@ -17,6 +17,7 @@
  EmptyFileError,
  FileNotDecryptedError,
  PdfReadError,
+ PdfStreamError,
  WrongPasswordError,
 )
 from pypdf.generic import (
@@ -1617,3 +1618,42 @@ def test_iss2817():
  reader.pages[0]["/Annots"][0].get_object()["/Contents"]
  == "A\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0 B"
  )
+
+
+@pytest.mark.enable_socket()
+def test_truncated_files(caplog):
+ """Cf. #2853: a truncated %%EOF marker is tolerated and logged as a warning."""
+ url = "https://github.com/user-attachments/files/16796095/f5471sm-2.pdf"
+ name = "iss2780.pdf" # reused
+ b = get_data_from_url(url, name=name)
+ reader = PdfReader(BytesIO(b))
+ assert caplog.text == ""
+ # Remove the trailing \n at the end of the file: no warning is expected
+ reader = PdfReader(BytesIO(b[:-1]))
+ assert caplog.text == ""
+ # Truncate inside the %%EOF marker (%%EO, %%E, %%, %): still detectable
+ for i in range(-2, -6, -1):
+ caplog.clear()
+ reader = PdfReader(BytesIO(b[:i]))
+ assert "EOF marker seems truncated" in caplog.text
+ assert reader._startxref == 100993
+ # Remove the %%EOF marker completely: the last section will not be read
+ caplog.clear()
+ reader = PdfReader(BytesIO(b[:-6]))
+ assert "CAUTION: startxref found while searching for %%EOF" in caplog.text
+ assert reader._startxref < 100993
+
+
+@pytest.mark.enable_socket()
+def test_comments_in_array(caplog):
+ """Cf. #2843: comments inside arrays are skipped; a truncated stream raises PdfStreamError."""
+ url = "https://github.com/user-attachments/files/16992416/crash-2347912aa2a6f0fab5df4ebc8a424735d5d0d128.pdf"
+ name = "iss2843.pdf" # reused
+ b = get_data_from_url(url, name=name)
+ reader = PdfReader(BytesIO(b))
+ reader.pages[0]
+ assert caplog.text == ""
+ reader = PdfReader(BytesIO(b))
+ reader.stream = BytesIO(b[:1149])
+ with pytest.raises(PdfStreamError):
+ reader.pages[0]