py-pdf · stefan6419846 · Sep 18, 2024 · Sep 17, 2024 · Sep 18, 2024 · Sep 18, 2024
diff --git a/pypdf/_reader.py b/pypdf/_reader.py
@@ -649,7 +649,23 @@ def _find_eof_marker(self, stream: StreamType) -> None:
  """
  HEADER_SIZE = 8 # to parse whole file, Header is e.g. '%PDF-1.6'
  line = b""
+ first = True
  while line[:5] != b"%%EOF":
+ if line != b"" and first:
+ if any(
+ line.strip().endswith(tr) for tr in (b"%%EO", b"%%E", b"%%", b"%")
+ ):
+ # Treat the file as truncated: enough of the
+ # EOF marker is left to confidently carry on.
+ logger_warning("EOF marker seems truncated", __name__)
+ break
+ first = False
+ if b"startxref" in line:
+ logger_warning(
+ "CAUTION: startxref found while searching for %%EOF. "
+ "The file might be truncated and some data might not be read.",
+ __name__,
+ )
  if stream.tell() < HEADER_SIZE:
  if self.strict:
  raise PdfReadError("EOF marker not found")

diff --git a/tests/test_reader.py b/tests/test_reader.py
@@ -1617,3 +1617,27 @@ def test_iss2817():
  reader.pages[0]["/Annots"][0].get_object()["/Contents"]
  == "A\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0 B"
  )
+
+
+@pytest.mark.enable_socket()
+def test_truncated_files(caplog):
+ """Check handling of a truncated or missing %%EOF marker (cf. #2853)."""
+ url = "https://github.com/user-attachments/files/16796095/f5471sm-2.pdf"
+ name = "iss2780.pdf" # reused
+ b = get_data_from_url(url, name=name)
+ reader = PdfReader(BytesIO(b))
+ assert caplog.text == ""
+ # Dropping only the trailing \n is invisible: still nothing is logged.
+ reader = PdfReader(BytesIO(b[:-1]))
+ assert caplog.text == ""
+ # Cut off 2 to 5 bytes: truncation is still detected, expected startxref is found.
+ for i in range(-2, -6, -1):
+ caplog.clear()
+ reader = PdfReader(BytesIO(b[:i]))
+ assert "EOF marker seems truncated" in caplog.text
+ assert reader._startxref == 100993
+ # Remove the EOF marker completely: the last section will not be read.
+ caplog.clear()
+ reader = PdfReader(BytesIO(b[:-6]))
+ assert "CAUTION: startxref found while searching for %%EOF" in caplog.text
+ assert reader._startxref < 100993