From 640df5466cc7e5359db871c47f2fef8a4992f9cb Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Tue, 27 Aug 2024 19:23:22 +0200 Subject: [PATCH 1/2] ENH: Robustify parsing for Object streams in XRef rebuilding closes #2817 --- pypdf/_reader.py | 38 ++++++++++++++++++++++++++++++++++++-- tests/test_filters.py | 8 ++------ tests/test_reader.py | 42 +++++++++++++++++++++++++++++++++++------- 3 files changed, 73 insertions(+), 15 deletions(-) diff --git a/pypdf/_reader.py b/pypdf/_reader.py index 1452661a5..8b0a9f488 100644 --- a/pypdf/_reader.py +++ b/pypdf/_reader.py @@ -77,6 +77,7 @@ NullObject, NumberObject, PdfObject, + StreamObject, TextStringObject, read_object, ) @@ -315,8 +316,6 @@ def _get_object_from_stream( obj_stm: EncodedStreamObject = IndirectObject(stmnum, 0, self).get_object() # type: ignore # This is an xref to a stream, so its type better be a stream assert cast(str, obj_stm["/Type"]) == "/ObjStm" - # /N is the number of indirect objects in the stream - assert idx < obj_stm["/N"] stream_data = BytesIO(obj_stm.get_data()) for i in range(obj_stm["/N"]): # type: ignore read_non_whitespace(stream_data) @@ -997,6 +996,41 @@ def _rebuild_xref_table(self, stream: StreamType) -> None: if generation not in self.xref: self.xref[generation] = {} self.xref[generation][idnum] = m.start(1) + + logger_warning("parsing for Object Streams", __name__) + for g in self.xref: + for i in self.xref[g]: + # get_object in manual + stream.seek(self.xref[g][i], 0) + try: + _ = self.read_object_header(stream) + o = cast(StreamObject, read_object(stream, self)) + if o.get("/Type", "") != "/ObjStm": + continue + strm = BytesIO(o.get_data()) + cpt = 0 + while True: + s = read_until_whitespace(strm) + if not s.isdigit(): + break + _i = int(s) + skip_over_whitespace(strm) + strm.seek(-1, 1) + s = read_until_whitespace(strm) + if not s.isdigit(): + break + _o = int(s) + self.xref_objStm[_i] = (i, _o) + cpt += 1 + if cpt != o.get("/N"): + 
logger_warning( + f"found {cpt} objects within Object({i},{g})" + f" whereas {o.get('/N')} expected", + __name__, + ) + except Exception: # could have many causes + pass + stream.seek(0, 0) for m in re.finditer(rb"[\r\n \t][ \t]*trailer[\r\n \t]*(<<)", f_): stream.seek(m.start(1), 0) diff --git a/tests/test_filters.py b/tests/test_filters.py index 146ce43cb..632095888 100644 --- a/tests/test_filters.py +++ b/tests/test_filters.py @@ -5,7 +5,6 @@ from io import BytesIO from itertools import product as cartesian_product from pathlib import Path -from unittest.mock import patch import pytest from PIL import Image @@ -225,14 +224,11 @@ def test_ccitt_fax_decode(): @pytest.mark.enable_socket() -@patch("pypdf._reader.logger_warning") -def test_decompress_zlib_error(mock_logger_warning): +def test_decompress_zlib_error(caplog): reader = PdfReader(BytesIO(get_data_from_url(name="tika-952445.pdf"))) for page in reader.pages: page.extract_text() - mock_logger_warning.assert_called_with( - "incorrect startxref pointer(3)", "pypdf._reader" - ) + assert "incorrect startxref pointer(3)" in caplog.text @pytest.mark.enable_socket() diff --git a/tests/test_reader.py b/tests/test_reader.py index d2394f95d..bd3e6aa68 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -276,14 +276,22 @@ def test_get_images(src, expected_images): False, 0, False, - ["startxref on same line as offset", "incorrect startxref pointer(1)"], + [ + "startxref on same line as offset", + "incorrect startxref pointer(1)", + "parsing for Object Streams", + ], ), # error on startxref, but no strict => xref rebuilt,no fail ( False, True, 0, False, - ["startxref on same line as offset", "incorrect startxref pointer(1)"], + [ + "startxref on same line as offset", + "incorrect startxref pointer(1)", + "parsing for Object Streams", + ], ), ], ) @@ -344,7 +352,10 @@ def test_issue297(caplog): assert caplog.text == "" assert "Broken xref table" in exc.value.args[0] reader = PdfReader(path, strict=False) - 
assert normalize_warnings(caplog.text) == ["incorrect startxref pointer(1)"] + assert normalize_warnings(caplog.text) == [ + "incorrect startxref pointer(1)", + "parsing for Object Streams", + ] reader.pages[0] @@ -898,11 +909,14 @@ def test_form_topname_with_and_without_acroform(caplog): def test_extract_text_xref_issue_2(caplog): # pdf/0264cf510015b2a4b395a15cb23c001e.pdf url = "https://corpora.tika.apache.org/base/docs/govdocs1/981/981961.pdf" - msg = "incorrect startxref pointer(2)" + msg = [ + "incorrect startxref pointer(2)", + "parsing for Object Streams", + ] reader = PdfReader(BytesIO(get_data_from_url(url, name="tika-981961.pdf"))) for page in reader.pages: page.extract_text() - assert normalize_warnings(caplog.text) == [msg] + assert normalize_warnings(caplog.text) == msg @pytest.mark.enable_socket() @@ -910,11 +924,13 @@ def test_extract_text_xref_issue_2(caplog): def test_extract_text_xref_issue_3(caplog): # pdf/0264cf510015b2a4b395a15cb23c001e.pdf url = "https://corpora.tika.apache.org/base/docs/govdocs1/977/977774.pdf" - msg = "incorrect startxref pointer(3)" + msg = [ + "incorrect startxref pointer(3)", + ] reader = PdfReader(BytesIO(get_data_from_url(url, name="tika-977774.pdf"))) for page in reader.pages: page.extract_text() - assert normalize_warnings(caplog.text) == [msg] + assert normalize_warnings(caplog.text) == msg @pytest.mark.enable_socket() @@ -1589,3 +1605,15 @@ def test_iss2761(): reader = PdfReader(BytesIO(get_data_from_url(url, name=name)), strict=False) with pytest.raises(PdfReadError): reader.pages[0].extract_text() + + +@pytest.mark.enable_socket() +def test_iss2817(): + """Test for rebuilding Xref_ObjStm""" + url = "https://github.com/user-attachments/files/16764070/crash-7e1356f1179b4198337f282304cb611aea26a199.pdf" + name = "iss2817.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + assert ( + reader.pages[0]["/Annots"][0].get_object()["/Contents"] + == "A\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0 B" + ) From 
b587e9ad19c8a67e4c97f9a67ab752de358d7382 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Tue, 27 Aug 2024 19:41:45 +0200 Subject: [PATCH 2/2] coverage --- pypdf/_reader.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pypdf/_reader.py b/pypdf/_reader.py index 8b0a9f488..f505abf75 100644 --- a/pypdf/_reader.py +++ b/pypdf/_reader.py @@ -1017,13 +1017,13 @@ def _rebuild_xref_table(self, stream: StreamType) -> None: skip_over_whitespace(strm) strm.seek(-1, 1) s = read_until_whitespace(strm) - if not s.isdigit(): - break + if not s.isdigit(): # pragma: no cover + break # pragma: no cover _o = int(s) self.xref_objStm[_i] = (i, _o) cpt += 1 - if cpt != o.get("/N"): - logger_warning( + if cpt != o.get("/N"): # pragma: no cover + logger_warning( # pragma: no cover f"found {cpt} objects within Object({i},{g})" f" whereas {o.get('/N')} expected", __name__,