Skip to content

Commit

Permalink
ENH: Robustify parsing for Object streams in XRef rebuilding (#2818)
Browse files Browse the repository at this point in the history
Closes #2817.
  • Loading branch information
pubpub-zz authored Sep 13, 2024
1 parent 98d4425 commit 9d54f63
Show file tree
Hide file tree
Showing 3 changed files with 73 additions and 15 deletions.
38 changes: 36 additions & 2 deletions pypdf/_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@
NullObject,
NumberObject,
PdfObject,
StreamObject,
TextStringObject,
read_object,
)
Expand Down Expand Up @@ -316,8 +317,6 @@ def _get_object_from_stream(
obj_stm: EncodedStreamObject = IndirectObject(stmnum, 0, self).get_object() # type: ignore
# This is an xref to a stream, so its type better be a stream
assert cast(str, obj_stm["/Type"]) == "/ObjStm"
# /N is the number of indirect objects in the stream
assert idx < obj_stm["/N"]
stream_data = BytesIO(obj_stm.get_data())
for i in range(obj_stm["/N"]): # type: ignore
read_non_whitespace(stream_data)
Expand Down Expand Up @@ -999,6 +998,41 @@ def _rebuild_xref_table(self, stream: StreamType) -> None:
if generation not in self.xref:
self.xref[generation] = {}
self.xref[generation][idnum] = m.start(1)

logger_warning("parsing for Object Streams", __name__)
for g in self.xref:
for i in self.xref[g]:
# manual equivalent of get_object
stream.seek(self.xref[g][i], 0)
try:
_ = self.read_object_header(stream)
o = cast(StreamObject, read_object(stream, self))
if o.get("/Type", "") != "/ObjStm":
continue
strm = BytesIO(o.get_data())
cpt = 0
while True:
s = read_until_whitespace(strm)
if not s.isdigit():
break
_i = int(s)
skip_over_whitespace(strm)
strm.seek(-1, 1)
s = read_until_whitespace(strm)
if not s.isdigit(): # pragma: no cover
break # pragma: no cover
_o = int(s)
self.xref_objStm[_i] = (i, _o)
cpt += 1
if cpt != o.get("/N"): # pragma: no cover
logger_warning( # pragma: no cover
f"found {cpt} objects within Object({i},{g})"
f" whereas {o.get('/N')} expected",
__name__,
)
except Exception: # could have many causes
pass

stream.seek(0, 0)
for m in re.finditer(rb"[\r\n \t][ \t]*trailer[\r\n \t]*(<<)", f_):
stream.seek(m.start(1), 0)
Expand Down
8 changes: 2 additions & 6 deletions tests/test_filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
from io import BytesIO
from itertools import product as cartesian_product
from pathlib import Path
from unittest.mock import patch

import pytest
from PIL import Image
Expand Down Expand Up @@ -225,14 +224,11 @@ def test_ccitt_fax_decode():


@pytest.mark.enable_socket()
@patch("pypdf._reader.logger_warning")
def test_decompress_zlib_error(mock_logger_warning):
def test_decompress_zlib_error(caplog):
reader = PdfReader(BytesIO(get_data_from_url(name="tika-952445.pdf")))
for page in reader.pages:
page.extract_text()
mock_logger_warning.assert_called_with(
"incorrect startxref pointer(3)", "pypdf._reader"
)
assert "incorrect startxref pointer(3)" in caplog.text


@pytest.mark.enable_socket()
Expand Down
42 changes: 35 additions & 7 deletions tests/test_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -276,14 +276,22 @@ def test_get_images(src, expected_images):
False,
0,
False,
["startxref on same line as offset", "incorrect startxref pointer(1)"],
[
"startxref on same line as offset",
"incorrect startxref pointer(1)",
"parsing for Object Streams",
],
), # error on startxref, but no strict => xref rebuilt, no fail
(
False,
True,
0,
False,
["startxref on same line as offset", "incorrect startxref pointer(1)"],
[
"startxref on same line as offset",
"incorrect startxref pointer(1)",
"parsing for Object Streams",
],
),
],
)
Expand Down Expand Up @@ -344,7 +352,10 @@ def test_issue297(caplog):
assert caplog.text == ""
assert "Broken xref table" in exc.value.args[0]
reader = PdfReader(path, strict=False)
assert normalize_warnings(caplog.text) == ["incorrect startxref pointer(1)"]
assert normalize_warnings(caplog.text) == [
"incorrect startxref pointer(1)",
"parsing for Object Streams",
]
reader.pages[0]


Expand Down Expand Up @@ -898,23 +909,28 @@ def test_form_topname_with_and_without_acroform(caplog):
def test_extract_text_xref_issue_2(caplog):
# pdf/0264cf510015b2a4b395a15cb23c001e.pdf
url = "https://corpora.tika.apache.org/base/docs/govdocs1/981/981961.pdf"
msg = "incorrect startxref pointer(2)"
msg = [
"incorrect startxref pointer(2)",
"parsing for Object Streams",
]
reader = PdfReader(BytesIO(get_data_from_url(url, name="tika-981961.pdf")))
for page in reader.pages:
page.extract_text()
assert normalize_warnings(caplog.text) == [msg]
assert normalize_warnings(caplog.text) == msg


@pytest.mark.enable_socket()
@pytest.mark.slow()
def test_extract_text_xref_issue_3(caplog):
# pdf/0264cf510015b2a4b395a15cb23c001e.pdf
url = "https://corpora.tika.apache.org/base/docs/govdocs1/977/977774.pdf"
msg = "incorrect startxref pointer(3)"
msg = [
"incorrect startxref pointer(3)",
]
reader = PdfReader(BytesIO(get_data_from_url(url, name="tika-977774.pdf")))
for page in reader.pages:
page.extract_text()
assert normalize_warnings(caplog.text) == [msg]
assert normalize_warnings(caplog.text) == msg


@pytest.mark.enable_socket()
Expand Down Expand Up @@ -1589,3 +1605,15 @@ def test_iss2761():
reader = PdfReader(BytesIO(get_data_from_url(url, name=name)), strict=False)
with pytest.raises(PdfReadError):
reader.pages[0].extract_text()


@pytest.mark.enable_socket()
def test_iss2817():
    """Test for rebuilding Xref_ObjStm (issue #2817)."""
    # Sample file attached to the issue; downloaded through the test helper.
    pdf_url = "https://github.com/user-attachments/files/16764070/crash-7e1356f1179b4198337f282304cb611aea26a199.pdf"
    pdf_name = "iss2817.pdf"
    pdf_bytes = get_data_from_url(pdf_url, name=pdf_name)
    reader = PdfReader(BytesIO(pdf_bytes))

    # The first annotation of the first page must resolve to a readable object.
    first_annotation = reader.pages[0]["/Annots"][0].get_object()
    expected_contents = "A\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0 B"
    assert first_annotation["/Contents"] == expected_contents

0 comments on commit 9d54f63

Please sign in to comment.