Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: Robustify parsing for Object streams in XRef rebuilding #2818

Merged
merged 5 commits into from
Sep 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 36 additions & 2 deletions pypdf/_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@
NullObject,
NumberObject,
PdfObject,
StreamObject,
TextStringObject,
read_object,
)
Expand Down Expand Up @@ -316,8 +317,6 @@ def _get_object_from_stream(
obj_stm: EncodedStreamObject = IndirectObject(stmnum, 0, self).get_object() # type: ignore
# This is an xref to a stream, so its type better be a stream
assert cast(str, obj_stm["/Type"]) == "/ObjStm"
# /N is the number of indirect objects in the stream
assert idx < obj_stm["/N"]
stream_data = BytesIO(obj_stm.get_data())
for i in range(obj_stm["/N"]): # type: ignore
read_non_whitespace(stream_data)
Expand Down Expand Up @@ -999,6 +998,41 @@ def _rebuild_xref_table(self, stream: StreamType) -> None:
if generation not in self.xref:
self.xref[generation] = {}
self.xref[generation][idnum] = m.start(1)

logger_warning("parsing for Object Streams", __name__)
pubpub-zz marked this conversation as resolved.
Show resolved Hide resolved
for g in self.xref:
for i in self.xref[g]:
# manual equivalent of get_object: seek to the object's offset and parse it directly
stream.seek(self.xref[g][i], 0)
try:
_ = self.read_object_header(stream)
o = cast(StreamObject, read_object(stream, self))
if o.get("/Type", "") != "/ObjStm":
continue
strm = BytesIO(o.get_data())
cpt = 0
while True:
s = read_until_whitespace(strm)
if not s.isdigit():
break
_i = int(s)
skip_over_whitespace(strm)
strm.seek(-1, 1)
s = read_until_whitespace(strm)
if not s.isdigit(): # pragma: no cover
break # pragma: no cover
_o = int(s)
self.xref_objStm[_i] = (i, _o)
cpt += 1
if cpt != o.get("/N"): # pragma: no cover
logger_warning( # pragma: no cover
f"found {cpt} objects within Object({i},{g})"
f" whereas {o.get('/N')} expected",
__name__,
)
except Exception: # could have many causes
pass

stream.seek(0, 0)
for m in re.finditer(rb"[\r\n \t][ \t]*trailer[\r\n \t]*(<<)", f_):
stream.seek(m.start(1), 0)
Expand Down
8 changes: 2 additions & 6 deletions tests/test_filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
from io import BytesIO
from itertools import product as cartesian_product
from pathlib import Path
from unittest.mock import patch

import pytest
from PIL import Image
Expand Down Expand Up @@ -225,14 +224,11 @@ def test_ccitt_fax_decode():


@pytest.mark.enable_socket()
@patch("pypdf._reader.logger_warning")
def test_decompress_zlib_error(mock_logger_warning):
def test_decompress_zlib_error(caplog):
reader = PdfReader(BytesIO(get_data_from_url(name="tika-952445.pdf")))
for page in reader.pages:
page.extract_text()
mock_logger_warning.assert_called_with(
"incorrect startxref pointer(3)", "pypdf._reader"
)
assert "incorrect startxref pointer(3)" in caplog.text


@pytest.mark.enable_socket()
Expand Down
42 changes: 35 additions & 7 deletions tests/test_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -276,14 +276,22 @@ def test_get_images(src, expected_images):
False,
0,
False,
["startxref on same line as offset", "incorrect startxref pointer(1)"],
[
"startxref on same line as offset",
"incorrect startxref pointer(1)",
"parsing for Object Streams",
],
), # error on startxref, but no strict => xref rebuilt,no fail
(
False,
True,
0,
False,
["startxref on same line as offset", "incorrect startxref pointer(1)"],
[
"startxref on same line as offset",
"incorrect startxref pointer(1)",
"parsing for Object Streams",
],
),
],
)
Expand Down Expand Up @@ -344,7 +352,10 @@ def test_issue297(caplog):
assert caplog.text == ""
assert "Broken xref table" in exc.value.args[0]
reader = PdfReader(path, strict=False)
assert normalize_warnings(caplog.text) == ["incorrect startxref pointer(1)"]
assert normalize_warnings(caplog.text) == [
"incorrect startxref pointer(1)",
"parsing for Object Streams",
]
reader.pages[0]


Expand Down Expand Up @@ -898,23 +909,28 @@ def test_form_topname_with_and_without_acroform(caplog):
def test_extract_text_xref_issue_2(caplog):
# pdf/0264cf510015b2a4b395a15cb23c001e.pdf
url = "https://corpora.tika.apache.org/base/docs/govdocs1/981/981961.pdf"
msg = "incorrect startxref pointer(2)"
msg = [
"incorrect startxref pointer(2)",
"parsing for Object Streams",
]
reader = PdfReader(BytesIO(get_data_from_url(url, name="tika-981961.pdf")))
for page in reader.pages:
page.extract_text()
assert normalize_warnings(caplog.text) == [msg]
assert normalize_warnings(caplog.text) == msg


@pytest.mark.enable_socket()
@pytest.mark.slow()
def test_extract_text_xref_issue_3(caplog):
# pdf/0264cf510015b2a4b395a15cb23c001e.pdf
url = "https://corpora.tika.apache.org/base/docs/govdocs1/977/977774.pdf"
msg = "incorrect startxref pointer(3)"
msg = [
"incorrect startxref pointer(3)",
]
reader = PdfReader(BytesIO(get_data_from_url(url, name="tika-977774.pdf")))
for page in reader.pages:
page.extract_text()
assert normalize_warnings(caplog.text) == [msg]
assert normalize_warnings(caplog.text) == msg


@pytest.mark.enable_socket()
Expand Down Expand Up @@ -1589,3 +1605,15 @@ def test_iss2761():
reader = PdfReader(BytesIO(get_data_from_url(url, name=name)), strict=False)
with pytest.raises(PdfReadError):
reader.pages[0].extract_text()


@pytest.mark.enable_socket()
def test_iss2817():
    """Test for rebuilding Xref_ObjStm."""
    pdf_url = "https://github.com/user-attachments/files/16764070/crash-7e1356f1179b4198337f282304cb611aea26a199.pdf"
    pdf_bytes = get_data_from_url(pdf_url, name="iss2817.pdf")
    reader = PdfReader(BytesIO(pdf_bytes))
    # Resolve the first annotation of the first page and check its text;
    # presumably this object is only reachable once the object-stream
    # entries have been recovered during xref rebuilding -- TODO confirm.
    first_annotation = reader.pages[0]["/Annots"][0].get_object()
    expected_contents = "A\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0 B"
    assert first_annotation["/Contents"] == expected_contents
Loading