Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ROB : cope with invalid length in streams #861

Merged
merged 11 commits into from
May 7, 2022
28 changes: 28 additions & 0 deletions PyPDF2/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -578,6 +578,29 @@ def writeToStream(self, stream, encryption_key):

@staticmethod
def readFromStream(stream, pdf):
def getNextObjPos(p, p1, remGens, pdf):
l = pdf.xref[remGens[0]]
for o in l:
if p1 > l[o] and p < l[o]:
p1 = l[o]
if len(remGens) == 1:
return p1
else:
return getNextObjPos(p, p1, remGens[1:], pdf)

def readUnsizedFromSteam(stream, pdf):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't understand this function. How does it know where the ream ends? Couldn't there be multiple objects in a stream?

Can you point me to a resource where I can read up on PDF streams in general? I would appreciate that a lot 🙏

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

From my analysis/tests, the stream were most of the time compressed/encoded
Also we have to remember that this function will only be used to load some data when the PDF file is corrupted.

I peek up some information in the standard
https://opensource.adobe.com/dc-acrobat-sdk-docs/standards/pdfstandards/pdf/PDF32000_2008.pdf
and also in those:
https://www.adobe.com/technology/pdfs/presentations/KingPDFTutorial.pdf
https://www.oreilly.com/library/view/developing-with-pdf/9781449327903/ch01.html

for ref also so test files
https://github.com/pdf-association/pdf20examples

# we are just pointing at beginning of the stream
eon = getNextObjPos(stream.tell(), 2**32, [g for g in pdf.xref], pdf) - 1
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what does "eon" mean?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

end before next object : we have to find when to stop looking for endstream

curr = stream.tell()
rw = stream.read(eon - stream.tell())
p = rw.find(b_("endstream"))
if p < 0:
raise PdfReadError(
f"Unable to find 'endstream' marker for obj starting at {curr}."
)
stream.seek(curr + p + 9)
return rw[: p - 1]

tmp = stream.read(2)
if tmp != b_("<<"):
raise PdfReadError(
Expand Down Expand Up @@ -641,6 +664,7 @@ def readFromStream(stream, pdf):
t = stream.tell()
length = pdf.getObject(length)
stream.seek(t, 0)
pstart = stream.tell()
data["__streamdata__"] = stream.read(length)
e = readNonWhitespace(stream)
ndstream = stream.read(8)
Expand All @@ -657,6 +681,10 @@ def readFromStream(stream, pdf):
if end == b_("endstream"):
# we found it by looking back one character further.
data["__streamdata__"] = data["__streamdata__"][:-1]
elif not pdf.strict:
stream.seek(pstart, 0)
data["__streamdata__"] = readUnsizedFromSteam(stream, pdf)
pos = stream.tell()
Comment on lines +684 to +687
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let me check my understanding: If this part is reached, then we are in a situation in which the "endstream" marker was not found where it should be. As we are in best effort mode, we go back to the very beginning of the stream. From there, we read as much as necessary. Right?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we read up to the endstream tag

else:
stream.seek(pos, 0)
raise PdfReadError(
Expand Down
Binary file added resources/issue-301.pdf
Binary file not shown.
53 changes: 36 additions & 17 deletions tests/test_generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -280,31 +280,50 @@ def test_DictionaryObject_read_from_stream_stream_no_newline():
assert exc.value.args[0] == "Stream data must be followed by a newline"


def test_DictionaryObject_read_from_stream_stream_no_stream_length():
@pytest.mark.parametrize(("strict"), [(True), (False)])
def test_DictionaryObject_read_from_stream_stream_no_stream_length(strict):
stream = BytesIO(b"<< /S /GoTo >>stream\n")
pdf = None

class tst: # to replace pdf
strict = False

pdf = tst()
pdf.strict = strict
with pytest.raises(PdfReadError) as exc:
DictionaryObject.readFromStream(stream, pdf)
assert exc.value.args[0] == "Stream length not defined"


def test_DictionaryObject_read_from_stream_stream_stream_missing_endstream2():
stream = BytesIO(b"<< /S /GoTo /Length 10 >>stream\n ")
pdf = None
with pytest.raises(PdfReadError) as exc:
DictionaryObject.readFromStream(stream, pdf)
assert (
exc.value.args[0]
== "Unable to find 'endstream' marker after stream at byte 0x21."
)
@pytest.mark.parametrize(
("strict", "length", "shouldFail"),
[
(True, 6, False),
(True, 10, False),
(True, 4, True),
(False, 6, False),
(False, 10, False),
],
)
def test_DictionaryObject_read_from_stream_stream_stream_valid(
strict, length, shouldFail
):
stream = BytesIO(b"<< /S /GoTo /Length %d >>stream\nBT /F1\nendstream\n" % length)

class tst: # to replace pdf
strict = True

pdf = tst()
pdf.strict = strict
with pytest.raises(PdfReadError) as exc:
do = DictionaryObject.readFromStream(stream, pdf)
# TODO: What should happen with the stream?
assert do == {"/S": "/GoTo"}
if length in (6, 10):
assert b"BT /F1" in do._StreamObject__data
raise PdfReadError("__ALLGOOD__")
print(exc.value)
assert shouldFail ^ (exc.value.args[0] == "__ALLGOOD__")

def test_DictionaryObject_read_from_stream_stream_stream_valid():
stream = BytesIO(b"<< /S /GoTo /Length 10 >>stream\nBT /F1\nendstream\n")
pdf = None
do = DictionaryObject.readFromStream(stream, pdf)
# TODO: What should happen with the stream?
assert do == {"/S": "/GoTo"}


def test_RectangleObject():
Expand Down
14 changes: 12 additions & 2 deletions tests/test_writer.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os
from io import BytesIO

import pytest

Expand Down Expand Up @@ -334,8 +335,6 @@ def test_add_link():

def test_io_streams():
"""This is the example from the docs ("Streaming data")."""
# Arrange
from io import BytesIO

filepath = os.path.join(RESOURCE_ROOT, "pdflatex-outline.pdf")
with open(filepath, "rb") as fh:
Expand All @@ -359,3 +358,14 @@ def test_regression_issue670():
pdf_writer.addPage(reader.getPage(0))
with open("dont_commit_issue670.pdf", "wb") as f_pdf:
pdf_writer.write(f_pdf)

def test_issue301():
"""
Test with invalid stream length object
"""
with open(os.path.join(RESOURCE_ROOT, "issue-301.pdf"), "rb") as f:
r = PdfFileReader(f)
w = PdfFileWriter()
w.appendPagesFromReader(r)
o = BytesIO()
w.write(o)