py-pdf · MartinThoma · May 7, 2022 · May 1, 2022 · May 1, 2022 · May 1, 2022
diff --git a/PyPDF2/generic.py b/PyPDF2/generic.py
@@ -578,6 +578,29 @@ def writeToStream(self, stream, encryption_key):
 
     @staticmethod
     def readFromStream(stream, pdf):
+        def getNextObjPos(p, p1, remGens, pdf):
+            """Return the smallest ``pdf.xref`` entry strictly between ``p`` and ``p1``.
+
+            ``pdf.xref[gen]`` is looked up for every generation in ``remGens``
+            and ``p1`` is lowered to the closest value found above ``p``.
+            The values are presumably byte offsets of objects (the caller
+            passes ``stream.tell()`` as ``p``).
+
+            ``p1`` is returned unchanged when no entry lies in the open range
+            ``(p, p1)`` or when ``remGens`` is empty; the empty case used to
+            raise ``IndexError``.
+            """
+            # A plain loop replaces the former recursion over ``remGens[1:]``:
+            # same visiting order, no list copies, no recursion-depth limit.
+            for gen in remGens:
+                table = pdf.xref[gen]
+                for idnum in table:
+                    offset = table[idnum]
+                    if p < offset < p1:
+                        p1 = offset
+            return p1
+
+        def readUnsizedFromSteam(stream, pdf):
+            """Read stream data by searching for the ``endstream`` keyword.
+
+            ``stream`` must be positioned on the first byte of the stream data.
+            The search window ends one byte before the next entry that
+            ``getNextObjPos`` finds in ``pdf.xref``; when there is no such
+            entry the ``2**32`` sentinel is used as the upper bound.
+
+            Returns the bytes preceding ``endstream``, without the single
+            end-of-line byte that separates them from the keyword, and leaves
+            ``stream`` positioned right after the keyword.
+
+            Raises ``PdfReadError`` when ``endstream`` is not inside the window.
+            """
+            start = stream.tell()
+            limit = getNextObjPos(start, 2**32, list(pdf.xref), pdf) - 1
+            raw = stream.read(limit - start)
+            found = raw.find(b_("endstream"))
+            if found < 0:
+                raise PdfReadError(
+                    f"Unable to find 'endstream' marker for obj starting at {start}."
+                )
+            # 9 == len("endstream"): continue parsing right after the keyword.
+            stream.seek(start + found + 9)
+            payload = raw[:found]
+            # Drop the one EOL byte in front of the keyword, but only if it is
+            # really there.  The former ``raw[: found - 1]`` turned into
+            # ``raw[:-1]`` for ``found == 0`` (returning the keyword itself as
+            # data) and removed a payload byte when no EOL was present.
+            if payload[-1:] in (b"\n", b"\r"):
+                payload = payload[:-1]
+            return payload
+
         tmp = stream.read(2)
         if tmp != b_("<<"):
             raise PdfReadError(
@@ -641,6 +664,7 @@ def readFromStream(stream, pdf):
                 t = stream.tell()
                 length = pdf.getObject(length)
                 stream.seek(t, 0)
+            pstart = stream.tell()
             data["__streamdata__"] = stream.read(length)
             e = readNonWhitespace(stream)
             ndstream = stream.read(8)
@@ -657,6 +681,10 @@ def readFromStream(stream, pdf):
                 if end == b_("endstream"):
                     # we found it by looking back one character further.
                     data["__streamdata__"] = data["__streamdata__"][:-1]
+                elif not pdf.strict:
+                    stream.seek(pstart, 0)
+                    data["__streamdata__"] = readUnsizedFromSteam(stream, pdf)
+                    pos = stream.tell()
                 else:
                     stream.seek(pos, 0)
                     raise PdfReadError(

diff --git a/resources/issue-301.pdf b/resources/issue-301.pdf
diff --git a/tests/test_generic.py b/tests/test_generic.py
@@ -280,31 +280,50 @@ def test_DictionaryObject_read_from_stream_stream_no_newline():
     assert exc.value.args[0] == "Stream data must be followed by a newline"
 
 
-def test_DictionaryObject_read_from_stream_stream_no_stream_length():
+@pytest.mark.parametrize("strict", [True, False])
+def test_DictionaryObject_read_from_stream_stream_no_stream_length(strict):
+    """Missing /Length raises PdfReadError in both strict and lenient mode."""
     stream = BytesIO(b"<< /S /GoTo >>stream\n")
-    pdf = None
+
+    class FakePdf:  # stand-in for the reader; carries only the ``strict`` flag
+        pass
+
+    pdf = FakePdf()
+    pdf.strict = strict
     with pytest.raises(PdfReadError) as exc:
         DictionaryObject.readFromStream(stream, pdf)
     assert exc.value.args[0] == "Stream length not defined"
 
 
-def test_DictionaryObject_read_from_stream_stream_stream_missing_endstream2():
-    stream = BytesIO(b"<< /S /GoTo /Length 10 >>stream\n ")
-    pdf = None
-    with pytest.raises(PdfReadError) as exc:
-        DictionaryObject.readFromStream(stream, pdf)
-    assert (
-        exc.value.args[0]
-        == "Unable to find 'endstream' marker after stream at byte 0x21."
-    )
+@pytest.mark.parametrize(
+    ("strict", "length", "shouldFail"),
+    [
+        (True, 6, False),
+        (True, 10, False),
+        (True, 4, True),
+        (False, 6, False),
+        (False, 10, False),
+    ],
+)
+def test_DictionaryObject_read_from_stream_stream_stream_valid(
+    strict, length, shouldFail
+):
+    """Check reading a 6-byte stream with an exact, too large or too small /Length.
+
+    The payload ``BT /F1`` is 6 bytes long.  ``shouldFail`` states whether
+    ``readFromStream`` is expected to raise ``PdfReadError`` for the given
+    ``strict`` / ``length`` combination.
+    """
+    stream = BytesIO(b"<< /S /GoTo /Length %d >>stream\nBT /F1\nendstream\n" % length)
 
+    class FakePdf:  # stand-in for the reader; carries only the ``strict`` flag
+        pass
+
+    pdf = FakePdf()
+    pdf.strict = strict
+
+    # Expected-failure and expected-success paths are kept apart so pytest
+    # reports the real cause instead of a failed XOR on a sentinel exception.
+    if shouldFail:
+        with pytest.raises(PdfReadError):
+            DictionaryObject.readFromStream(stream, pdf)
+        return
+
+    do = DictionaryObject.readFromStream(stream, pdf)
+    # TODO: What should happen with the stream?
+    assert do == {"/S": "/GoTo"}
+    assert b"BT /F1" in do._StreamObject__data
 
-def test_DictionaryObject_read_from_stream_stream_stream_valid():
-    stream = BytesIO(b"<< /S /GoTo /Length 10 >>stream\nBT /F1\nendstream\n")
-    pdf = None
-    do = DictionaryObject.readFromStream(stream, pdf)
-    # TODO: What should happen with the stream?
-    assert do == {"/S": "/GoTo"}
 
 
 def test_RectangleObject():

diff --git a/tests/test_writer.py b/tests/test_writer.py
@@ -1,4 +1,5 @@
 import os
+from io import BytesIO
 
 import pytest
 
@@ -334,8 +335,6 @@ def test_add_link():
 
 def test_io_streams():
     """This is the example from the docs ("Streaming data")."""
-    # Arrange
-    from io import BytesIO
 
     filepath = os.path.join(RESOURCE_ROOT, "pdflatex-outline.pdf")
     with open(filepath, "rb") as fh:
@@ -359,3 +358,14 @@ def test_regression_issue670():
         pdf_writer.addPage(reader.getPage(0))
         with open("dont_commit_issue670.pdf", "wb") as f_pdf:
             pdf_writer.write(f_pdf)
+
+def test_issue301():
+    """Read and re-write ``issue-301.pdf``, a file with an invalid stream length object.
+
+    The test passes when copying all pages into a writer and serialising
+    them to an in-memory buffer completes without raising.
+    """
+    pdf_path = os.path.join(RESOURCE_ROOT, "issue-301.pdf")
+    with open(pdf_path, "rb") as pdf_file:
+        reader = PdfFileReader(pdf_file)
+        writer = PdfFileWriter()
+        writer.appendPagesFromReader(reader)
+        writer.write(BytesIO())