Attempt to handle decompression error on some broken PDF files

From time to time we come across files in which no text is detected, even though readers such as evince render the PDF correctly. After some digging, it turned out that this happens because the PDF includes some badly compressed data (an incorrect checksum). This can be worked around by decompressing byte by byte and ignoring the error raised on the checksum bytes (empirically found to be the last 4, which seems consistent with an int32 checksum). This has been largely inspired by py-pdf/pypdf#422 and the test file has been taken from there, so credits to @zegrep.
pdfminer · Sep 20, 2021 · 0433a55 · 0433a55
1 parent 106b13b
commit 0433a55
Showing 1 changed file with 13 additions and 13 deletions.
diff --git a/pdfminer/pdftypes.py b/pdfminer/pdftypes.py
@@ -1,18 +1,14 @@
-import zlib
-import logging
 import io
+import logging
+import zlib
+
+from . import settings
+from .ascii85 import ascii85decode, asciihexdecode
+from .ccitt import ccittfaxdecode
 from .lzw import lzwdecode
-from .ascii85 import ascii85decode
-from .ascii85 import asciihexdecode
+from .psparser import LIT, PSException, PSObject
 from .runlength import rldecode
-from .ccitt import ccittfaxdecode
-from .psparser import PSException
-from .psparser import PSObject
-from .psparser import LIT
-from . import settings
-from .utils import apply_png_predictor
-from .utils import isnumber
-
+from .utils import apply_png_predictor, isnumber
 
 log = logging.getLogger(__name__)
 
@@ -187,7 +183,10 @@ def stream_value(x):
 
 
 def decompress_corrupted(data):
-    d = zlib.decompressobj(zlib.MAX_WBITS | 32)
+    """Called on some data that can't be properly decoded because of CRC checksum
+    error. Attempt to decode it skipping the CRC.
+    """
+    d = zlib.decompressobj()
     f = io.BytesIO(data)
     result_str = b''
     buffer = f.read(1)
@@ -198,6 +197,7 @@ def decompress_corrupted(data):
             buffer = f.read(1)
             i += 1
     except zlib.error:
+        # Let the error propagates if we're not yet in the CRC checksum
         if i < len(data) - 3:
             raise
     return result_str