Skip to content

Commit

Permalink
Attempt to handle decompression error on some broken PDF files
Browse files Browse the repository at this point in the history
From time to time we come across files where no text is detected, while readers
like evince read the PDF nicely. After some digging, it turned out this is because the
PDF includes some badly compressed data (improper checksum). This may be fixed by
decompressing byte by byte and ignoring the error on the checksum bytes (arbitrarily
found to be the last 4, which seems consistent with an int32 checksum).

This has been largely inspired by py-pdf/pypdf#422
and the test file has been taken from there, so credits to @zegrep.
  • Loading branch information
Sylvain Thénault committed Sep 20, 2021
1 parent 106b13b commit 0433a55
Showing 1 changed file with 13 additions and 13 deletions.
26 changes: 13 additions & 13 deletions pdfminer/pdftypes.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,14 @@
import zlib
import logging
import io
import logging
import zlib

from . import settings
from .ascii85 import ascii85decode, asciihexdecode
from .ccitt import ccittfaxdecode
from .lzw import lzwdecode
from .ascii85 import ascii85decode
from .ascii85 import asciihexdecode
from .psparser import LIT, PSException, PSObject
from .runlength import rldecode
from .ccitt import ccittfaxdecode
from .psparser import PSException
from .psparser import PSObject
from .psparser import LIT
from . import settings
from .utils import apply_png_predictor
from .utils import isnumber

from .utils import apply_png_predictor, isnumber

log = logging.getLogger(__name__)

Expand Down Expand Up @@ -187,7 +183,10 @@ def stream_value(x):


def decompress_corrupted(data):
d = zlib.decompressobj(zlib.MAX_WBITS | 32)
"""Called on some data that can't be properly decoded because of CRC checksum
error. Attempt to decode it skipping the CRC.
"""
d = zlib.decompressobj()
f = io.BytesIO(data)
result_str = b''
buffer = f.read(1)
Expand All @@ -198,6 +197,7 @@ def decompress_corrupted(data):
buffer = f.read(1)
i += 1
except zlib.error:
# Let the error propagate if we're not yet in the CRC checksum
if i < len(data) - 3:
raise
return result_str
Expand Down

0 comments on commit 0433a55

Please sign in to comment.