From fd25a6a08768051d2c16c6a558e6e33e8eb70fc9 Mon Sep 17 00:00:00 2001 From: Jeremy Singer-Vine Date: Tue, 21 Nov 2017 23:23:09 -0500 Subject: [PATCH] Fix issue #41 ... ... in which PDF-object-referenced cropboxes/mediaboxes weren't being fully resolved. Thanks to @dankeemahill for flagging! --- README.md | 2 +- pdfplumber/_version.py | 2 +- pdfplumber/page.py | 27 ++++++++++++++++----------- 3 files changed, 18 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 377059f6..2bb3cd2f 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# PDFPlumber `v0.5.5` +# PDFPlumber `v0.5.6` Plumb a PDF for detailed information about each text character, rectangle, and line. Plus: Table extraction and visual debugging. diff --git a/pdfplumber/_version.py b/pdfplumber/_version.py index 7ecddb22..fd1ae936 100644 --- a/pdfplumber/_version.py +++ b/pdfplumber/_version.py @@ -1,2 +1,2 @@ -version_info = (0, 5, 5) +version_info = (0, 5, 6) __version__ = '.'.join(map(str, version_info)) diff --git a/pdfplumber/page.py b/pdfplumber/page.py index b784deda..4ba2bdfb 100644 --- a/pdfplumber/page.py +++ b/pdfplumber/page.py @@ -3,6 +3,7 @@ from .container import Container from copy import copy +from pdfminer.pdftypes import resolve_all from six import string_types import re lt_pat = re.compile(r"^LT") @@ -19,22 +20,26 @@ def __init__(self, pdf, page_obj, page_number=None, initial_doctop=0): self.page_obj.rotate = self.rotation self.initial_doctop = self.decimalize(initial_doctop) - cropbox = page_obj.attrs.get("CropBox", page_obj.attrs.get("MediaBox")) - self.cropbox = self.decimalize(cropbox) + cropbox = page_obj.attrs.get("CropBox") + mediabox = page_obj.attrs.get("MediaBox") + + self.cropbox = self.decimalize(resolve_all(cropbox)) if cropbox is not None else None + self.mediabox = self.decimalize(resolve_all(mediabox) or self.cropbox) + m = self.mediabox if self.rotation in [ 90, 270 ]: self.bbox = self.decimalize(( - min(cropbox[1], cropbox[3]), - min(cropbox[0], cropbox[2]), - max(cropbox[1], cropbox[3]), - max(cropbox[0], cropbox[2]), + min(m[1], m[3]), + min(m[0], m[2]), + max(m[1], m[3]), + max(m[0], m[2]), )) else: self.bbox = self.decimalize(( - min(cropbox[0], cropbox[2]), - min(cropbox[1], cropbox[3]), - max(cropbox[0], cropbox[2]), - max(cropbox[1], cropbox[3]), + min(m[0], m[2]), + min(m[1], m[3]), + max(m[0], m[2]), + max(m[1], m[3]), )) def decimalize(self, x): @@ -92,7 +97,7 @@ def point2coord(pt): ] def process_object(obj): - attr = dict((k, (v if k in NON_DECIMALIZE else d(v))) + attr = dict((k, (v if (k in NON_DECIMALIZE or v == None) else d(v))) for k, v in obj.__dict__.items() if k not in IGNORE)