Skip to content

Commit

Permalink
Fix bug in pdfminer.six's .resolve_all
Browse files Browse the repository at this point in the history
See #203

The root cause of the decimalization bug flagged in that issue was
pdfminer.six's .resolve_all method not recursively resolving items within
tuples. We now define our own method, with some slight optimization
tweaks.
  • Loading branch information
jsvine committed Apr 29, 2020
1 parent ab957de commit 85f422d
Show file tree
Hide file tree
Showing 4 changed files with 25 additions and 1 deletion.
3 changes: 2 additions & 1 deletion pdfplumber/page.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
from . import utils
from .utils import resolve_all
from .table import TableFinder
from .container import Container
from copy import copy

from pdfminer.pdftypes import resolve_all
from six import string_types
import re
# Precompiled pattern that matches strings beginning with "LT".
lt_pat = re.compile(r"^LT")


class Page(Container):
cached_properties = Container.cached_properties + [ "_layout" ]
is_original = True
Expand Down
18 changes: 18 additions & 0 deletions pdfplumber/utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from pdfminer.utils import PDFDocEncoding
from pdfminer.psparser import PSLiteral
from pdfminer.pdftypes import PDFObjRef
try:
from cdecimal import Decimal, ROUND_HALF_UP
except ImportError:
Expand Down Expand Up @@ -85,6 +86,23 @@ def decode_psl_list(_list):
return [ decode_text(value.name) if isinstance(value, PSLiteral) else value
for value in _list ]

# via pdfminer.pdftypes, altered slightly
def resolve_all(x):
    """
    Resolve ``x`` and everything nested inside it.

    Dispatch is on the exact type of ``x``:

    - ``PDFObjRef``: the reference is resolved and the result is
      processed again;
    - ``list`` / ``tuple`` / ``dict``: a new container of the same
      built-in type is returned, with each item (for dicts, each value;
      keys are kept as-is) processed in turn;
    - any other type: ``x`` is returned unchanged.
    """
    kind = type(x)

    if kind is PDFObjRef:
        # The resolved target may itself be, or contain, more references.
        return resolve_all(x.resolve())

    if kind is list:
        return [resolve_all(item) for item in x]

    if kind is tuple:
        return tuple(resolve_all(item) for item in x)

    if kind is dict:
        return {key: resolve_all(value) for key, value in x.items()}

    return x

@cache(maxsize = int(10e4))
def _decimalize(v, q = None):
# If already a decimal, just return itself
Expand Down
Binary file added tests/pdfs/issue-203-decimalize.pdf
Binary file not shown.
5 changes: 5 additions & 0 deletions tests/test-issues.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,3 +148,8 @@ def test_issue_140(self):
cropped_page = page.crop((0, 0, page.width, 122))
assert len(cropped_page.extract_table()) == 5


def test_issue_203(self):
    # Opening the issue-203 sample PDF and reading its objects must
    # succeed and yield a non-empty collection.
    pdf_path = os.path.join(HERE, "pdfs/issue-203-decimalize.pdf")
    with pdfplumber.open(pdf_path) as doc:
        assert len(doc.objects) > 0

0 comments on commit 85f422d

Please sign in to comment.