Skip to content

Commit 85f422d

Browse files
committed
Fix bug in pdfminer.six's .resolve_all
See #203. The root cause of the decimalization bug flagged in that issue was pdfminer.six's .resolve_all method not recursively resolving items within tuples. Now we define our own method, with some slight optimization tweaks.
1 parent ab957de commit 85f422d

File tree

4 files changed

+25
-1
lines changed

4 files changed

+25
-1
lines changed

pdfplumber/page.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,14 @@
11
from . import utils
2+
from .utils import resolve_all
23
from .table import TableFinder
34
from .container import Container
45
from copy import copy
56

6-
from pdfminer.pdftypes import resolve_all
77
from six import string_types
88
import re
99
lt_pat = re.compile(r"^LT")
1010

11+
1112
class Page(Container):
1213
cached_properties = Container.cached_properties + [ "_layout" ]
1314
is_original = True

pdfplumber/utils.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from pdfminer.utils import PDFDocEncoding
22
from pdfminer.psparser import PSLiteral
3+
from pdfminer.pdftypes import PDFObjRef
34
try:
45
from cdecimal import Decimal, ROUND_HALF_UP
56
except ImportError:
@@ -85,6 +86,23 @@ def decode_psl_list(_list):
8586
return [ decode_text(value.name) if isinstance(value, PSLiteral) else value
8687
for value in _list ]
8788

89+
# via pdfminer.pdftypes, altered slightly
90+
def resolve_all(x):
    """
    Return `x` with every PDFObjRef inside it replaced by its target.

    A PDFObjRef is resolved and the result is processed again, so
    references that point at further references or containers are
    followed all the way down. Lists, tuples and dicts are rebuilt
    with each element (or dict value) resolved; dict keys are kept
    as they are. Containers are matched on their exact type, so any
    other object -- including subclasses of list, tuple or dict --
    is returned unchanged.
    """
    kind = type(x)
    if kind is PDFObjRef:
        return resolve_all(x.resolve())
    if kind is list:
        return list(map(resolve_all, x))
    if kind is tuple:
        return tuple(map(resolve_all, x))
    if kind is dict:
        return {key: resolve_all(value) for key, value in x.items()}
    return x
105+
88106
@cache(maxsize = int(10e4))
89107
def _decimalize(v, q = None):
90108
# If already a decimal, just return itself

tests/pdfs/issue-203-decimalize.pdf

134 KB
Binary file not shown.

tests/test-issues.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,3 +148,8 @@ def test_issue_140(self):
148148
cropped_page = page.crop((0, 0, page.width, 122))
149149
assert len(cropped_page.extract_table()) == 5
150150

151+
152+
def test_issue_203(self):
    """
    Open the issue-203 sample PDF and check that its objects can be
    collected and are non-empty.
    """
    pdf_path = os.path.join(HERE, "pdfs/issue-203-decimalize.pdf")
    with pdfplumber.open(pdf_path) as pdf:
        assert len(pdf.objects)

0 commit comments

Comments
 (0)