Fix bug in pdfminer.six's .resolve_all

jsvine · jsvine · commit 85f422d0012a · 2020-04-29T08:31:39.000-04:00
See #203. The root cause of the decimalization bug flagged in that issue was pdfminer.six's .resolve_all method not recursively resolving items within tuples. Now we define our own method, with some slight optimization tweaks.
diff --git a/pdfplumber/page.py b/pdfplumber/page.py
@@ -1,13 +1,14 @@
 from . import utils
+from .utils import resolve_all
 from .table import TableFinder
 from .container import Container
 from copy import copy
 
-from pdfminer.pdftypes import resolve_all
 from six import string_types
 import re
 lt_pat = re.compile(r"^LT")
 
+
 class Page(Container):
     cached_properties = Container.cached_properties + [ "_layout" ]
     is_original = True
diff --git a/pdfplumber/utils.py b/pdfplumber/utils.py
@@ -1,5 +1,6 @@
 from pdfminer.utils import PDFDocEncoding
 from pdfminer.psparser import PSLiteral
+from pdfminer.pdftypes import PDFObjRef
 try:
     from cdecimal import Decimal, ROUND_HALF_UP
 except ImportError:
@@ -85,6 +86,23 @@ def decode_psl_list(_list):
     return [ decode_text(value.name) if isinstance(value, PSLiteral) else value
         for value in _list ]
 
+# via pdfminer.pdftypes, altered slightly
+def resolve_all(x):
+    """
+    Recursively resolves the given object and all the internals.
+    """
+    t = type(x)
+    if t == PDFObjRef:
+        return resolve_all(x.resolve())
+    elif t == list:
+        return [ resolve_all(v) for v in x ]
+    elif t == tuple:
+        return tuple(resolve_all(v) for v in x)
+    elif t == dict:
+        return dict((k, resolve_all(v)) for k, v in x.items())
+    else:
+        return x
+
 @cache(maxsize = int(10e4))
 def _decimalize(v, q = None):
     # If already a decimal, just return itself
diff --git a/tests/pdfs/issue-203-decimalize.pdf b/tests/pdfs/issue-203-decimalize.pdf
diff --git a/tests/test-issues.py b/tests/test-issues.py
@@ -148,3 +148,8 @@ def test_issue_140(self):
             cropped_page = page.crop((0, 0, page.width, 122))
             assert len(cropped_page.extract_table()) == 5
 
+
+    def test_issue_203(self):
+        path = os.path.join(HERE, "pdfs/issue-203-decimalize.pdf")
+        with pdfplumber.open(path) as pdf:
+            assert len(pdf.objects)