Refactor/simplify Page.parse_objects

This commit reworks the Page.parse_objects method (and its nested process_object helper, which becomes a method of its own) to be less complex and more readable. Many of the changes stem from the realization that pdfminer.six already stores most object attributes in the desired type; we only need to convert the attributes that require decimalization. This results in one notable change to the output: `upright` is no longer converted from a bool to an int. The necessary downstream changes are reflected in utils.extract_words and convert.py. Overall, the changes make no substantial difference to performance; the code may run ever-so-slightly faster, but only to a barely detectable degree.
jsvine · Aug 29, 2020 · 1f87898 · 1f87898
1 parent 8e74cb9
commit 1f87898
Show file tree

Hide file tree

Showing 3 changed files with 90 additions and 86 deletions.
diff --git a/pdfplumber/convert.py b/pdfplumber/convert.py
@@ -61,6 +61,7 @@ def try_decode_bytes(obj):
     PDFStream: lambda obj: {"rawdata": to_b64(obj.rawdata)},
     PSLiteral: lambda obj: decode_text(obj.name),
     bytes: try_decode_bytes,
+    bool: int,
 }
 
 
@@ -71,7 +72,7 @@ def serialize(obj):
     t = type(obj)
 
     # Basic types don't need to be converted
-    if t in (int, float, str, bool):
+    if t in (int, float, str):
         return obj
 
     # Use one of the custom converters above, if possible

diff --git a/pdfplumber/page.py b/pdfplumber/page.py
@@ -6,6 +6,42 @@
 
 lt_pat = re.compile(r"^LT")
 
+DECIMAL_ATTRS = set(
+    [
+        "adv",
+        "height",
+        "linewidth",
+        "pts",
+        "size",
+        "srcsize",
+        "width",
+        "x0",
+        "x1",
+        "y0",
+        "y1",
+    ]
+)
+
+ALL_ATTRS = DECIMAL_ATTRS | set(
+    [
+        "bits",
+        "upright",
+        "font",
+        "fontname",
+        "name",
+        "text",
+        "imagemask",
+        "colorspace",
+        "evenodd",
+        "fill",
+        "non_stroking_color",
+        "path",
+        "stream",
+        "stroke",
+        "stroking_color",
+    ]
+)
+
 
 class Page(Container):
     cached_properties = Container.cached_properties + ["_layout"]
@@ -106,99 +142,66 @@ def objects(self):
         self._objects = self.parse_objects()
         return self._objects
 
-    def parse_objects(self):
-        objects = {}
+    def process_object(self, obj):
+        kind = re.sub(lt_pat, "", obj.__class__.__name__).lower()
 
         d = self.decimalize
-        h = self.height
-        idc = self.initial_doctop
-        pno = self.page_number
-
-        def point2coord(pt):
-            x, y = pt
-            return (d(x), h - d(y))
-
-        def noop(x):
-            return x
-
-        def str_conv(x):
-            return str(x or "")
-
-        CONVERSIONS = {
-            # Decimals
-            "adv": d,
-            "height": d,
-            "linewidth": d,
-            "pts": d,
-            "size": d,
-            "srcsize": d,
-            "width": d,
-            "x0": d,
-            "x1": d,
-            "y0": d,
-            "y1": d,
-            # Integer
-            "bits": int,
-            "upright": int,
-            # Strings
-            "font": str_conv,
-            "fontname": str_conv,
-            "name": str_conv,
-            "object_type": str_conv,
-            "text": str_conv,
-            # No conversion
-            "imagemask": noop,
-            "colorspace": noop,
-            "evenodd": noop,
-            "fill": noop,
-            "non_stroking_color": noop,
-            "path": noop,
-            "stream": noop,
-            "stroke": noop,
-            "stroking_color": noop,
-        }
-
-        CONVERSIONS_KEYS = set(CONVERSIONS.keys())
-
-        def process_object(obj):
-            if hasattr(obj, "_objs"):
-                for child in obj._objs:
-                    process_object(child)
-                return
-
-            attr = dict(
-                (k, CONVERSIONS[k](resolve_all(v)))
-                for k, v in obj.__dict__.items()
-                if k in CONVERSIONS_KEYS
-            )
 
-            kind = re.sub(lt_pat, "", obj.__class__.__name__).lower()
-            attr["object_type"] = kind
-            attr["page_number"] = pno
+        def process_attr(item):
+            k, v = item
+            if k in ALL_ATTRS:
+                res = resolve_all(v)
+                if k in DECIMAL_ATTRS:
+                    return (k, d(res))
+                else:
+                    return (k, res)
+            else:
+                return None
 
-            if hasattr(obj, "graphicstate"):
-                gs = obj.graphicstate
-                attr["stroking_color"] = gs.scolor
-                attr["non_stroking_color"] = gs.ncolor
+        attr = dict(filter(None, map(process_attr, obj.__dict__.items())))
 
-            if hasattr(obj, "get_text"):
-                attr["text"] = obj.get_text()
+        attr["object_type"] = kind
+        attr["page_number"] = self.page_number
 
-            if kind == "curve":
-                attr["points"] = list(map(point2coord, obj.pts))
+        if hasattr(obj, "graphicstate"):
+            gs = obj.graphicstate
+            attr["stroking_color"] = gs.scolor
+            attr["non_stroking_color"] = gs.ncolor
 
-            if attr.get("y0") is not None:
-                attr["top"] = h - attr["y1"]
-                attr["bottom"] = h - attr["y0"]
-                attr["doctop"] = idc + attr["top"]
+        if hasattr(obj, "get_text"):
+            attr["text"] = obj.get_text()
 
-            if objects.get(kind) is None:
-                objects[kind] = []
-            objects[kind].append(attr)
+        if kind == "curve":
+
+            def point2coord(pt):
+                x, y = pt
+                return (self.decimalize(x), self.height - self.decimalize(y))
 
-        for obj in self.layout._objs:
-            process_object(obj)
+            attr["points"] = list(map(point2coord, obj.pts))
 
+        if attr.get("y0") is not None:
+            attr["top"] = self.height - attr["y1"]
+            attr["bottom"] = self.height - attr["y0"]
+            attr["doctop"] = self.initial_doctop + attr["top"]
+
+        return attr
+
+    def iter_layout_objects(self, layout_objects):
+        for obj in layout_objects:
+            # If the object is a higher-level container (like LTFigure),
+            # then iterate through its children
+            if hasattr(obj, "_objs"):
+                yield from self.iter_layout_objects(obj._objs)
+            else:
+                yield self.process_object(obj)
+
+    def parse_objects(self):
+        objects = {}
+        for obj in self.iter_layout_objects(self.layout._objs):
+            kind = obj["object_type"]
+            if objects.get(kind) is None:
+                objects[kind] = []
+            objects[kind].append(obj)
         return objects
 
     def debug_tablefinder(self, table_settings={}):

diff --git a/pdfplumber/utils.py b/pdfplumber/utils.py
@@ -271,10 +271,10 @@ def sort_key(x):
 
         return [process_word_chars(chars, upright) for chars in words]
 
-    chars_by_upright = {1: [], 0: []}
+    chars_by_upright = {True: [], False: []}
     words = []
     for char in to_list(chars):
-        chars_by_upright[char.get("upright", 1)].append(char)
+        chars_by_upright[char.get("upright", False)].append(char)
 
     for upright, char_group in chars_by_upright.items():
         clusters = cluster_objects(