Skip to content

Commit

Permalink
Refactor/simplify Page.process_objects
Browse files Browse the repository at this point in the history
This commit reworks the Page.process_objects method to be less complex
and more readable. Many of the changes stem from a realization that
pdfminer.six already stores most object attributes in the desired type;
we only need to convert those requiring decimalization.

This results in one notable change to the output, which is that
`upright` is no longer converted from a bool to an int. The necessary
downstream changes are reflected in utils.extract_words and convert.py.

Overall the changes result in no substantial change to performance; it
is possible that the code runs ever-so-slightly faster, but to a barely
detectable degree.
  • Loading branch information
jsvine committed Aug 29, 2020
1 parent 8e74cb9 commit 1f87898
Show file tree
Hide file tree
Showing 3 changed files with 90 additions and 86 deletions.
3 changes: 2 additions & 1 deletion pdfplumber/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ def try_decode_bytes(obj):
PDFStream: lambda obj: {"rawdata": to_b64(obj.rawdata)},
PSLiteral: lambda obj: decode_text(obj.name),
bytes: try_decode_bytes,
bool: int,
}


Expand All @@ -71,7 +72,7 @@ def serialize(obj):
t = type(obj)

# Basic types don't need to be converted
if t in (int, float, str, bool):
if t in (int, float, str):
return obj

# Use one of the custom converters above, if possible
Expand Down
169 changes: 86 additions & 83 deletions pdfplumber/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,42 @@

lt_pat = re.compile(r"^LT")

# Attribute names whose values process_object passes through
# ``self.decimalize`` after resolving them.
DECIMAL_ATTRS = {
    "adv",
    "height",
    "linewidth",
    "pts",
    "size",
    "srcsize",
    "width",
    "x0",
    "x1",
    "y0",
    "y1",
}

# Every attribute name process_object copies from a layout object's
# ``__dict__``: the decimalized ones above plus those kept as resolved.
ALL_ATTRS = DECIMAL_ATTRS | {
    "bits",
    "colorspace",
    "evenodd",
    "fill",
    "font",
    "fontname",
    "imagemask",
    "name",
    "non_stroking_color",
    "path",
    "stream",
    "stroke",
    "stroking_color",
    "text",
    "upright",
}


class Page(Container):
cached_properties = Container.cached_properties + ["_layout"]
Expand Down Expand Up @@ -106,99 +142,66 @@ def objects(self):
self._objects = self.parse_objects()
return self._objects

def parse_objects(self):
objects = {}
def process_object(self, obj):
kind = re.sub(lt_pat, "", obj.__class__.__name__).lower()

d = self.decimalize
h = self.height
idc = self.initial_doctop
pno = self.page_number

def point2coord(pt):
x, y = pt
return (d(x), h - d(y))

def noop(x):
return x

def str_conv(x):
return str(x or "")

CONVERSIONS = {
# Decimals
"adv": d,
"height": d,
"linewidth": d,
"pts": d,
"size": d,
"srcsize": d,
"width": d,
"x0": d,
"x1": d,
"y0": d,
"y1": d,
# Integer
"bits": int,
"upright": int,
# Strings
"font": str_conv,
"fontname": str_conv,
"name": str_conv,
"object_type": str_conv,
"text": str_conv,
# No conversion
"imagemask": noop,
"colorspace": noop,
"evenodd": noop,
"fill": noop,
"non_stroking_color": noop,
"path": noop,
"stream": noop,
"stroke": noop,
"stroking_color": noop,
}

CONVERSIONS_KEYS = set(CONVERSIONS.keys())

def process_object(obj):
if hasattr(obj, "_objs"):
for child in obj._objs:
process_object(child)
return

attr = dict(
(k, CONVERSIONS[k](resolve_all(v)))
for k, v in obj.__dict__.items()
if k in CONVERSIONS_KEYS
)

kind = re.sub(lt_pat, "", obj.__class__.__name__).lower()
attr["object_type"] = kind
attr["page_number"] = pno
def process_attr(item):
k, v = item
if k in ALL_ATTRS:
res = resolve_all(v)
if k in DECIMAL_ATTRS:
return (k, d(res))
else:
return (k, res)
else:
return None

if hasattr(obj, "graphicstate"):
gs = obj.graphicstate
attr["stroking_color"] = gs.scolor
attr["non_stroking_color"] = gs.ncolor
attr = dict(filter(None, map(process_attr, obj.__dict__.items())))

if hasattr(obj, "get_text"):
attr["text"] = obj.get_text()
attr["object_type"] = kind
attr["page_number"] = self.page_number

if kind == "curve":
attr["points"] = list(map(point2coord, obj.pts))
if hasattr(obj, "graphicstate"):
gs = obj.graphicstate
attr["stroking_color"] = gs.scolor
attr["non_stroking_color"] = gs.ncolor

if attr.get("y0") is not None:
attr["top"] = h - attr["y1"]
attr["bottom"] = h - attr["y0"]
attr["doctop"] = idc + attr["top"]
if hasattr(obj, "get_text"):
attr["text"] = obj.get_text()

if objects.get(kind) is None:
objects[kind] = []
objects[kind].append(attr)
if kind == "curve":

def point2coord(pt):
x, y = pt
return (self.decimalize(x), self.height - self.decimalize(y))

for obj in self.layout._objs:
process_object(obj)
attr["points"] = list(map(point2coord, obj.pts))

if attr.get("y0") is not None:
attr["top"] = self.height - attr["y1"]
attr["bottom"] = self.height - attr["y0"]
attr["doctop"] = self.initial_doctop + attr["top"]

return attr

def iter_layout_objects(self, layout_objects):
    """Yield the processed form of each leaf object in ``layout_objects``.

    Objects that have an ``_objs`` attribute are treated as containers:
    they are not yielded themselves; instead their children are walked
    recursively, in order. Every other object is passed to
    ``self.process_object`` and the result is yielded.
    """
    for obj in layout_objects:
        # If the object is, like LTFigure, a higher-level object,
        # then iterate through its children.
        if hasattr(obj, "_objs"):
            yield from self.iter_layout_objects(obj._objs)
        else:
            yield self.process_object(obj)

def parse_objects(self):
    """Group this page's processed layout objects by their object type.

    Walks ``self.layout._objs`` via ``self.iter_layout_objects`` and
    returns a plain dict mapping each ``"object_type"`` value to the list
    of processed objects of that type, in iteration order.
    """
    objects = {}
    for obj in self.iter_layout_objects(self.layout._objs):
        # setdefault creates the per-type list on first sight of a kind.
        objects.setdefault(obj["object_type"], []).append(obj)
    return objects

def debug_tablefinder(self, table_settings={}):
Expand Down
4 changes: 2 additions & 2 deletions pdfplumber/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -271,10 +271,10 @@ def sort_key(x):

return [process_word_chars(chars, upright) for chars in words]

chars_by_upright = {1: [], 0: []}
chars_by_upright = {True: [], False: []}
words = []
for char in to_list(chars):
chars_by_upright[char.get("upright", 1)].append(char)
chars_by_upright[char.get("upright", False)].append(char)

for upright, char_group in chars_by_upright.items():
clusters = cluster_objects(
Expand Down

0 comments on commit 1f87898

Please sign in to comment.