
Refactor .extract_words and allow attrib-grouping
This commit refactors and hopefully makes clearer the logic in
utils.extract_words. It also adds a new parameter, `extra_attrs`, which
allows the user to pass a list of attributes on which to group all
characters.

For instance, passing `extra_attrs=["fontname", "size"]` prevents characters
with different font names or sizes from becoming part of the same word. As a
benefit, the resulting word dicts will contain `"fontname"` and `"size"`
attributes, providing a long-requested feature (cf. issue
jsvine committed Aug 29, 2020
1 parent 6233bbd commit c8b200e
Showing 3 changed files with 85 additions and 59 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -100,7 +100,7 @@ The `pdfplumber.Page` class is at the core of `pdfplumber`. Most things you'll d
|`.within_bbox(bounding_box, relative=False)`| Similar to `.crop`, but only retains objects that fall *entirely* within the bounding box.|
|`.filter(test_function)`| Returns a version of the page with only the `.objects` for which `test_function(obj)` returns `True`.|
|`.extract_text(x_tolerance=3, y_tolerance=3)`| Collates all of the page's character objects into a single string. Adds spaces where the difference between the `x1` of one character and the `x0` of the next is greater than `x_tolerance`. Adds newline characters where the difference between the `doctop` of one character and the `doctop` of the next is greater than `y_tolerance`.|
|`.extract_words(x_tolerance=3, y_tolerance=3, keep_blank_chars=False, horizontal_ltr=True, vertical_ttb=True, extra_attrs=[])`| Returns a list of all word-looking things and their bounding boxes. Words are considered to be sequences of characters where (for "upright" characters) the difference between the `x1` of one character and the `x0` of the next is less than or equal to `x_tolerance` *and* where the difference between the `doctop` of one character and the `doctop` of the next is less than or equal to `y_tolerance`. A similar approach is taken for non-upright characters, but instead measuring the vertical, rather than horizontal, distances between them. The parameters `horizontal_ltr` and `vertical_ttb` indicate whether the words should be read from left-to-right (for horizontal words) / top-to-bottom (for vertical words). Changing `keep_blank_chars` to `True` will mean that blank characters are treated as part of a word, not as a space between words. Passing a list of `extra_attrs` (e.g., `["fontname", "size"]`) will restrict each word to characters that share exactly the same value for each of those attributes, and the resulting word dicts will include those attributes.|
|`.extract_tables(table_settings)`| Extracts tabular data from the page. For more details see "[Extracting tables](#extracting-tables)" below.|
|`.to_image(**conversion_kwargs)`| Returns an instance of the `PageImage` class. For more details, see "[Visual debugging](#visual-debugging)" below. For conversion_kwargs, see [here](http://docs.wand-py.org/en/latest/wand/image.html#wand.image.Image).|

133 changes: 75 additions & 58 deletions pdfplumber/utils.py
@@ -205,86 +205,103 @@ def bbox_to_rect(bbox):
return {"x0": bbox[0], "top": bbox[1], "x1": bbox[2], "bottom": bbox[3]}


def extract_words(
chars,
x_tolerance=DEFAULT_X_TOLERANCE,
y_tolerance=DEFAULT_Y_TOLERANCE,
keep_blank_chars=False,
horizontal_ltr=True, # Should words be read left-to-right?
vertical_ttb=True, # Should vertical words be read top-to-bottom?
):
def merge_chars(ordered_chars, extra_attrs=[]):
x0, top, x1, bottom = objects_to_bbox(ordered_chars)

word = {
"text": "".join(map(itemgetter("text"), ordered_chars)),
"x0": x0,
"x1": x1,
"top": top,
"bottom": bottom,
"upright": ordered_chars[0]["upright"],
}

x_tolerance = decimalize(x_tolerance)
y_tolerance = decimalize(y_tolerance)
for key in extra_attrs:
word[key] = ordered_chars[0][key]

def process_word_chars(chars, upright):
x0, top, x1, bottom = objects_to_bbox(chars)
return word

return {
"x0": x0,
"x1": x1,
"top": top,
"bottom": bottom,
"upright": upright,
"text": "".join(map(itemgetter("text"), chars)),
}

def get_line_words(chars, upright, tolerance):
get_text = itemgetter("text")
if upright:
min_key, max_key = ("x0", "x1") if horizontal_ltr else ("x1", "x0")
else:
min_key, max_key = ("top", "bottom") if vertical_ttb else ("bottom", "top")
def cluster_line_chars(
chars, tolerance, keep_blank_chars=False, min_key="x0", max_key="x1", sort_asc=True
):
get_text = itemgetter("text")

words = []
current_word = []

words = []
current_word = []
comp_fn = gt if sort_asc else lt
tol_fn = add if sort_asc else sub

asc_order = (upright and horizontal_ltr) or (not upright and vertical_ttb)
def sort_key(x):
return tol_fn(0, x[min_key])

comp_fn = gt if asc_order else lt
tol_fn = add if asc_order else sub
sorted_chars = sorted(chars, key=sort_key)

def sort_key(x):
return tol_fn(0, x[min_key])
for char in sorted_chars:
if not keep_blank_chars and get_text(char).isspace():
if len(current_word) > 0:
words.append(current_word)
current_word = []
elif len(current_word) == 0:
current_word.append(char)
else:
last_char = current_word[-1]
prev_pos = tol_fn(last_char[max_key], tolerance)
if comp_fn(char[min_key], prev_pos):
words.append(current_word)
current_word = []
current_word.append(char)

if len(current_word) > 0:
words.append(current_word)

sorted_chars = sorted(chars, key=sort_key)
return words

for char in sorted_chars:
if not keep_blank_chars and get_text(char).isspace():
if len(current_word) > 0:
words.append(current_word)
current_word = []
else:
pass
elif len(current_word) == 0:
current_word.append(char)
else:
last_char = current_word[-1]
prev_pos = tol_fn(last_char[max_key], tolerance)
if comp_fn(char[min_key], prev_pos):
words.append(current_word)
current_word = []
current_word.append(char)

if len(current_word) > 0:
words.append(current_word)
def extract_words(
chars,
x_tolerance=DEFAULT_X_TOLERANCE,
y_tolerance=DEFAULT_Y_TOLERANCE,
keep_blank_chars=False,
horizontal_ltr=True, # Should words be read left-to-right?
vertical_ttb=True, # Should vertical words be read top-to-bottom?
extra_attrs=[],
):

return [process_word_chars(chars, upright) for chars in words]
x_tolerance = decimalize(x_tolerance)
y_tolerance = decimalize(y_tolerance)

chars_by_upright = {True: [], False: []}
words = []
for char in to_list(chars):
chars_by_upright[char.get("upright", False)].append(char)
grouped = itertools.groupby(chars, itemgetter("upright", *extra_attrs))

for keyvals, char_group in grouped:
upright = keyvals[0] if len(extra_attrs) else keyvals

for upright, char_group in chars_by_upright.items():
clusters = cluster_objects(
char_group,
"doctop" if upright else "x0",
y_tolerance, # Still use y-tolerance here, even for vertical words
)

sort_asc = (upright and horizontal_ltr) or (not upright and vertical_ttb)
min_key, max_key = ("x0", "x1") if upright else ("top", "bottom")

if not sort_asc:
min_key, max_key = max_key, min_key

for line_chars in clusters:
words += get_line_words(line_chars, upright, tolerance=x_tolerance)
word_clusters = cluster_line_chars(
line_chars,
# Still use x-tolerance here, even for vertical words
tolerance=x_tolerance,
keep_blank_chars=keep_blank_chars,
min_key=min_key,
max_key=max_key,
sort_asc=sort_asc,
)
words += [merge_chars(c, extra_attrs) for c in word_clusters]

return words

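For reference, a minimal standard-library sketch of the grouping key used above, on hand-built stand-ins for char dicts (illustration only, not real pdfplumber objects). Note that itertools.groupby only merges consecutive runs, and that with an empty `extra_attrs` the key is a bare value rather than a tuple, which is why extract_words unpacks it with `keyvals[0] if len(extra_attrs) else keyvals`.

import itertools
from operator import itemgetter

# Hypothetical, hand-built char dicts for illustration only
chars = [
    {"upright": True, "fontname": "Helvetica", "size": 10, "text": "A"},
    {"upright": True, "fontname": "Helvetica", "size": 10, "text": "B"},
    {"upright": True, "fontname": "Times", "size": 12, "text": "C"},
]

extra_attrs = ["fontname", "size"]
key = itemgetter("upright", *extra_attrs)

for keyvals, group in itertools.groupby(chars, key):
    # keyvals is a tuple such as (True, "Helvetica", 10)
    print(keyvals, [c["text"] for c in group])
# (True, 'Helvetica', 10) ['A', 'B']
# (True, 'Times', 12) ['C']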
9 changes: 9 additions & 0 deletions tests/test_utils.py
@@ -68,11 +68,20 @@ def test_extract_words(self):
        with pdfplumber.open(path) as pdf:
            p = pdf.pages[0]
            words = p.extract_words(vertical_ttb=False)
            words_attr = p.extract_words(vertical_ttb=False, extra_attrs=["size"])
            words_w_spaces = p.extract_words(vertical_ttb=False, keep_blank_chars=True)
            words_rtl = p.extract_words(horizontal_ltr=False)

            assert words[0]["text"] == "Agaaaaa:"

            assert "size" not in words[0]
            assert float(words_attr[0]["size"]) == 9.960

            assert words_w_spaces[0]["text"] == "Agaaaaa: AAAA"

            vertical = [w for w in words if w["upright"] == 0]
            assert vertical[0]["text"] == "Aaaaaabag8"

            assert words_rtl[1]["text"] == "baaabaaA/AAA"

    def test_extract_text(self):
