Skip to content

Commit c8b200e

Browse files
committed
Refactor .extract_words and allow attrib-grouping
This commit refactors and hopefully makes clearer the logic in utils.extract_words. It also adds a new parameter, `extra_attrs`, which allows the user to pass a list of attributes on which to group all characters. For instance, passing `extra_attrs=["fontname", "size"]` will not allow characters with different font names or sizes to become part of the same word. As a benefit, those resulting word dicts will contain `"fontname"` and `"size"` attributes — providing a long-requested feature (cf. issue
1 parent 6233bbd commit c8b200e

File tree

3 files changed

+85
-59
lines changed

3 files changed

+85
-59
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ The `pdfplumber.Page` class is at the core of `pdfplumber`. Most things you'll d
100100
|`.within_bbox(bounding_box, relative=False)`| Similar to `.crop`, but only retains objects that fall *entirely* within the bounding box.|
101101
|`.filter(test_function)`| Returns a version of the page with only the `.objects` for which `test_function(obj)` returns `True`.|
102102
|`.extract_text(x_tolerance=3, y_tolerance=3)`| Collates all of the page's character objects into a single string. Adds spaces where the difference between the `x1` of one character and the `x0` of the next is greater than `x_tolerance`. Adds newline characters where the difference between the `doctop` of one character and the `doctop` of the next is greater than `y_tolerance`.|
103-
|`.extract_words(x_tolerance=3, y_tolerance=3, horizontal_ltr=True, vertical_ttb=True)`| Returns a list of all word-looking things and their bounding boxes. Words are considered to be sequences of characters where (for "upright" characters) the difference between the `x1` of one character and the `x0` of the next is less than or equal to `x_tolerance` *and* where the `doctop` of one character and the `doctop` of the next is less than or equal to `y_tolerance`. A similar approach is taken for non-upright characters, but instead measuring the vertical, rather than horizontal, distances between them. The parameters `horizontal_ltr` and `vertical_ttb` indicate whether the words should be read from left-to-right (for horizontal words) / top-to-bottom (for vertical words).|
103+
|`.extract_words(x_tolerance=3, y_tolerance=3, keep_blank_chars=False, horizontal_ltr=True, vertical_ttb=True, extra_attrs=[])`| Returns a list of all word-looking things and their bounding boxes. Words are considered to be sequences of characters where (for "upright" characters) the difference between the `x1` of one character and the `x0` of the next is less than or equal to `x_tolerance` *and* where the difference between the `doctop` of one character and the `doctop` of the next is less than or equal to `y_tolerance`. A similar approach is taken for non-upright characters, but instead measuring the vertical, rather than horizontal, distances between them. The parameters `horizontal_ltr` and `vertical_ttb` indicate whether the words should be read from left-to-right (for horizontal words) / top-to-bottom (for vertical words). Changing `keep_blank_chars` to `True` will mean that blank characters are treated as part of a word, not as a space between words. Passing a list of `extra_attrs` (e.g., `["fontname", "size"]`) will restrict each word to characters that share exactly the same value for each of those attributes, and the resulting word dicts will indicate those attributes.|
104104
|`.extract_tables(table_settings)`| Extracts tabular data from the page. For more details see "[Extracting tables](#extracting-tables)" below.|
105105
|`.to_image(**conversion_kwargs)`| Returns an instance of the `PageImage` class. For more details, see "[Visual debugging](#visual-debugging)" below. For conversion_kwargs, see [here](http://docs.wand-py.org/en/latest/wand/image.html#wand.image.Image).|
106106

pdfplumber/utils.py

Lines changed: 75 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -205,86 +205,103 @@ def bbox_to_rect(bbox):
205205
return {"x0": bbox[0], "top": bbox[1], "x1": bbox[2], "bottom": bbox[3]}
206206

207207

208-
def extract_words(
209-
chars,
210-
x_tolerance=DEFAULT_X_TOLERANCE,
211-
y_tolerance=DEFAULT_Y_TOLERANCE,
212-
keep_blank_chars=False,
213-
horizontal_ltr=True, # Should words be read left-to-right?
214-
vertical_ttb=True, # Should vertical words be read top-to-bottom?
215-
):
208+
def merge_chars(ordered_chars, extra_attrs=[]):
209+
x0, top, x1, bottom = objects_to_bbox(ordered_chars)
210+
211+
word = {
212+
"text": "".join(map(itemgetter("text"), ordered_chars)),
213+
"x0": x0,
214+
"x1": x1,
215+
"top": top,
216+
"bottom": bottom,
217+
"upright": ordered_chars[0]["upright"],
218+
}
216219

217-
x_tolerance = decimalize(x_tolerance)
218-
y_tolerance = decimalize(y_tolerance)
220+
for key in extra_attrs:
221+
word[key] = ordered_chars[0][key]
219222

220-
def process_word_chars(chars, upright):
221-
x0, top, x1, bottom = objects_to_bbox(chars)
223+
return word
222224

223-
return {
224-
"x0": x0,
225-
"x1": x1,
226-
"top": top,
227-
"bottom": bottom,
228-
"upright": upright,
229-
"text": "".join(map(itemgetter("text"), chars)),
230-
}
231225

232-
def get_line_words(chars, upright, tolerance):
233-
get_text = itemgetter("text")
234-
if upright:
235-
min_key, max_key = ("x0", "x1") if horizontal_ltr else ("x1", "x0")
236-
else:
237-
min_key, max_key = ("top", "bottom") if vertical_ttb else ("bottom", "top")
226+
def cluster_line_chars(
227+
chars, tolerance, keep_blank_chars=False, min_key="x0", max_key="x1", sort_asc=True
228+
):
229+
get_text = itemgetter("text")
230+
231+
words = []
232+
current_word = []
238233

239-
words = []
240-
current_word = []
234+
comp_fn = gt if sort_asc else lt
235+
tol_fn = add if sort_asc else sub
241236

242-
asc_order = (upright and horizontal_ltr) or (not upright and vertical_ttb)
237+
def sort_key(x):
238+
return tol_fn(0, x[min_key])
243239

244-
comp_fn = gt if asc_order else lt
245-
tol_fn = add if asc_order else sub
240+
sorted_chars = sorted(chars, key=sort_key)
246241

247-
def sort_key(x):
248-
return tol_fn(0, x[min_key])
242+
for char in sorted_chars:
243+
if not keep_blank_chars and get_text(char).isspace():
244+
if len(current_word) > 0:
245+
words.append(current_word)
246+
current_word = []
247+
elif len(current_word) == 0:
248+
current_word.append(char)
249+
else:
250+
last_char = current_word[-1]
251+
prev_pos = tol_fn(last_char[max_key], tolerance)
252+
if comp_fn(char[min_key], prev_pos):
253+
words.append(current_word)
254+
current_word = []
255+
current_word.append(char)
256+
257+
if len(current_word) > 0:
258+
words.append(current_word)
249259

250-
sorted_chars = sorted(chars, key=sort_key)
260+
return words
251261

252-
for char in sorted_chars:
253-
if not keep_blank_chars and get_text(char).isspace():
254-
if len(current_word) > 0:
255-
words.append(current_word)
256-
current_word = []
257-
else:
258-
pass
259-
elif len(current_word) == 0:
260-
current_word.append(char)
261-
else:
262-
last_char = current_word[-1]
263-
prev_pos = tol_fn(last_char[max_key], tolerance)
264-
if comp_fn(char[min_key], prev_pos):
265-
words.append(current_word)
266-
current_word = []
267-
current_word.append(char)
268262

269-
if len(current_word) > 0:
270-
words.append(current_word)
263+
def extract_words(
264+
chars,
265+
x_tolerance=DEFAULT_X_TOLERANCE,
266+
y_tolerance=DEFAULT_Y_TOLERANCE,
267+
keep_blank_chars=False,
268+
horizontal_ltr=True, # Should words be read left-to-right?
269+
vertical_ttb=True, # Should vertical words be read top-to-bottom?
270+
extra_attrs=[],
271+
):
271272

272-
return [process_word_chars(chars, upright) for chars in words]
273+
x_tolerance = decimalize(x_tolerance)
274+
y_tolerance = decimalize(y_tolerance)
273275

274-
chars_by_upright = {True: [], False: []}
275276
words = []
276-
for char in to_list(chars):
277-
chars_by_upright[char.get("upright", False)].append(char)
277+
grouped = itertools.groupby(chars, itemgetter("upright", *extra_attrs))
278+
279+
for keyvals, char_group in grouped:
280+
upright = keyvals[0] if len(extra_attrs) else keyvals
278281

279-
for upright, char_group in chars_by_upright.items():
280282
clusters = cluster_objects(
281283
char_group,
282284
"doctop" if upright else "x0",
283285
y_tolerance, # Still use y-tolerance here, even for vertical words
284286
)
285287

288+
sort_asc = (upright and horizontal_ltr) or (not upright and vertical_ttb)
289+
min_key, max_key = ("x0", "x1") if upright else ("top", "bottom")
290+
291+
if not sort_asc:
292+
min_key, max_key = max_key, min_key
293+
286294
for line_chars in clusters:
287-
words += get_line_words(line_chars, upright, tolerance=x_tolerance)
295+
word_clusters = cluster_line_chars(
296+
line_chars,
297+
# Still use x-tolerance here, even for vertical words
298+
tolerance=x_tolerance,
299+
keep_blank_chars=keep_blank_chars,
300+
min_key=min_key,
301+
max_key=max_key,
302+
sort_asc=sort_asc,
303+
)
304+
words += [merge_chars(c, extra_attrs) for c in word_clusters]
288305

289306
return words
290307

tests/test_utils.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,11 +68,20 @@ def test_extract_words(self):
6868
with pdfplumber.open(path) as pdf:
6969
p = pdf.pages[0]
7070
words = p.extract_words(vertical_ttb=False)
71+
words_attr = p.extract_words(vertical_ttb=False, extra_attrs = [ "size" ])
72+
words_w_spaces = p.extract_words(vertical_ttb=False, keep_blank_chars=True)
7173
words_rtl = p.extract_words(horizontal_ltr=False)
7274

7375
assert words[0]["text"] == "Agaaaaa:"
76+
77+
assert "size" not in words[0]
78+
assert float(words_attr[0]["size"]) == 9.960
79+
80+
assert words_w_spaces[0]["text"] == "Agaaaaa: AAAA"
81+
7482
vertical = [w for w in words if w["upright"] == 0]
7583
assert vertical[0]["text"] == "Aaaaaabag8"
84+
7685
assert words_rtl[1]["text"] == "baaabaaA/AAA"
7786

7887
def test_extract_text(self):

0 commit comments

Comments
 (0)