Refactor .extract_words and allow attrib-grouping

jsvine · jsvine · commit c8b200ee7c10 · 2020-08-29T16:05:31.000-04:00
This commit refactors and hopefully makes clearer the logic in
utils.extract_words. It also adds a new parameter, `extra_attrs`, which
allows the user to pass a list of attributes on which to group all
characters.

For instance, passing `extra_attrs=["fontname", "size"]` will not allow
characters with different font names or sizes to become part of the same
word. As a benefit, those resulting word dicts will contain `"fontname"`
and `"size"` attributes — providing a long-requested feature (cf. issue
diff --git a/README.md b/README.md
@@ -100,7 +100,7 @@ The `pdfplumber.Page` class is at the core of `pdfplumber`. Most things you'll d
 |`.within_bbox(bounding_box, relative=False)`| Similar to `.crop`, but only retains objects that fall *entirely* within the bounding box.|
 |`.filter(test_function)`| Returns a version of the page with only the `.objects` for which `test_function(obj)` returns `True`.|
 |`.extract_text(x_tolerance=3, y_tolerance=3)`| Collates all of the page's character objects into a single string. Adds spaces where the difference between the `x1` of one character and the `x0` of the next is greater than `x_tolerance`. Adds newline characters where the difference between the `doctop` of one character and the `doctop` of the next is greater than `y_tolerance`.|
-|`.extract_words(x_tolerance=3, y_tolerance=3, horizontal_ltr=True, vertical_ttb=True)`| Returns a list of all word-looking things and their bounding boxes. Words are considered to be sequences of characters where (for "upright" characters) the difference between the `x1` of one character and the `x0` of the next is less than or equal to `x_tolerance` *and* where the `doctop` of one character and the `doctop` of the next is less than or equal to `y_tolerance`. A similar approach is taken for non-upright characters, but instead measuring the vertical, rather than horizontal, distances between them. The parameters `horizontal_ltr` and `vertical_ttb` indicate whether the words should be read from left-to-right (for horizontal words) / top-to-bottom (for vertical words).|
+|`.extract_words(x_tolerance=3, y_tolerance=3, keep_blank_chars=False, horizontal_ltr=True, vertical_ttb=True, extra_attrs=[])`| Returns a list of all word-looking things and their bounding boxes. Words are considered to be sequences of characters where (for "upright" characters) the difference between the `x1` of one character and the `x0` of the next is less than or equal to `x_tolerance` *and* where the difference between the `doctop` of one character and the `doctop` of the next is less than or equal to `y_tolerance`. A similar approach is taken for non-upright characters, but instead measuring the vertical, rather than horizontal, distances between them. The parameters `horizontal_ltr` and `vertical_ttb` indicate whether the words should be read from left-to-right (for horizontal words) / top-to-bottom (for vertical words). Changing `keep_blank_chars` to `True` will mean that blank characters are treated as part of a word, not as a space between words. Passing a list of `extra_attrs` (e.g., `["fontname", "size"]`) will restrict each word to characters that share exactly the same value for each of those attributes, and the resulting word dicts will include those attributes.|
 |`.extract_tables(table_settings)`| Extracts tabular data from the page. For more details see "[Extracting tables](#extracting-tables)" below.|
 |`.to_image(**conversion_kwargs)`| Returns an instance of the `PageImage` class. For more details, see "[Visual debugging](#visual-debugging)" below. For conversion_kwargs, see [here](http://docs.wand-py.org/en/latest/wand/image.html#wand.image.Image).|
 
diff --git a/pdfplumber/utils.py b/pdfplumber/utils.py
@@ -205,86 +205,103 @@ def bbox_to_rect(bbox):
     return {"x0": bbox[0], "top": bbox[1], "x1": bbox[2], "bottom": bbox[3]}
 
 
-def extract_words(
-    chars,
-    x_tolerance=DEFAULT_X_TOLERANCE,
-    y_tolerance=DEFAULT_Y_TOLERANCE,
-    keep_blank_chars=False,
-    horizontal_ltr=True,  # Should words be read left-to-right?
-    vertical_ttb=True,  # Should vertical words be read top-to-bottom?
-):
+def merge_chars(ordered_chars, extra_attrs=[]):
+    x0, top, x1, bottom = objects_to_bbox(ordered_chars)
+
+    word = {
+        "text": "".join(map(itemgetter("text"), ordered_chars)),
+        "x0": x0,
+        "x1": x1,
+        "top": top,
+        "bottom": bottom,
+        "upright": ordered_chars[0]["upright"],
+    }
 
-    x_tolerance = decimalize(x_tolerance)
-    y_tolerance = decimalize(y_tolerance)
+    for key in extra_attrs:
+        word[key] = ordered_chars[0][key]
 
-    def process_word_chars(chars, upright):
-        x0, top, x1, bottom = objects_to_bbox(chars)
+    return word
 
-        return {
-            "x0": x0,
-            "x1": x1,
-            "top": top,
-            "bottom": bottom,
-            "upright": upright,
-            "text": "".join(map(itemgetter("text"), chars)),
-        }
 
-    def get_line_words(chars, upright, tolerance):
-        get_text = itemgetter("text")
-        if upright:
-            min_key, max_key = ("x0", "x1") if horizontal_ltr else ("x1", "x0")
-        else:
-            min_key, max_key = ("top", "bottom") if vertical_ttb else ("bottom", "top")
+def cluster_line_chars(
+    chars, tolerance, keep_blank_chars=False, min_key="x0", max_key="x1", sort_asc=True
+):
+    get_text = itemgetter("text")
+
+    words = []
+    current_word = []
 
-        words = []
-        current_word = []
+    comp_fn = gt if sort_asc else lt
+    tol_fn = add if sort_asc else sub
 
-        asc_order = (upright and horizontal_ltr) or (not upright and vertical_ttb)
+    def sort_key(x):
+        return tol_fn(0, x[min_key])
 
-        comp_fn = gt if asc_order else lt
-        tol_fn = add if asc_order else sub
+    sorted_chars = sorted(chars, key=sort_key)
 
-        def sort_key(x):
-            return tol_fn(0, x[min_key])
+    for char in sorted_chars:
+        if not keep_blank_chars and get_text(char).isspace():
+            if len(current_word) > 0:
+                words.append(current_word)
+                current_word = []
+        elif len(current_word) == 0:
+            current_word.append(char)
+        else:
+            last_char = current_word[-1]
+            prev_pos = tol_fn(last_char[max_key], tolerance)
+            if comp_fn(char[min_key], prev_pos):
+                words.append(current_word)
+                current_word = []
+            current_word.append(char)
+
+    if len(current_word) > 0:
+        words.append(current_word)
 
-        sorted_chars = sorted(chars, key=sort_key)
+    return words
 
-        for char in sorted_chars:
-            if not keep_blank_chars and get_text(char).isspace():
-                if len(current_word) > 0:
-                    words.append(current_word)
-                    current_word = []
-                else:
-                    pass
-            elif len(current_word) == 0:
-                current_word.append(char)
-            else:
-                last_char = current_word[-1]
-                prev_pos = tol_fn(last_char[max_key], tolerance)
-                if comp_fn(char[min_key], prev_pos):
-                    words.append(current_word)
-                    current_word = []
-                current_word.append(char)
 
-        if len(current_word) > 0:
-            words.append(current_word)
+def extract_words(
+    chars,
+    x_tolerance=DEFAULT_X_TOLERANCE,
+    y_tolerance=DEFAULT_Y_TOLERANCE,
+    keep_blank_chars=False,
+    horizontal_ltr=True,  # Should words be read left-to-right?
+    vertical_ttb=True,  # Should vertical words be read top-to-bottom?
+    extra_attrs=[],
+):
 
-        return [process_word_chars(chars, upright) for chars in words]
+    x_tolerance = decimalize(x_tolerance)
+    y_tolerance = decimalize(y_tolerance)
 
-    chars_by_upright = {True: [], False: []}
     words = []
-    for char in to_list(chars):
-        chars_by_upright[char.get("upright", False)].append(char)
+    grouped = itertools.groupby(chars, itemgetter("upright", *extra_attrs))
+
+    for keyvals, char_group in grouped:
+        upright = keyvals[0] if len(extra_attrs) else keyvals
 
-    for upright, char_group in chars_by_upright.items():
         clusters = cluster_objects(
             char_group,
             "doctop" if upright else "x0",
             y_tolerance,  # Still use y-tolerance here, even for vertical words
         )
 
+        sort_asc = (upright and horizontal_ltr) or (not upright and vertical_ttb)
+        min_key, max_key = ("x0", "x1") if upright else ("top", "bottom")
+
+        if not sort_asc:
+            min_key, max_key = max_key, min_key
+
         for line_chars in clusters:
-            words += get_line_words(line_chars, upright, tolerance=x_tolerance)
+            word_clusters = cluster_line_chars(
+                line_chars,
+                # Still use x-tolerance here, even for vertical words
+                tolerance=x_tolerance,
+                keep_blank_chars=keep_blank_chars,
+                min_key=min_key,
+                max_key=max_key,
+                sort_asc=sort_asc,
+            )
+            words += [merge_chars(c, extra_attrs) for c in word_clusters]
 
     return words
 
diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -68,11 +68,20 @@ def test_extract_words(self):
         with pdfplumber.open(path) as pdf:
             p = pdf.pages[0]
             words = p.extract_words(vertical_ttb=False)
+            words_attr = p.extract_words(vertical_ttb=False, extra_attrs = [ "size" ])
+            words_w_spaces = p.extract_words(vertical_ttb=False, keep_blank_chars=True)
             words_rtl = p.extract_words(horizontal_ltr=False)
 
         assert words[0]["text"] == "Agaaaaa:"
+
+        assert "size" not in words[0]
+        assert float(words_attr[0]["size"]) == 9.960
+
+        assert words_w_spaces[0]["text"] == "Agaaaaa: AAAA"
+
         vertical = [w for w in words if w["upright"] == 0]
         assert vertical[0]["text"] == "Aaaaaabag8"
+
         assert words_rtl[1]["text"] == "baaabaaA/AAA"
 
     def test_extract_text(self):