Allow *all* extract_text kwargs for tables

Any `text_`-prefixed table extraction setting is now automatically passed to `.extract_text(...)` via `table.extract(...)`. This introduces one minor but breaking change: `keep_blank_chars` (previously a valid table extraction setting) must now be passed as `text_keep_blank_chars`.
jsvine · Feb 13, 2023 · c4e1b29 · c4e1b29
1 parent d3662de
commit c4e1b29
Show file tree

Hide file tree

Showing 4 changed files with 45 additions and 39 deletions.
diff --git a/README.md b/README.md
@@ -384,9 +384,9 @@ By default, `extract_tables` uses the page's vertical and horizontal lines (or r
 |`"edge_min_length"`| Edges shorter than `edge_min_length` will be discarded before attempting to reconstruct the table.|
 |`"min_words_vertical"`| When using `"vertical_strategy": "text"`, at least `min_words_vertical` words must share the same alignment.|
 |`"min_words_horizontal"`| When using `"horizontal_strategy": "text"`, at least `min_words_horizontal` words must share the same alignment.|
-|`"keep_blank_chars"`| When using the `text` strategy, consider `" "` chars to be *parts* of words and not word-separators.|
-|`"text_tolerance"`, `"text_x_tolerance"`, `"text_y_tolerance"`| When the `text` strategy searches for words, it will expect the individual letters in each word to be no more than `text_tolerance` pixels apart.|
 |`"intersection_tolerance"`, `"intersection_x_tolerance"`, `"intersection_y_tolerance"`| When combining edges into cells, orthogonal edges must be within `intersection_tolerance` pixels to be considered intersecting.|
+|`"text_*"`| All settings prefixed with `text_` are used when extracting text from each discovered table. Every argument accepted by `Page.extract_text(...)` is valid here when given the `text_` prefix (e.g., `"text_layout"`).|
+|`"text_x_tolerance"`, `"text_y_tolerance"`| These `text_`-prefixed settings *also* apply to the table-identification algorithm when the `text` strategy is used. I.e., when that algorithm searches for words, it will expect the individual letters in each word to be no more than `text_x_tolerance`/`text_y_tolerance` pixels apart.|
 
 ### Table-extraction strategies
 

diff --git a/pdfplumber/page.py b/pdfplumber/page.py
@@ -273,12 +273,7 @@ def extract_tables(
     ) -> List[List[List[Optional[str]]]]:
         tset = TableSettings.resolve(table_settings)
         tables = self.find_tables(tset)
-
-        extract_kwargs = {
-            k: getattr(tset, "text_" + k) for k in ["x_tolerance", "y_tolerance"]
-        }
-
-        return [table.extract(**extract_kwargs) for table in tables]
+        return [table.extract(**(tset.text_settings or {})) for table in tables]
 
     def extract_table(
         self, table_settings: Optional[T_table_settings] = None
@@ -295,11 +290,7 @@ def sorter(x: Table) -> Tuple[int, T_num, T_num]:
 
         largest = list(sorted(tables, key=sorter))[0]
 
-        extract_kwargs = {
-            k: getattr(tset, "text_" + k) for k in ["x_tolerance", "y_tolerance"]
-        }
-
-        return largest.extract(**extract_kwargs)
+        return largest.extract(**(tset.text_settings or {}))
 
     def _get_textmap(self, **kwargs: Any) -> TextMap:
         defaults = dict(x_shift=self.bbox[0], y_shift=self.bbox[1])

diff --git a/pdfplumber/table.py b/pdfplumber/table.py
@@ -396,11 +396,7 @@ def rows(self) -> List[Row]:
             rows.append(row)
         return rows
 
-    def extract(
-        self,
-        x_tolerance: T_num = utils.DEFAULT_X_TOLERANCE,
-        y_tolerance: T_num = utils.DEFAULT_Y_TOLERANCE,
-    ) -> List[List[Optional[str]]]:
+    def extract(self, **kwargs: Any) -> List[List[Optional[str]]]:
 
         chars = self.page.chars
         table_arr = []
@@ -426,11 +422,12 @@ def char_in_bbox(char: T_obj, bbox: T_bbox) -> bool:
                     ]
 
                     if len(cell_chars):
-                        cell_text = utils.extract_text(
-                            cell_chars,
-                            x_tolerance=x_tolerance,
-                            y_tolerance=y_tolerance,
-                        ).strip()
+                        kwargs["x_shift"] = cell[0]
+                        kwargs["y_shift"] = cell[1]
+                        if "layout" in kwargs:
+                            kwargs["layout_width"] = cell[2] - cell[0]
+                            kwargs["layout_height"] = cell[3] - cell[1]
+                        cell_text = utils.extract_text(cell_chars, **kwargs)
                     else:
                         cell_text = ""
                 arr.append(cell_text)
@@ -450,9 +447,6 @@ def char_in_bbox(char: T_obj, bbox: T_bbox) -> bool:
     "edge_min_length",
     "min_words_vertical",
     "min_words_horizontal",
-    "text_tolerance",
-    "text_x_tolerance",
-    "text_y_tolerance",
     "intersection_tolerance",
     "intersection_x_tolerance",
     "intersection_y_tolerance",
@@ -481,13 +475,10 @@ class TableSettings:
     edge_min_length: T_num = 3
     min_words_vertical: int = DEFAULT_MIN_WORDS_VERTICAL
     min_words_horizontal: int = DEFAULT_MIN_WORDS_HORIZONTAL
-    keep_blank_chars: bool = False
-    text_tolerance: T_num = 3
-    text_x_tolerance: T_num = UNSET
-    text_y_tolerance: T_num = UNSET
     intersection_tolerance: T_num = 3
     intersection_x_tolerance: T_num = UNSET
     intersection_y_tolerance: T_num = UNSET
+    text_settings: Optional[Dict[str, Any]] = None
 
     def __post_init__(self) -> "TableSettings":
         """Clean up user-provided table settings.
@@ -517,9 +508,19 @@ def __post_init__(self) -> "TableSettings":
                     f'{{{",".join(TABLE_STRATEGIES)}}}'
                 )
 
+        if self.text_settings is None:
+            self.text_settings = {}
+
+        # This next section is for backwards compatibility
+        for attr in ["x_tolerance", "y_tolerance"]:
+            if attr not in self.text_settings:
+                self.text_settings[attr] = self.text_settings.get("tolerance", 3)
+
+        if "tolerance" in self.text_settings:
+            del self.text_settings["tolerance"]
+        # End of that section
+
         for attr, fallback in [
-            ("text_x_tolerance", "text_tolerance"),
-            ("text_y_tolerance", "text_tolerance"),
             ("snap_x_tolerance", "snap_tolerance"),
             ("snap_y_tolerance", "snap_tolerance"),
             ("join_x_tolerance", "join_tolerance"),
@@ -539,7 +540,15 @@ def resolve(cls, settings: Optional[T_table_settings]) -> "TableSettings":
         elif isinstance(settings, cls):
             return settings
         elif isinstance(settings, dict):
-            return cls(**settings)
+            core_settings = {}
+            text_settings = {}
+            for k, v in settings.items():
+                if k[:5] == "text_":
+                    text_settings[k[5:]] = v
+                else:
+                    core_settings[k] = v
+            core_settings["text_settings"] = text_settings
+            return cls(**core_settings)
         else:
             raise ValueError(f"Cannot resolve settings: {settings}")
 
@@ -588,11 +597,7 @@ def get_edges(self) -> T_obj_list:
         h_strat = settings.horizontal_strategy
 
         if v_strat == "text" or h_strat == "text":
-            words = self.page.extract_words(
-                x_tolerance=settings.text_x_tolerance,
-                y_tolerance=settings.text_y_tolerance,
-                keep_blank_chars=settings.keep_blank_chars,
-            )
+            words = self.page.extract_words(**(settings.text_settings or {}))
 
         v_explicit = []
         for desc in settings.explicit_vertical_lines or []:

diff --git a/tests/test_table.py b/tests/test_table.py
@@ -133,6 +133,16 @@ def test_text_tolerance(self):
         ]
         assert t_tol[-1] == t_tol_from_tables[-1]
 
+    def test_text_layout(self):
+        path = os.path.join(HERE, "pdfs/issue-53-example.pdf")
+        with pdfplumber.open(path) as pdf:
+            table = pdf.pages[0].extract_table(
+                {
+                    "text_layout": True,
+                }
+            )
+            assert table[3][0] == "   FY2013   \n   FY2014   "
+
     def test_text_without_words(self):
         assert table.words_to_edges_h([]) == []
         assert table.words_to_edges_v([]) == []
@@ -195,7 +205,7 @@ def test_discussion_539_null_value(self):
                 "edge_min_length": 3,
                 "min_words_vertical": 3,
                 "min_words_horizontal": 1,
-                "keep_blank_chars": False,
+                "text_keep_blank_chars": False,
                 "text_tolerance": 3,
                 "intersection_tolerance": 3,
             }