Skip to content

Commit

Permalink
Allow *all* extract_text kwargs for tables
Browse files Browse the repository at this point in the history
Any `text_`-prefixed table extraction setting now automatically gets
passed to `.extract_text(...)` via `table.extract(...)`.

This introduces one minor but breaking change, which is that
`keep_blank_chars` (previously, a valid table extraction setting) now
needs to be passed as `text_keep_blank_chars`.
  • Loading branch information
jsvine committed Feb 13, 2023
1 parent d3662de commit c4e1b29
Show file tree
Hide file tree
Showing 4 changed files with 45 additions and 39 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -384,9 +384,9 @@ By default, `extract_tables` uses the page's vertical and horizontal lines (or r
|`"edge_min_length"`| Edges shorter than `edge_min_length` will be discarded before attempting to reconstruct the table.|
|`"min_words_vertical"`| When using `"vertical_strategy": "text"`, at least `min_words_vertical` words must share the same alignment.|
|`"min_words_horizontal"`| When using `"horizontal_strategy": "text"`, at least `min_words_horizontal` words must share the same alignment.|
|`"keep_blank_chars"`| When using the `text` strategy, consider `" "` chars to be *parts* of words and not word-separators.|
|`"text_tolerance"`, `"text_x_tolerance"`, `"text_y_tolerance"`| When the `text` strategy searches for words, it will expect the individual letters in each word to be no more than `text_tolerance` pixels apart.|
|`"intersection_tolerance"`, `"intersection_x_tolerance"`, `"intersection_y_tolerance"`| When combining edges into cells, orthogonal edges must be within `intersection_tolerance` pixels to be considered intersecting.|
|`"text_*"`| All settings prefixed with `text_` are then used when extracting text from each discovered table. All possible arguments to `Page.extract_text(...)` are also valid here.|
|`"text_x_tolerance"`, `"text_y_tolerance"`| These `text_`-prefixed settings *also* apply to the table-identification algorithm when the `text` strategy is used. I.e., when that algorithm searches for words, it will expect the individual letters in each word to be no more than `text_x_tolerance`/`text_y_tolerance` pixels apart.|

### Table-extraction strategies

Expand Down
13 changes: 2 additions & 11 deletions pdfplumber/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -273,12 +273,7 @@ def extract_tables(
) -> List[List[List[Optional[str]]]]:
tset = TableSettings.resolve(table_settings)
tables = self.find_tables(tset)

extract_kwargs = {
k: getattr(tset, "text_" + k) for k in ["x_tolerance", "y_tolerance"]
}

return [table.extract(**extract_kwargs) for table in tables]
return [table.extract(**(tset.text_settings or {})) for table in tables]

def extract_table(
self, table_settings: Optional[T_table_settings] = None
Expand All @@ -295,11 +290,7 @@ def sorter(x: Table) -> Tuple[int, T_num, T_num]:

largest = list(sorted(tables, key=sorter))[0]

extract_kwargs = {
k: getattr(tset, "text_" + k) for k in ["x_tolerance", "y_tolerance"]
}

return largest.extract(**extract_kwargs)
return largest.extract(**(tset.text_settings or {}))

def _get_textmap(self, **kwargs: Any) -> TextMap:
defaults = dict(x_shift=self.bbox[0], y_shift=self.bbox[1])
Expand Down
55 changes: 30 additions & 25 deletions pdfplumber/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -396,11 +396,7 @@ def rows(self) -> List[Row]:
rows.append(row)
return rows

def extract(
self,
x_tolerance: T_num = utils.DEFAULT_X_TOLERANCE,
y_tolerance: T_num = utils.DEFAULT_Y_TOLERANCE,
) -> List[List[Optional[str]]]:
def extract(self, **kwargs: Any) -> List[List[Optional[str]]]:

chars = self.page.chars
table_arr = []
Expand All @@ -426,11 +422,12 @@ def char_in_bbox(char: T_obj, bbox: T_bbox) -> bool:
]

if len(cell_chars):
cell_text = utils.extract_text(
cell_chars,
x_tolerance=x_tolerance,
y_tolerance=y_tolerance,
).strip()
kwargs["x_shift"] = cell[0]
kwargs["y_shift"] = cell[1]
if "layout" in kwargs:
kwargs["layout_width"] = cell[2] - cell[0]
kwargs["layout_height"] = cell[3] - cell[1]
cell_text = utils.extract_text(cell_chars, **kwargs)
else:
cell_text = ""
arr.append(cell_text)
Expand All @@ -450,9 +447,6 @@ def char_in_bbox(char: T_obj, bbox: T_bbox) -> bool:
"edge_min_length",
"min_words_vertical",
"min_words_horizontal",
"text_tolerance",
"text_x_tolerance",
"text_y_tolerance",
"intersection_tolerance",
"intersection_x_tolerance",
"intersection_y_tolerance",
Expand Down Expand Up @@ -481,13 +475,10 @@ class TableSettings:
edge_min_length: T_num = 3
min_words_vertical: int = DEFAULT_MIN_WORDS_VERTICAL
min_words_horizontal: int = DEFAULT_MIN_WORDS_HORIZONTAL
keep_blank_chars: bool = False
text_tolerance: T_num = 3
text_x_tolerance: T_num = UNSET
text_y_tolerance: T_num = UNSET
intersection_tolerance: T_num = 3
intersection_x_tolerance: T_num = UNSET
intersection_y_tolerance: T_num = UNSET
text_settings: Optional[Dict[str, Any]] = None

def __post_init__(self) -> "TableSettings":
"""Clean up user-provided table settings.
Expand Down Expand Up @@ -517,9 +508,19 @@ def __post_init__(self) -> "TableSettings":
f'{{{",".join(TABLE_STRATEGIES)}}}'
)

if self.text_settings is None:
self.text_settings = {}

# This next section is for backwards compatibility
for attr in ["x_tolerance", "y_tolerance"]:
if attr not in self.text_settings:
self.text_settings[attr] = self.text_settings.get("tolerance", 3)

if "tolerance" in self.text_settings:
del self.text_settings["tolerance"]
# End of that section

for attr, fallback in [
("text_x_tolerance", "text_tolerance"),
("text_y_tolerance", "text_tolerance"),
("snap_x_tolerance", "snap_tolerance"),
("snap_y_tolerance", "snap_tolerance"),
("join_x_tolerance", "join_tolerance"),
Expand All @@ -539,7 +540,15 @@ def resolve(cls, settings: Optional[T_table_settings]) -> "TableSettings":
elif isinstance(settings, cls):
return settings
elif isinstance(settings, dict):
return cls(**settings)
core_settings = {}
text_settings = {}
for k, v in settings.items():
if k[:5] == "text_":
text_settings[k[5:]] = v
else:
core_settings[k] = v
core_settings["text_settings"] = text_settings
return cls(**core_settings)
else:
raise ValueError(f"Cannot resolve settings: {settings}")

Expand Down Expand Up @@ -588,11 +597,7 @@ def get_edges(self) -> T_obj_list:
h_strat = settings.horizontal_strategy

if v_strat == "text" or h_strat == "text":
words = self.page.extract_words(
x_tolerance=settings.text_x_tolerance,
y_tolerance=settings.text_y_tolerance,
keep_blank_chars=settings.keep_blank_chars,
)
words = self.page.extract_words(**(settings.text_settings or {}))

v_explicit = []
for desc in settings.explicit_vertical_lines or []:
Expand Down
12 changes: 11 additions & 1 deletion tests/test_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,16 @@ def test_text_tolerance(self):
]
assert t_tol[-1] == t_tol_from_tables[-1]

def test_text_layout(self):
path = os.path.join(HERE, "pdfs/issue-53-example.pdf")
with pdfplumber.open(path) as pdf:
table = pdf.pages[0].extract_table(
{
"text_layout": True,
}
)
assert table[3][0] == " FY2013 \n FY2014 "

def test_text_without_words(self):
assert table.words_to_edges_h([]) == []
assert table.words_to_edges_v([]) == []
Expand Down Expand Up @@ -195,7 +205,7 @@ def test_discussion_539_null_value(self):
"edge_min_length": 3,
"min_words_vertical": 3,
"min_words_horizontal": 1,
"keep_blank_chars": False,
"text_keep_blank_chars": False,
"text_tolerance": 3,
"intersection_tolerance": 3,
}
Expand Down

0 comments on commit c4e1b29

Please sign in to comment.