From 3772af60c72db6504695e4992aec7a19ed70ce62 Mon Sep 17 00:00:00 2001 From: Jeremy Singer-Vine Date: Tue, 4 Jul 2023 09:41:19 -0400 Subject: [PATCH] Add Page.find_table(...) (#873) Previously, `pdfplumber.Page` had these table-getting methods: - `.find_tables(...)` - `.extract_tables(...)` - `.extract_table(...)` For consistency/completeness's sake, this commit adds: - `.find_table(...)` ... which, analogous to `.extract_table(...)`, returns the largest table on the page. Indeed, `.extract_table(...)` now uses `.find_table(...)` beneath the hood. Thanks to @pdille for the suggestion, here: https://github.com/jsvine/pdfplumber/discussions/864#discussioncomment-5668209 --- README.md | 3 ++- pdfplumber/page.py | 30 ++++++++++++++++++++---------- 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index ee34ff3b..d7684fba 100644 --- a/README.md +++ b/README.md @@ -346,8 +346,9 @@ If you're using `pdfplumber` on a Debian-based system and encounter a `PolicyErr | Method | Description | |--------|-------------| |`.find_tables(table_settings={})`|Returns a list of `Table` objects. The `Table` object provides access to the `.cells`, `.rows`, and `.bbox` properties, as well as the `.extract(x_tolerance=3, y_tolerance=3)` method.| +|`.find_table(table_settings={})`|Similar to `.find_tables(...)`, but returns the *largest* table on the page, as a `Table` object. If multiple tables have the same size — as measured by the number of cells — this method returns the table closest to the top of the page.| |`.extract_tables(table_settings={})`|Returns the text extracted from *all* tables found on the page, represented as a list of lists of lists, with the structure `table -> row -> cell`.| -|`.extract_table(table_settings={})`|Returns the text extracted from the *largest* table on the page, represented as a list of lists, with the structure `row -> cell`. (If multiple tables have the same size — as measured by the number of cells — this method returns the table closest to the top of the page.)| +|`.extract_table(table_settings={})`|Returns the text extracted from the *largest* table on the page (see `.find_table(...)` above), represented as a list of lists, with the structure `row -> cell`.| |`.debug_tablefinder(table_settings={})`|Returns an instance of the `TableFinder` class, with access to the `.edges`, `.intersections`, `.cells`, and `.tables` properties.| For example: diff --git a/pdfplumber/page.py b/pdfplumber/page.py index e6a13ff8..b317857a 100644 --- a/pdfplumber/page.py +++ b/pdfplumber/page.py @@ -307,16 +307,9 @@ def find_tables( tset = TableSettings.resolve(table_settings) return TableFinder(self, tset).tables - def extract_tables( - self, table_settings: Optional[T_table_settings] = None - ) -> List[List[List[Optional[str]]]]: - tset = TableSettings.resolve(table_settings) - tables = self.find_tables(tset) - return [table.extract(**(tset.text_settings or {})) for table in tables] - - def extract_table( + def find_table( self, table_settings: Optional[T_table_settings] = None - ) -> Optional[List[List[Optional[str]]]]: + ) -> Optional[Table]: tset = TableSettings.resolve(table_settings) tables = self.find_tables(tset) @@ -329,7 +322,24 @@ def sorter(x: Table) -> Tuple[int, T_num, T_num]: largest = list(sorted(tables, key=sorter))[0] - return largest.extract(**(tset.text_settings or {})) + return largest + + def extract_tables( + self, table_settings: Optional[T_table_settings] = None + ) -> List[List[List[Optional[str]]]]: + tset = TableSettings.resolve(table_settings) + tables = self.find_tables(tset) + return [table.extract(**(tset.text_settings or {})) for table in tables] + + def extract_table( + self, table_settings: Optional[T_table_settings] = None + ) -> Optional[List[List[Optional[str]]]]: + tset = TableSettings.resolve(table_settings) + table = self.find_table(tset) + if table is None: + return None + else: + return table.extract(**(tset.text_settings or {})) def _get_textmap(self, **kwargs: Any) -> TextMap: defaults = dict(x_shift=self.bbox[0], y_shift=self.bbox[1])