From c3e878a4a6a105336df620c1f23031d65fc3b311 Mon Sep 17 00:00:00 2001 From: tis-abe-akira Date: Tue, 23 Sep 2025 08:30:58 +0900 Subject: [PATCH 1/2] to_markdown: add parameters to specify minimum table size for inclusion --- pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py b/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py index ad6ca0c1..660bfc1c 100644 --- a/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py +++ b/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py @@ -329,6 +329,8 @@ def to_markdown( show_progress=False, use_glyphs=False, ignore_alpha=False, + min_table_rows=2, + min_table_cols=2, ) -> str: """Process the document and return the text of the selected pages. @@ -354,6 +356,8 @@ def to_markdown( show_progress: (bool, False) print progress as each page is processed. use_glyphs: (bool, False) replace the Invalid Unicode by glyph numbers. ignore_alpha: (bool, True) ignore text with alpha = 0 (transparent). + min_table_rows: (int, 2) minimum number of rows for a table to be included. + min_table_cols: (int, 2) minimum number of columns for a table to be included. """ if write_images is False and embed_images is False and force_text is False: @@ -929,7 +933,7 @@ def sort_words(words: list) -> list: return nwords def get_page_output( - doc, pno, margins, textflags, FILENAME, IGNORE_IMAGES, IGNORE_GRAPHICS + doc, pno, margins, textflags, FILENAME, IGNORE_IMAGES, IGNORE_GRAPHICS, min_table_rows, min_table_cols ): """Process one page. @@ -1036,7 +1040,7 @@ def get_page_output( tabs = page.find_tables(clip=parms.clip, strategy=table_strategy) for t in tabs.tables: # remove tables with too few rows or columns - if t.row_count < 2 or t.col_count < 2: + if t.row_count < min_table_rows or t.col_count < min_table_cols: omitted_table_rects.append(pymupdf.Rect(t.bbox)) continue parms.tabs.append(t) @@ -1200,7 +1204,7 @@ def get_page_output( pages = ProgressBar(pages) for pno in pages: parms = get_page_output( - doc, pno, margins, textflags, FILENAME, IGNORE_IMAGES, IGNORE_GRAPHICS + doc, pno, margins, textflags, FILENAME, IGNORE_IMAGES, IGNORE_GRAPHICS, min_table_rows, min_table_cols ) if page_chunks is False: document_output += parms.md_string From ec081ca8d6f03c0e60aeab3a84dabfcdb60490cf Mon Sep 17 00:00:00 2001 From: tis-abe-akira Date: Tue, 23 Sep 2025 08:37:23 +0900 Subject: [PATCH 2/2] to_markdown: validate min_table_rows and min_table_cols parameters --- pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py b/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py index 660bfc1c..726078b8 100644 --- a/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py +++ b/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py @@ -360,6 +360,12 @@ def to_markdown( min_table_cols: (int, 2) minimum number of columns for a table to be included. """ + # Validate min_table_rows and min_table_cols parameters + if not isinstance(min_table_rows, int) or min_table_rows < 1: + raise ValueError("min_table_rows must be a positive integer (>= 1)") + if not isinstance(min_table_cols, int) or min_table_cols < 1: + raise ValueError("min_table_cols must be a positive integer (>= 1)") + if write_images is False and embed_images is False and force_text is False: raise ValueError("Image and text on images cannot both be suppressed.") if embed_images is True: