diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py
index 8b6796d6..10086917 100644
--- a/docling/datamodel/base_models.py
+++ b/docling/datamodel/base_models.py
@@ -265,3 +265,11 @@ class PipelineOptions(BaseModel):
     do_ocr: bool = False  # True: perform OCR, replace programmatic PDF text
 
     table_structure_options: TableStructureOptions = TableStructureOptions()
+
+
+class AssembleOptions(BaseModel):
+    """Options controlling the assemble step of the document conversion."""
+
+    keep_page_images: bool = (
+        False  # False: page images are removed in the assemble step
+    )
diff --git a/docling/document_converter.py b/docling/document_converter.py
index 95b30a06..9954bc9b 100644
--- a/docling/document_converter.py
+++ b/docling/document_converter.py
@@ -14,6 +14,7 @@
 from docling.backend.abstract_backend import PdfDocumentBackend
 from docling.datamodel.base_models import (
     AssembledUnit,
+    AssembleOptions,
     ConversionStatus,
     Page,
     PipelineOptions,
@@ -44,6 +45,7 @@ def __init__(
         pipeline_options: PipelineOptions = PipelineOptions(),
         pdf_backend: Type[PdfDocumentBackend] = DocumentConversionInput.DEFAULT_BACKEND,
         pipeline_cls: Type[BaseModelPipeline] = StandardModelPipeline,
+        assemble_options: AssembleOptions = AssembleOptions(),
     ):
         if not artifacts_path:
             artifacts_path = self.download_models_hf()
@@ -57,6 +59,7 @@ def __init__(
         self.page_assemble_model = PageAssembleModel(config={})
         self.glm_model = GlmModel(config={})
         self.pdf_backend = pdf_backend
+        self.assemble_options = assemble_options
 
     @staticmethod
     def download_models_hf(
@@ -174,17 +177,22 @@ def process_document(self, in_doc: InputDocument) -> ConvertedDocument:
                     pages_with_images,
                 )
 
+                # 4. Run pipeline stages
                 pipeline_pages = self.model_pipeline.apply(pages_with_cells)
 
-                # 7. Assemble page elements (per page)
+                # 5. Assemble page elements (per page)
                 assembled_pages = self.page_assemble_model(pipeline_pages)
 
                 # exhaust assembled_pages
                 for assembled_page in assembled_pages:
                     # Free up mem resources before moving on with next batch
-                    assembled_page.image = (
-                        None  # Comment this if you want to visualize page images
-                    )
+
+                    # Drop the page image to free memory, unless the caller
+                    # asked to keep it via AssembleOptions.keep_page_images
+                    if not self.assemble_options.keep_page_images:
+                        assembled_page.image = None
+
+                    # Unload backend
                     assembled_page._backend.unload()
 
                     all_assembled_pages.append(assembled_page)
diff --git a/examples/export_figures.py b/examples/export_figures.py
new file mode 100644
index 00000000..6cd98430
--- /dev/null
+++ b/examples/export_figures.py
@@ -0,0 +1,127 @@
+import logging
+import time
+from pathlib import Path
+from typing import Tuple, Type
+
+from docling.datamodel.base_models import (
+    AssembleOptions,
+    ConversionStatus,
+    FigureElement,
+    PageElement,
+    TableElement,
+)
+from docling.datamodel.document import ConvertedDocument, DocumentConversionInput
+from docling.document_converter import DocumentConverter
+
+_log = logging.getLogger(__name__)
+
+
+def export_page_images(
+    doc: ConvertedDocument,
+    output_dir: Path,
+):
+    """Save each page image of a converted document as a PNG file.
+
+    Files are written to ``output_dir`` (created if missing) and named
+    ``<input file stem>-<page_no + 1>.png``. Pages whose image is ``None`` are
+    skipped with a warning; the converter clears page images unless it was
+    created with ``AssembleOptions.keep_page_images`` set to ``True``.
+    """
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    doc_filename = doc.input.file.stem
+
+    for page in doc.pages:
+        page_no = page.page_no + 1
+        if page.image is None:
+            _log.warning(f"Page {page_no} of {doc.input.file} has no image.")
+            continue
+
+        page_image_filename = output_dir / f"{doc_filename}-{page_no}.png"
+        with page_image_filename.open("wb") as fp:
+            page.image.save(fp, format="PNG")
+
+
+def export_element_images(
+    doc: ConvertedDocument,
+    output_dir: Path,
+    allowed_element_types: Tuple[Type[PageElement], ...] = (FigureElement,),
+):
+    """Crop selected elements out of their page images and save them as PNGs.
+
+    Every element of ``doc.assembled.elements`` that is an instance of one of
+    ``allowed_element_types`` is cropped from the image of the page it lies on
+    and written to ``output_dir`` (created if missing) as
+    ``<input file stem>-element-<element index>.png``. Elements on pages whose
+    image is ``None`` are skipped with a warning.
+    """
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    doc_filename = doc.input.file.stem
+
+    for element_ix, element in enumerate(doc.assembled.elements):
+        if not isinstance(element, allowed_element_types):
+            continue
+
+        page = doc.pages[element.page_no]
+        if page.image is None:
+            _log.warning(f"Element {element_ix} skipped: its page has no image.")
+            continue
+
+        # Convert the bbox to a top-left origin before cropping the page image.
+        crop_bbox = element.cluster.bbox.to_top_left_origin(
+            page_height=page.size.height
+        )
+        cropped_im = page.image.crop(crop_bbox.as_tuple())
+
+        element_image_filename = (
+            output_dir / f"{doc_filename}-element-{element_ix}.png"
+        )
+        with element_image_filename.open("wb") as fp:
+            cropped_im.save(fp, format="PNG")
+
+
+def main():
+    """Convert the sample PDF and export its page, figure and table images."""
+    logging.basicConfig(level=logging.INFO)
+
+    input_doc_paths = [
+        Path("./test/data/2206.01062.pdf"),
+    ]
+    output_dir = Path("./scratch")
+
+    input_files = DocumentConversionInput.from_paths(input_doc_paths)
+
+    # Important: For operating with page images, we must keep them, otherwise the DocumentConverter
+    # will destroy them for cleaning up memory.
+    assemble_options = AssembleOptions(keep_page_images=True)
+
+    doc_converter = DocumentConverter(assemble_options=assemble_options)
+
+    start_time = time.time()
+
+    converted_docs = doc_converter.convert(input_files)
+
+    for doc in converted_docs:
+        if doc.status != ConversionStatus.SUCCESS:
+            _log.warning(f"Document {doc.input.file} failed to convert.")
+            continue
+
+        # Export page images
+        export_page_images(doc, output_dir=output_dir)
+
+        # Export figures and tables; pass allowed_element_types=(FigureElement,)
+        # to export figures only.
+        export_element_images(
+            doc,
+            output_dir=output_dir,
+            allowed_element_types=(FigureElement, TableElement),
+        )
+
+    elapsed = time.time() - start_time
+
+    _log.info(f"All documents were converted in {elapsed:.2f} seconds.")
+
+
+if __name__ == "__main__":
+    main()