Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: Test cases for RTL programmatic PDFs and fixes for the formula model #903

Merged
merged 22 commits
Feb 7, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions docling/models/code_formula_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from pathlib import Path
from typing import Iterable, List, Literal, Optional, Tuple, Union

import numpy as np
from docling_core.types.doc import (
CodeItem,
DocItemLabel,
Expand Down Expand Up @@ -103,7 +104,7 @@ def __init__(
artifacts_path = artifacts_path / self._model_repo_folder

self.code_formula_model = CodeFormulaPredictor(
artifacts_path=artifacts_path,
artifacts_path=str(artifacts_path),
device=device,
num_threads=accelerator_options.num_threads,
)
Expand All @@ -123,7 +124,7 @@ def download_models(
repo_id="ds4sd/CodeFormula",
force_download=force,
local_dir=local_dir,
revision="v1.0.0",
revision="v1.0.1",
)

return Path(download_path)
Expand Down Expand Up @@ -231,7 +232,7 @@ def __call__(
return

labels: List[str] = []
images: List[Image.Image] = []
images: List[Union[Image.Image, np.ndarray]] = []
elements: List[TextItem] = []
for el in element_batch:
assert isinstance(el.item, TextItem)
Expand Down
5 changes: 3 additions & 2 deletions docling/models/document_picture_classifier.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from pathlib import Path
from typing import Iterable, List, Literal, Optional, Tuple, Union

import numpy as np
from docling_core.types.doc import (
DoclingDocument,
NodeItem,
Expand Down Expand Up @@ -94,7 +95,7 @@ def __init__(
artifacts_path = artifacts_path / self._model_repo_folder

self.document_picture_classifier = DocumentFigureClassifierPredictor(
artifacts_path=artifacts_path,
artifacts_path=str(artifacts_path),
device=device,
num_threads=accelerator_options.num_threads,
)
Expand Down Expand Up @@ -161,7 +162,7 @@ def __call__(
yield element
return

images: List[Image.Image] = []
images: List[Union[Image.Image, np.ndarray]] = []
elements: List[PictureItem] = []
for el in element_batch:
assert isinstance(el, PictureItem)
Expand Down
4 changes: 3 additions & 1 deletion docling/models/layout_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,10 +150,12 @@ def __call__(
else:
with TimeRecorder(conv_res, "layout"):
assert page.size is not None
page_image = page.get_image(scale=1.0)
assert page_image is not None

clusters = []
for ix, pred_item in enumerate(
self.layout_predictor.predict(page.get_image(scale=1.0))
self.layout_predictor.predict(page_image)
):
label = DocItemLabel(
pred_item["label"]
Expand Down
8 changes: 4 additions & 4 deletions docs/examples/batch_convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,10 +103,10 @@ def main():
logging.basicConfig(level=logging.INFO)

input_doc_paths = [
Path("./tests/data/2206.01062.pdf"),
Path("./tests/data/2203.01017v2.pdf"),
Path("./tests/data/2305.03393v1.pdf"),
Path("./tests/data/redp5110_sampled.pdf"),
Path("./tests/data/pdf/2206.01062.pdf"),
Path("./tests/data/pdf/2203.01017v2.pdf"),
Path("./tests/data/pdf/2305.03393v1.pdf"),
Path("./tests/data/pdf/redp5110_sampled.pdf"),
]

# buf = BytesIO(Path("./tests/data/pdf/2206.01062.pdf").open("rb").read())
Expand Down
2 changes: 1 addition & 1 deletion docs/examples/custom_convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
def main():
logging.basicConfig(level=logging.INFO)

input_doc_path = Path("./tests/data/2206.01062.pdf")
input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")

###########################################################################

Expand Down
2 changes: 1 addition & 1 deletion docs/examples/develop_formula_understanding.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ def get_default_options(cls) -> ExampleFormulaUnderstandingPipelineOptions:
def main():
logging.basicConfig(level=logging.INFO)

input_doc_path = Path("./tests/data/2203.01017v2.pdf")
input_doc_path = Path("./tests/data/pdf/2203.01017v2.pdf")

pipeline_options = ExampleFormulaUnderstandingPipelineOptions()
pipeline_options.do_formula_understanding = True
Expand Down
2 changes: 1 addition & 1 deletion docs/examples/develop_picture_enrichment.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ def get_default_options(cls) -> ExamplePictureClassifierPipelineOptions:
def main():
logging.basicConfig(level=logging.INFO)

input_doc_path = Path("./tests/data/2206.01062.pdf")
input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")

pipeline_options = ExamplePictureClassifierPipelineOptions()
pipeline_options.images_scale = 2.0
Expand Down
2 changes: 1 addition & 1 deletion docs/examples/export_figures.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
def main():
logging.basicConfig(level=logging.INFO)

input_doc_path = Path("./tests/data/2206.01062.pdf")
input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
output_dir = Path("scratch")

# Important: For operating with page images, we must keep them, otherwise the DocumentConverter
Expand Down
2 changes: 1 addition & 1 deletion docs/examples/export_multimodal.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
def main():
logging.basicConfig(level=logging.INFO)

input_doc_path = Path("./tests/data/2206.01062.pdf")
input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
output_dir = Path("scratch")

# Important: For operating with page images, we must keep them, otherwise the DocumentConverter
Expand Down
2 changes: 1 addition & 1 deletion docs/examples/export_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
def main():
logging.basicConfig(level=logging.INFO)

input_doc_path = Path("./tests/data/2206.01062.pdf")
input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
output_dir = Path("scratch")

doc_converter = DocumentConverter()
Expand Down
2 changes: 1 addition & 1 deletion docs/examples/full_page_ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@


def main():
input_doc = Path("./tests/data/2206.01062.pdf")
input_doc = Path("./tests/data/pdf/2206.01062.pdf")

pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = True
Expand Down
2 changes: 1 addition & 1 deletion docs/examples/inspect_picture_content.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

source = "tests/data/amt_handbook_sample.pdf"
source = "tests/data/pdf/amt_handbook_sample.pdf"

pipeline_options = PdfPipelineOptions()
pipeline_options.images_scale = 2
Expand Down
2 changes: 1 addition & 1 deletion docs/examples/run_with_accelerator.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@


def main():
input_doc = Path("./tests/data/2206.01062.pdf")
input_doc = Path("./tests/data/pdf/2206.01062.pdf")

# Explicitly set the accelerator
# accelerator_options = AcceleratorOptions(
Expand Down
5 changes: 2 additions & 3 deletions docs/examples/run_with_formats.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,8 @@ def main():
Path("tests/data/docx/lorem_ipsum.docx"),
Path("tests/data/pptx/powerpoint_sample.pptx"),
Path("tests/data/2305.03393v1-pg9-img.png"),
Path("tests/data/2206.01062.pdf"),
Path("tests/data/test_01.asciidoc"),
Path("tests/data/test_01.asciidoc"),
Path("tests/data/pdf/2206.01062.pdf"),
Path("tests/data/asciidoc/test_01.asciidoc"),
]

## for defaults use:
Expand Down
2 changes: 1 addition & 1 deletion docs/examples/tesseract_lang_detection.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@


def main():
input_doc = Path("./tests/data/2206.01062.pdf")
input_doc = Path("./tests/data/pdf/2206.01062.pdf")

# Set lang=["auto"] with a tesseract OCR engine: TesseractOcrOptions, TesseractCliOcrOptions
# ocr_options = TesseractOcrOptions(lang=["auto"])
Expand Down
2 changes: 1 addition & 1 deletion docs/examples/translate.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def translate(text: str, src: str = "en", dest: str = "de"):
def main():
logging.basicConfig(level=logging.INFO)

input_doc_path = Path("./tests/data/2206.01062.pdf")
input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
output_dir = Path("scratch")

# Important: For operating with page images, we must keep them, otherwise the DocumentConverter
Expand Down
10 changes: 5 additions & 5 deletions docs/v2.md
Original file line number Diff line number Diff line change
Expand Up @@ -117,12 +117,12 @@ conv_result: ConversionResult = doc_converter.convert("https://arxiv.org/pdf/240
## Convert several files at once:

input_files = [
"tests/data/wiki_duck.html",
"tests/data/word_sample.docx",
"tests/data/lorem_ipsum.docx",
"tests/data/powerpoint_sample.pptx",
"tests/data/html/wiki_duck.html",
"tests/data/docx/word_sample.docx",
"tests/data/docx/lorem_ipsum.docx",
"tests/data/pptx/powerpoint_sample.pptx",
"tests/data/2305.03393v1-pg9-img.png",
"tests/data/2206.01062.pdf",
"tests/data/pdf/2206.01062.pdf",
]

# Directly pass list of files or streams to `convert_all`
Expand Down
Loading