Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/main' into fix-threadsafe-pypdfium
Browse files Browse the repository at this point in the history
  • Loading branch information
dolfim-ibm committed Mar 2, 2025
2 parents 346a49c + db3ceef commit b3cf5d4
Show file tree
Hide file tree
Showing 18 changed files with 1,457 additions and 335 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/checks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ jobs:
run: |
for file in docs/examples/*.py; do
# Skip batch_convert.py
if [[ "$(basename "$file")" =~ ^(batch_convert|minimal|export_multimodal|custom_convert|develop_picture_enrichment|rapidocr_with_custom_models|offline_convert|pictures_description|pictures_description_api).py ]]; then
if [[ "$(basename "$file")" =~ ^(batch_convert|minimal_vlm_pipeline|minimal|export_multimodal|custom_convert|develop_picture_enrichment|rapidocr_with_custom_models|offline_convert|pictures_description|pictures_description_api).py ]]; then
echo "Skipping $file"
continue
fi
Expand Down
16 changes: 16 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,19 @@
## [v2.25.0](https://github.com/DS4SD/docling/releases/tag/v2.25.0) - 2025-02-26

### Feature

* [Experimental] Introduce VLM pipeline using HF AutoModelForVision2Seq, featuring SmolDocling model ([#1054](https://github.com/DS4SD/docling/issues/1054)) ([`3c9fe76`](https://github.com/DS4SD/docling/commit/3c9fe76b706b7714b25d49cb09050c42e3b8c849))
* **cli:** Add option for downloading all models, refine help messages ([#1061](https://github.com/DS4SD/docling/issues/1061)) ([`ab683e4`](https://github.com/DS4SD/docling/commit/ab683e4fb6df4973d2efda04f00c269a2dc95f5b))

### Fix

* Vlm using artifacts path ([#1057](https://github.com/DS4SD/docling/issues/1057)) ([`e197225`](https://github.com/DS4SD/docling/commit/e1972257399151503d60b4806976c8b9b6911aa8))
* **html:** Parse text in div elements as TextItem ([#1041](https://github.com/DS4SD/docling/issues/1041)) ([`1b0ead6`](https://github.com/DS4SD/docling/commit/1b0ead69078030a0e4d25b51450ef2aa4a2e79fc))

### Documentation

* Extend chunking docs, add FAQ on token limit ([#1053](https://github.com/DS4SD/docling/issues/1053)) ([`c84b973`](https://github.com/DS4SD/docling/commit/c84b973959a254db22ac9a7dc8810628e4808a2d))

## [v2.24.0](https://github.com/DS4SD/docling/releases/tag/v2.24.0) - 2025-02-20

### Feature
Expand Down
24 changes: 19 additions & 5 deletions docling/backend/html_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,10 +256,16 @@ def handle_list(self, element: Tag, doc: DoclingDocument) -> None:
parent=self.parents[self.level], name="list", label=GroupLabel.LIST
)
elif element.name == "ol":
start_attr = element.get("start")
start: int = (
int(start_attr)
if isinstance(start_attr, str) and start_attr.isnumeric()
else 1
)
# create a list group
self.parents[self.level + 1] = doc.add_group(
parent=self.parents[self.level],
name="ordered list",
name="ordered list" + (f" start {start}" if start != 1 else ""),
label=GroupLabel.ORDERED_LIST,
)
self.level += 1
Expand All @@ -270,15 +276,23 @@ def handle_list(self, element: Tag, doc: DoclingDocument) -> None:
self.level -= 1

def handle_list_item(self, element: Tag, doc: DoclingDocument) -> None:
"""Handles listitem tags (li)."""
"""Handles list item tags (li)."""
nested_list = element.find(["ul", "ol"])

parent = self.parents[self.level]
if parent is None:
_log.warning(f"list-item has no parent in DoclingDocument: {element}")
_log.debug(f"list-item has no parent in DoclingDocument: {element}")
return
parent_label: str = parent.label
index_in_list = len(parent.children) + 1
if (
parent_label == GroupLabel.ORDERED_LIST
and isinstance(parent, GroupItem)
and parent.name
):
start_in_list: str = parent.name.split(" ")[-1]
start: int = int(start_in_list) if start_in_list.isnumeric() else 1
index_in_list += start - 1

if nested_list:
# Text in list item can be hidden within hierarchy, hence
Expand Down Expand Up @@ -324,13 +338,13 @@ def handle_list_item(self, element: Tag, doc: DoclingDocument) -> None:
parent=parent,
)
else:
_log.warning(f"list-item has no text: {element}")
_log.debug(f"list-item has no text: {element}")

@staticmethod
def parse_table_data(element: Tag) -> Optional[TableData]:
nested_tables = element.find("table")
if nested_tables is not None:
_log.warning("Skipping nested table.")
_log.debug("Skipping nested table.")
return None

# Count the number of rows (number of <tr> elements)
Expand Down
32 changes: 28 additions & 4 deletions docling/cli/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,19 @@ class _AvailableModels(str, Enum):
CODE_FORMULA = "code_formula"
PICTURE_CLASSIFIER = "picture_classifier"
SMOLVLM = "smolvlm"
GRANITE_VISION = "granite_vision"
EASYOCR = "easyocr"


# Models downloaded by `docling models download` when neither explicit model
# names nor `--all` is given. Intentionally excludes the large VLM checkpoints
# (SMOLVLM, GRANITE_VISION), which must be requested explicitly.
_default_models = [
    _AvailableModels.LAYOUT,
    _AvailableModels.TABLEFORMER,
    _AvailableModels.CODE_FORMULA,
    _AvailableModels.PICTURE_CLASSIFIER,
    _AvailableModels.EASYOCR,
]


@app.command("download")
def download(
output_dir: Annotated[
Expand All @@ -43,18 +53,27 @@ def download(
...,
"-o",
"--output-dir",
help="The directory where all the models are downloaded.",
help="The directory where to download the models.",
),
] = (settings.cache_dir / "models"),
force: Annotated[
bool, typer.Option(..., help="If true, the download will be forced")
bool, typer.Option(..., help="If true, the download will be forced.")
] = False,
models: Annotated[
Optional[list[_AvailableModels]],
typer.Argument(
help=f"Models to download (default behavior: all will be downloaded)",
help=f"Models to download (default behavior: a predefined set of models will be downloaded).",
),
] = None,
all: Annotated[
bool,
typer.Option(
...,
"--all",
help="If true, all available models will be downloaded (mutually exclusive with passing specific models).",
show_default=True,
),
] = False,
quiet: Annotated[
bool,
typer.Option(
Expand All @@ -65,6 +84,10 @@ def download(
),
] = False,
):
if models and all:
raise typer.BadParameter(
"Cannot simultaneously set 'all' parameter and specify models to download."
)
if not quiet:
FORMAT = "%(message)s"
logging.basicConfig(
Expand All @@ -73,7 +96,7 @@ def download(
datefmt="[%X]",
handlers=[RichHandler(show_level=False, show_time=False, markup=True)],
)
to_download = models or [m for m in _AvailableModels]
to_download = models or ([m for m in _AvailableModels] if all else _default_models)
output_dir = download_models(
output_dir=output_dir,
force=force,
Expand All @@ -83,6 +106,7 @@ def download(
with_code_formula=_AvailableModels.CODE_FORMULA in to_download,
with_picture_classifier=_AvailableModels.PICTURE_CLASSIFIER in to_download,
with_smolvlm=_AvailableModels.SMOLVLM in to_download,
with_granite_vision=_AvailableModels.GRANITE_VISION in to_download,
with_easyocr=_AvailableModels.EASYOCR in to_download,
)

Expand Down
5 changes: 5 additions & 0 deletions docling/datamodel/base_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,10 @@ class LayoutPrediction(BaseModel):
clusters: List[Cluster] = []


class VlmPrediction(BaseModel):
    """Raw text emitted by a vision-language model for a single page."""

    # Generated page text; empty until a VLM has run on the page.
    text: str = ""


class ContainerElement(
BasePageElement
): # Used for Form and Key-Value-Regions, only for typing.
Expand Down Expand Up @@ -197,6 +201,7 @@ class PagePredictions(BaseModel):
tablestructure: Optional[TableStructurePrediction] = None
figures_classification: Optional[FigureClassificationPrediction] = None
equations_prediction: Optional[EquationPrediction] = None
vlm_response: Optional[VlmPrediction] = None


# Union of all element kinds that can appear in an assembled page.
PageElement = Union[TextElement, Table, FigureElement, ContainerElement]
Expand Down
63 changes: 62 additions & 1 deletion docling/datamodel/pipeline_options.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ class AcceleratorOptions(BaseSettings):

num_threads: int = 4
device: Union[str, AcceleratorDevice] = "auto"
cuda_use_flash_attention2: bool = False

@field_validator("device")
def validate_device(cls, value):
Expand Down Expand Up @@ -254,6 +255,45 @@ def repo_cache_folder(self) -> str:
)


class BaseVlmOptions(BaseModel):
    """Common options for any vision-language model backend."""

    # Discriminator string identifying the concrete options subclass.
    kind: str
    # Prompt sent to the model for each page image.
    prompt: str


class ResponseFormat(str, Enum):
    """Output format a VLM is expected to produce for a page."""

    DOCTAGS = "doctags"  # docling's structured tag markup
    MARKDOWN = "markdown"


class HuggingFaceVlmOptions(BaseVlmOptions):
    """Options for a VLM loaded from the Hugging Face hub."""

    kind: Literal["hf_model_options"] = "hf_model_options"

    # Hub repository id, e.g. "ds4sd/SmolDocling-256M-preview".
    repo_id: str
    # 8-bit quantized loading settings; only applied when `quantized` is True.
    load_in_8bit: bool = True
    llm_int8_threshold: float = 6.0
    quantized: bool = False

    # Format the model's text output is parsed as (doctags or markdown).
    response_format: ResponseFormat

    @property
    def repo_cache_folder(self) -> str:
        """Repo id flattened to a single path component ("org--name"),
        matching the Hugging Face local cache folder naming."""
        return self.repo_id.replace("/", "--")


# Preset for the SmolDocling model, which emits docling's doctags markup.
smoldocling_vlm_conversion_options = HuggingFaceVlmOptions(
    repo_id="ds4sd/SmolDocling-256M-preview",
    prompt="Convert this page to docling.",
    response_format=ResponseFormat.DOCTAGS,
)

# Preset for IBM Granite Vision, a general VLM prompted to OCR the page and
# producing plain markdown output.
granite_vision_vlm_conversion_options = HuggingFaceVlmOptions(
    repo_id="ibm-granite/granite-vision-3.1-2b-preview",
    # prompt="OCR the full page to markdown.",
    prompt="OCR this image.",
    response_format=ResponseFormat.MARKDOWN,
)


# Define an enum for the backend options
class PdfBackend(str, Enum):
"""Enum of valid PDF backends."""
Expand Down Expand Up @@ -285,7 +325,24 @@ class PipelineOptions(BaseModel):
enable_remote_services: bool = False


class PdfPipelineOptions(PipelineOptions):
class PaginatedPipelineOptions(PipelineOptions):
    """Options shared by pipelines that process page-oriented documents."""

    # Scale factor applied when rendering page/picture images (1.0 = 72 dpi
    # baseline -- TODO confirm the dpi baseline against the rendering backend).
    images_scale: float = 1.0
    # Whether to keep rendered images of full pages in the output document.
    generate_page_images: bool = False
    # Whether to keep cropped images of pictures in the output document.
    generate_picture_images: bool = False


class VlmPipelineOptions(PaginatedPipelineOptions):
    """Options for the VLM-based conversion pipeline."""

    # Optional local path with pre-downloaded model artifacts.
    artifacts_path: Optional[Union[Path, str]] = None

    # Page images are the VLM's input, so they are generated by default here
    # (overrides the False default of PaginatedPipelineOptions).
    generate_page_images: bool = True
    # If True, text extracted by the backend is used instead of the
    # VLM-generated text (to be used with VLMs, or other generative models).
    force_backend_text: bool = (
        False
    )
    # Which VLM to run. NOTE(review): single-member Union -- equivalent to a
    # plain HuggingFaceVlmOptions annotation; presumably kept as Union so more
    # backends can be added later.
    vlm_options: Union[HuggingFaceVlmOptions] = smoldocling_vlm_conversion_options


class PdfPipelineOptions(PaginatedPipelineOptions):
"""Options for the PDF pipeline."""

artifacts_path: Optional[Union[Path, str]] = None
Expand All @@ -295,6 +352,10 @@ class PdfPipelineOptions(PipelineOptions):
do_formula_enrichment: bool = False # True: perform formula OCR, return Latex code
do_picture_classification: bool = False # True: classify pictures in documents
do_picture_description: bool = False # True: run describe pictures in documents
force_backend_text: bool = (
False # (To be used with vlms, or other generative models)
)
# If True, text from backend will be used instead of generated text

table_structure_options: TableStructureOptions = TableStructureOptions()
ocr_options: Union[
Expand Down
Loading

0 comments on commit b3cf5d4

Please sign in to comment.