Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/main' into fix-threadsafe-pypdfium
Browse files Browse the repository at this point in the history
  • Loading branch information
dolfim-ibm committed Mar 2, 2025
2 parents 346a49c + db3ceef commit b3cf5d4
Show file tree
Hide file tree
Showing 18 changed files with 1,457 additions and 335 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/checks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ jobs:
run: |
for file in docs/examples/*.py; do
# Skip batch_convert.py
if [[ "$(basename "$file")" =~ ^(batch_convert|minimal|export_multimodal|custom_convert|develop_picture_enrichment|rapidocr_with_custom_models|offline_convert|pictures_description|pictures_description_api).py ]]; then
if [[ "$(basename "$file")" =~ ^(batch_convert|minimal_vlm_pipeline|minimal|export_multimodal|custom_convert|develop_picture_enrichment|rapidocr_with_custom_models|offline_convert|pictures_description|pictures_description_api).py ]]; then
echo "Skipping $file"
continue
fi
Expand Down
16 changes: 16 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,19 @@
## [v2.25.0](https://github.com/DS4SD/docling/releases/tag/v2.25.0) - 2025-02-26

### Feature

* [Experimental] Introduce VLM pipeline using HF AutoModelForVision2Seq, featuring SmolDocling model ([#1054](https://github.com/DS4SD/docling/issues/1054)) ([`3c9fe76`](https://github.com/DS4SD/docling/commit/3c9fe76b706b7714b25d49cb09050c42e3b8c849))
* **cli:** Add option for downloading all models, refine help messages ([#1061](https://github.com/DS4SD/docling/issues/1061)) ([`ab683e4`](https://github.com/DS4SD/docling/commit/ab683e4fb6df4973d2efda04f00c269a2dc95f5b))

### Fix

* Vlm using artifacts path ([#1057](https://github.com/DS4SD/docling/issues/1057)) ([`e197225`](https://github.com/DS4SD/docling/commit/e1972257399151503d60b4806976c8b9b6911aa8))
* **html:** Parse text in div elements as TextItem ([#1041](https://github.com/DS4SD/docling/issues/1041)) ([`1b0ead6`](https://github.com/DS4SD/docling/commit/1b0ead69078030a0e4d25b51450ef2aa4a2e79fc))

### Documentation

* Extend chunking docs, add FAQ on token limit ([#1053](https://github.com/DS4SD/docling/issues/1053)) ([`c84b973`](https://github.com/DS4SD/docling/commit/c84b973959a254db22ac9a7dc8810628e4808a2d))

## [v2.24.0](https://github.com/DS4SD/docling/releases/tag/v2.24.0) - 2025-02-20

### Feature
Expand Down
24 changes: 19 additions & 5 deletions docling/backend/html_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,10 +256,16 @@ def handle_list(self, element: Tag, doc: DoclingDocument) -> None:
parent=self.parents[self.level], name="list", label=GroupLabel.LIST
)
elif element.name == "ol":
start_attr = element.get("start")
start: int = (
int(start_attr)
if isinstance(start_attr, str) and start_attr.isnumeric()
else 1
)
# create a list group
self.parents[self.level + 1] = doc.add_group(
parent=self.parents[self.level],
name="ordered list",
name="ordered list" + (f" start {start}" if start != 1 else ""),
label=GroupLabel.ORDERED_LIST,
)
self.level += 1
Expand All @@ -270,15 +276,23 @@ def handle_list(self, element: Tag, doc: DoclingDocument) -> None:
self.level -= 1

def handle_list_item(self, element: Tag, doc: DoclingDocument) -> None:
"""Handles listitem tags (li)."""
"""Handles list item tags (li)."""
nested_list = element.find(["ul", "ol"])

parent = self.parents[self.level]
if parent is None:
_log.warning(f"list-item has no parent in DoclingDocument: {element}")
_log.debug(f"list-item has no parent in DoclingDocument: {element}")
return
parent_label: str = parent.label
index_in_list = len(parent.children) + 1
if (
parent_label == GroupLabel.ORDERED_LIST
and isinstance(parent, GroupItem)
and parent.name
):
start_in_list: str = parent.name.split(" ")[-1]
start: int = int(start_in_list) if start_in_list.isnumeric() else 1
index_in_list += start - 1

if nested_list:
# Text in list item can be hidden within hierarchy, hence
Expand Down Expand Up @@ -324,13 +338,13 @@ def handle_list_item(self, element: Tag, doc: DoclingDocument) -> None:
parent=parent,
)
else:
_log.warning(f"list-item has no text: {element}")
_log.debug(f"list-item has no text: {element}")

@staticmethod
def parse_table_data(element: Tag) -> Optional[TableData]:
nested_tables = element.find("table")
if nested_tables is not None:
_log.warning("Skipping nested table.")
_log.debug("Skipping nested table.")
return None

# Count the number of rows (number of <tr> elements)
Expand Down
32 changes: 28 additions & 4 deletions docling/cli/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,19 @@ class _AvailableModels(str, Enum):
CODE_FORMULA = "code_formula"
PICTURE_CLASSIFIER = "picture_classifier"
SMOLVLM = "smolvlm"
GRANITE_VISION = "granite_vision"
EASYOCR = "easyocr"


# Models downloaded by `docling models download` when neither explicit model
# names nor `--all` is given. Intentionally excludes the large VLM checkpoints
# (SMOLVLM, GRANITE_VISION), which must be requested explicitly.
_default_models = [
    _AvailableModels.LAYOUT,
    _AvailableModels.TABLEFORMER,
    _AvailableModels.CODE_FORMULA,
    _AvailableModels.PICTURE_CLASSIFIER,
    _AvailableModels.EASYOCR,
]


@app.command("download")
def download(
output_dir: Annotated[
Expand All @@ -43,18 +53,27 @@ def download(
...,
"-o",
"--output-dir",
help="The directory where all the models are downloaded.",
help="The directory where to download the models.",
),
] = (settings.cache_dir / "models"),
force: Annotated[
bool, typer.Option(..., help="If true, the download will be forced")
bool, typer.Option(..., help="If true, the download will be forced.")
] = False,
models: Annotated[
Optional[list[_AvailableModels]],
typer.Argument(
help=f"Models to download (default behavior: all will be downloaded)",
help=f"Models to download (default behavior: a predefined set of models will be downloaded).",
),
] = None,
all: Annotated[
bool,
typer.Option(
...,
"--all",
help="If true, all available models will be downloaded (mutually exclusive with passing specific models).",
show_default=True,
),
] = False,
quiet: Annotated[
bool,
typer.Option(
Expand All @@ -65,6 +84,10 @@ def download(
),
] = False,
):
if models and all:
raise typer.BadParameter(
"Cannot simultaneously set 'all' parameter and specify models to download."
)
if not quiet:
FORMAT = "%(message)s"
logging.basicConfig(
Expand All @@ -73,7 +96,7 @@ def download(
datefmt="[%X]",
handlers=[RichHandler(show_level=False, show_time=False, markup=True)],
)
to_download = models or [m for m in _AvailableModels]
to_download = models or ([m for m in _AvailableModels] if all else _default_models)
output_dir = download_models(
output_dir=output_dir,
force=force,
Expand All @@ -83,6 +106,7 @@ def download(
with_code_formula=_AvailableModels.CODE_FORMULA in to_download,
with_picture_classifier=_AvailableModels.PICTURE_CLASSIFIER in to_download,
with_smolvlm=_AvailableModels.SMOLVLM in to_download,
with_granite_vision=_AvailableModels.GRANITE_VISION in to_download,
with_easyocr=_AvailableModels.EASYOCR in to_download,
)

Expand Down
5 changes: 5 additions & 0 deletions docling/datamodel/base_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,10 @@ class LayoutPrediction(BaseModel):
clusters: List[Cluster] = []


class VlmPrediction(BaseModel):
    """Raw text emitted by a vision-language model for a single page."""

    # Generated page text; empty until a VLM has run on the page.
    text: str = ""


class ContainerElement(
BasePageElement
): # Used for Form and Key-Value-Regions, only for typing.
Expand Down Expand Up @@ -197,6 +201,7 @@ class PagePredictions(BaseModel):
tablestructure: Optional[TableStructurePrediction] = None
figures_classification: Optional[FigureClassificationPrediction] = None
equations_prediction: Optional[EquationPrediction] = None
vlm_response: Optional[VlmPrediction] = None


# Union of all element kinds that can appear in an assembled page.
PageElement = Union[TextElement, Table, FigureElement, ContainerElement]
Expand Down
63 changes: 62 additions & 1 deletion docling/datamodel/pipeline_options.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ class AcceleratorOptions(BaseSettings):

num_threads: int = 4
device: Union[str, AcceleratorDevice] = "auto"
cuda_use_flash_attention2: bool = False

@field_validator("device")
def validate_device(cls, value):
Expand Down Expand Up @@ -254,6 +255,45 @@ def repo_cache_folder(self) -> str:
)


class BaseVlmOptions(BaseModel):
    """Common options for any vision-language model backend."""

    # Discriminator string identifying the concrete options subclass.
    kind: str
    # Prompt sent to the model for each page image.
    prompt: str


class ResponseFormat(str, Enum):
    """Output format a VLM is expected to produce for a page."""

    DOCTAGS = "doctags"  # docling's structured tag markup
    MARKDOWN = "markdown"


class HuggingFaceVlmOptions(BaseVlmOptions):
    """Options for a VLM loaded from the Hugging Face hub."""

    kind: Literal["hf_model_options"] = "hf_model_options"

    # Hub repository id, e.g. "ds4sd/SmolDocling-256M-preview".
    repo_id: str
    # 8-bit quantized loading settings; only applied when `quantized` is True.
    load_in_8bit: bool = True
    llm_int8_threshold: float = 6.0
    quantized: bool = False

    # Format the model's text output is parsed as (doctags or markdown).
    response_format: ResponseFormat

    @property
    def repo_cache_folder(self) -> str:
        """Repo id flattened to a single path component ("org--name"),
        matching the Hugging Face local cache folder naming."""
        return self.repo_id.replace("/", "--")


# Preset for the SmolDocling model, which emits docling's doctags markup.
smoldocling_vlm_conversion_options = HuggingFaceVlmOptions(
    repo_id="ds4sd/SmolDocling-256M-preview",
    prompt="Convert this page to docling.",
    response_format=ResponseFormat.DOCTAGS,
)

# Preset for IBM Granite Vision, a general VLM prompted to OCR the page and
# producing plain markdown output.
granite_vision_vlm_conversion_options = HuggingFaceVlmOptions(
    repo_id="ibm-granite/granite-vision-3.1-2b-preview",
    # prompt="OCR the full page to markdown.",
    prompt="OCR this image.",
    response_format=ResponseFormat.MARKDOWN,
)


# Define an enum for the backend options
class PdfBackend(str, Enum):
"""Enum of valid PDF backends."""
Expand Down Expand Up @@ -285,7 +325,24 @@ class PipelineOptions(BaseModel):
enable_remote_services: bool = False


class PdfPipelineOptions(PipelineOptions):
class PaginatedPipelineOptions(PipelineOptions):
    """Options shared by pipelines that process page-oriented documents."""

    # Scale factor applied when rendering page/picture images (1.0 = 72 dpi
    # baseline -- TODO confirm the dpi baseline against the rendering backend).
    images_scale: float = 1.0
    # Whether to keep rendered images of full pages in the output document.
    generate_page_images: bool = False
    # Whether to keep cropped images of pictures in the output document.
    generate_picture_images: bool = False


class VlmPipelineOptions(PaginatedPipelineOptions):
    """Options for the VLM-based conversion pipeline."""

    # Optional local path with pre-downloaded model artifacts.
    artifacts_path: Optional[Union[Path, str]] = None

    # Page images are the VLM's input, so they are generated by default here
    # (overrides the False default of PaginatedPipelineOptions).
    generate_page_images: bool = True
    # If True, text extracted by the backend is used instead of the
    # VLM-generated text (to be used with VLMs, or other generative models).
    force_backend_text: bool = (
        False
    )
    # Which VLM to run. NOTE(review): single-member Union -- equivalent to a
    # plain HuggingFaceVlmOptions annotation; presumably kept as Union so more
    # backends can be added later.
    vlm_options: Union[HuggingFaceVlmOptions] = smoldocling_vlm_conversion_options


class PdfPipelineOptions(PaginatedPipelineOptions):
"""Options for the PDF pipeline."""

artifacts_path: Optional[Union[Path, str]] = None
Expand All @@ -295,6 +352,10 @@ class PdfPipelineOptions(PipelineOptions):
do_formula_enrichment: bool = False # True: perform formula OCR, return Latex code
do_picture_classification: bool = False # True: classify pictures in documents
do_picture_description: bool = False # True: run describe pictures in documents
force_backend_text: bool = (
False # (To be used with vlms, or other generative models)
)
# If True, text from backend will be used instead of generated text

table_structure_options: TableStructureOptions = TableStructureOptions()
ocr_options: Union[
Expand Down
Loading

0 comments on commit b3cf5d4

Please sign in to comment.