diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index e942e0f9..a797d8d8 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -4400,12 +4400,15 @@ def extract_items_range( "Start NodeItem must come before or be the same as the end NodeItem in the document structure." ) - new_doc = DoclingDocument(name=f"{self.name}- Extracted Range") - ref_items = start_parent.children[start_index:end_index] node_items = [ref.resolve(self) for ref in ref_items] - new_doc.add_node_items(node_items=node_items, doc=self) + doc_index = DoclingDocument._DocIndex() + for node_item in node_items: + doc_index.index(doc=self, root=node_item) + + new_doc = DoclingDocument(name="") + new_doc._update_from_index(doc_index) if delete: self.delete_items_range( @@ -6221,19 +6224,31 @@ def get_item_list(self, key: str) -> list[NodeItem]: return getattr(self, key) def index( - self, doc: "DoclingDocument", page_nrs: Optional[set[int]] = None + self, + doc: "DoclingDocument", + page_nrs: Optional[set[int]] = None, + root: Optional[NodeItem] = None, ) -> None: orig_ref_to_new_ref: dict[str, str] = {} - page_delta = self._max_page - min(doc.pages.keys()) + 1 if doc.pages else 0 + + if root: + if root.parent: + orig_ref_to_new_ref[root.parent.cref] = "#/body" + self._names.append(doc.name + root.self_ref) + page_delta = 0 + else: + self._names.append(doc.name) + page_delta = ( + self._max_page - min(doc.pages.keys()) + 1 if doc.pages else 0 + ) if self._body is None: self._body = GroupItem(**doc.body.model_dump(exclude={"children"})) - self._names.append(doc.name) - # collect items in traversal order for item, _ in doc._iterate_items_with_stack( + root=root, with_groups=True, traverse_pictures=True, included_content_layers={c for c in ContentLayer}, diff --git a/test/test_docling_doc.py b/test/test_docling_doc.py index aa962255..9805201e 100644 --- a/test/test_docling_doc.py +++ b/test/test_docling_doc.py @@ -1759,6 +1759,12 @@ def test_rich_tables(rich_table_doc): def test_doc_manipulation_with_rich_tables(rich_table_doc): + rich_table = rich_table_doc.tables[0] + extracted_doc = rich_table_doc.extract_items_range(start=rich_table, end=rich_table) + extracted_md = extracted_doc.export_to_markdown() + assert len(extracted_md) > 0 + assert len(extracted_doc.tables) == 2 + rich_table_doc.delete_items(node_items=[rich_table_doc.texts[0]]) exp_file = Path("test/data/doc/rich_table_post_text_del.out.yaml")