From 0cd81a81226c0d4aa4f20e4e58c3b33e4fe50ce0 Mon Sep 17 00:00:00 2001 From: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> Date: Mon, 3 Feb 2025 10:20:03 +0100 Subject: [PATCH] fix(docx): merged table cells not properly converted (#857) * fix(docx): merged cells not properly converted Fix conversion issue of merged cells in Word tables leading to repeated text. Simplify Word table conversion code. Add docx file with several table formats for regression tests. Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> * chore: add type hinting to docx backend Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> --------- Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> --- docling/backend/msword_backend.py | 265 +- tests/data/docx/word_tables.docx | Bin 0 -> 16404 bytes .../docling_v2/word_tables.docx.html | 75 + .../docling_v2/word_tables.docx.itxt | 19 + .../docling_v2/word_tables.docx.json | 2356 +++++++++++++++++ .../docling_v2/word_tables.docx.md | 44 + tests/test_backend_msword.py | 5 +- word_tables.html | 75 + 8 files changed, 2715 insertions(+), 124 deletions(-) create mode 100644 tests/data/docx/word_tables.docx create mode 100644 tests/data/groundtruth/docling_v2/word_tables.docx.html create mode 100644 tests/data/groundtruth/docling_v2/word_tables.docx.itxt create mode 100644 tests/data/groundtruth/docling_v2/word_tables.docx.json create mode 100644 tests/data/groundtruth/docling_v2/word_tables.docx.md create mode 100644 word_tables.html diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py index 02f8c86d2..4d4026e3f 100644 --- a/docling/backend/msword_backend.py +++ b/docling/backend/msword_backend.py @@ -2,21 +2,28 @@ import re from io import BytesIO from pathlib import Path -from typing import Set, Union +from typing import Any, Optional, Union -import docx from docling_core.types.doc import ( DocItemLabel, DoclingDocument, DocumentOrigin, GroupLabel, ImageRef, + NodeItem, TableCell, TableData, ) +from docx import Document +from docx.document import Document as DocxDocument +from docx.oxml.table import CT_Tc +from docx.oxml.xmlchemy import BaseOxmlElement +from docx.table import Table, _Cell +from docx.text.paragraph import Paragraph from lxml import etree from lxml.etree import XPath from PIL import Image, UnidentifiedImageError +from typing_extensions import override from docling.backend.abstract_backend import DeclarativeDocumentBackend from docling.datamodel.base_models import InputFormat @@ -26,7 +33,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): - def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]): + @override + def __init__( + self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path] + ) -> None: super().__init__(in_doc, path_or_stream) self.XML_KEY = ( "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val" @@ -36,19 +46,19 @@ def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path] } # self.initialise(path_or_stream) # Word file: - self.path_or_stream = path_or_stream - self.valid = False + self.path_or_stream: Union[BytesIO, Path] = path_or_stream + self.valid: bool = False # Initialise the parents for the hierarchy - self.max_levels = 10 - self.level_at_new_list = None - self.parents = {} # type: ignore + self.max_levels: int = 10 + self.level_at_new_list: Optional[int] = None + self.parents: dict[int, Optional[NodeItem]] = {} for i in range(-1, self.max_levels): self.parents[i] = None self.level = 0 self.listIter = 0 - self.history = { + self.history: dict[str, Any] = { "names": [None], "levels": [None], "numids": [None], @@ -58,9 +68,9 @@ def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path] self.docx_obj = None try: if isinstance(self.path_or_stream, BytesIO): - self.docx_obj = docx.Document(self.path_or_stream) + self.docx_obj = Document(self.path_or_stream) elif isinstance(self.path_or_stream, Path): - self.docx_obj = docx.Document(str(self.path_or_stream)) + self.docx_obj = Document(str(self.path_or_stream)) self.valid = True except Exception as e: @@ -68,13 +78,16 @@ def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path] f"MsPowerpointDocumentBackend could not load document with hash {self.document_hash}" ) from e + @override def is_valid(self) -> bool: return self.valid @classmethod + @override def supports_pagination(cls) -> bool: return False + @override def unload(self): if isinstance(self.path_or_stream, BytesIO): self.path_or_stream.close() @@ -82,11 +95,17 @@ def unload(self): self.path_or_stream = None @classmethod - def supported_formats(cls) -> Set[InputFormat]: + @override + def supported_formats(cls) -> set[InputFormat]: return {InputFormat.DOCX} + @override def convert(self) -> DoclingDocument: - # Parses the DOCX into a structured document model. + """Parses the DOCX into a structured document model. + + Returns: + The parsed document. + """ origin = DocumentOrigin( filename=self.file.name or "file", @@ -104,23 +123,29 @@ def convert(self) -> DoclingDocument: f"Cannot convert doc with {self.document_hash} because the backend failed to init." ) - def update_history(self, name, level, numid, ilevel): + def update_history( + self, + name: str, + level: Optional[int], + numid: Optional[int], + ilevel: Optional[int], + ): self.history["names"].append(name) self.history["levels"].append(level) self.history["numids"].append(numid) self.history["indents"].append(ilevel) - def prev_name(self): + def prev_name(self) -> Optional[str]: return self.history["names"][-1] - def prev_level(self): + def prev_level(self) -> Optional[int]: return self.history["levels"][-1] - def prev_numid(self): + def prev_numid(self) -> Optional[int]: return self.history["numids"][-1] - def prev_indent(self): + def prev_indent(self) -> Optional[int]: return self.history["indents"][-1] def get_level(self) -> int: @@ -130,7 +155,12 @@ def get_level(self) -> int: return k return 0 - def walk_linear(self, body, docx_obj, doc) -> DoclingDocument: + def walk_linear( + self, + body: BaseOxmlElement, + docx_obj: DocxDocument, + doc: DoclingDocument, + ) -> DoclingDocument: for element in body: tag_name = etree.QName(element).localname # Check for Inline Images (blip elements) @@ -150,7 +180,7 @@ def walk_linear(self, body, docx_obj, doc) -> DoclingDocument: _log.debug("could not parse a table, broken docx table") elif drawing_blip: - self.handle_pictures(element, docx_obj, drawing_blip, doc) + self.handle_pictures(docx_obj, drawing_blip, doc) # Check for the sdt containers, like table of contents elif tag_name in ["sdt"]: sdt_content = element.find(".//w:sdtContent", namespaces=namespaces) @@ -167,7 +197,7 @@ def walk_linear(self, body, docx_obj, doc) -> DoclingDocument: _log.debug(f"Ignoring element in DOCX with tag: {tag_name}") return doc - def str_to_int(self, s, default=0): + def str_to_int(self, s: Optional[str], default: Optional[int] = 0) -> Optional[int]: if s is None: return None try: @@ -175,7 +205,7 @@ def str_to_int(self, s, default=0): except ValueError: return default - def split_text_and_number(self, input_string): + def split_text_and_number(self, input_string: str) -> list[str]: match = re.match(r"(\D+)(\d+)$|^(\d+)(\D+)", input_string) if match: parts = list(filter(None, match.groups())) @@ -183,7 +213,9 @@ def split_text_and_number(self, input_string): else: return [input_string] - def get_numId_and_ilvl(self, paragraph): + def get_numId_and_ilvl( + self, paragraph: Paragraph + ) -> tuple[Optional[int], Optional[int]]: # Access the XML element of the paragraph numPr = paragraph._element.find( ".//w:numPr", namespaces=paragraph._element.nsmap @@ -196,13 +228,11 @@ def get_numId_and_ilvl(self, paragraph): numId = numId_elem.get(self.XML_KEY) if numId_elem is not None else None ilvl = ilvl_elem.get(self.XML_KEY) if ilvl_elem is not None else None - return self.str_to_int(numId, default=None), self.str_to_int( - ilvl, default=None - ) + return self.str_to_int(numId, None), self.str_to_int(ilvl, None) return None, None # If the paragraph is not part of a list - def get_label_and_level(self, paragraph): + def get_label_and_level(self, paragraph: Paragraph) -> tuple[str, Optional[int]]: if paragraph.style is None: return "Normal", None label = paragraph.style.style_id @@ -218,20 +248,25 @@ def get_label_and_level(self, paragraph): if "Heading" in label and len(parts) == 2: parts.sort() - label_str = "" - label_level = 0 + label_str: str = "" + label_level: Optional[int] = 0 if parts[0] == "Heading": label_str = parts[0] - label_level = self.str_to_int(parts[1], default=None) + label_level = self.str_to_int(parts[1], None) if parts[1] == "Heading": label_str = parts[1] - label_level = self.str_to_int(parts[0], default=None) + label_level = self.str_to_int(parts[0], None) return label_str, label_level else: return label, None - def handle_text_elements(self, element, docx_obj, doc): - paragraph = docx.text.paragraph.Paragraph(element, docx_obj) + def handle_text_elements( + self, + element: BaseOxmlElement, + docx_obj: DocxDocument, + doc: DoclingDocument, + ) -> None: + paragraph = Paragraph(element, docx_obj) if paragraph.text is None: return @@ -255,11 +290,7 @@ def handle_text_elements(self, element, docx_obj, doc): and p_style_id not in ["Title", "Heading"] ): self.add_listitem( - element, - docx_obj, doc, - p_style_id, - p_level, numid, ilevel, text, @@ -284,13 +315,13 @@ def handle_text_elements(self, element, docx_obj, doc): self.level = 0 if p_style_id in ["Title"]: - for key, val in self.parents.items(): + for key in range(len(self.parents)): self.parents[key] = None self.parents[0] = doc.add_text( parent=None, label=DocItemLabel.TITLE, text=text ) elif "Heading" in p_style_id: - self.add_header(element, docx_obj, doc, p_style_id, p_level, text) + self.add_header(doc, p_level, text) elif p_style_id in [ "Paragraph", @@ -318,7 +349,9 @@ def handle_text_elements(self, element, docx_obj, doc): self.update_history(p_style_id, p_level, numid, ilevel) return - def add_header(self, element, docx_obj, doc, curr_name, curr_level, text: str): + def add_header( + self, doc: DoclingDocument, curr_level: Optional[int], text: str + ) -> None: level = self.get_level() if isinstance(curr_level, int): if curr_level > level: @@ -331,7 +364,7 @@ def add_header(self, element, docx_obj, doc, curr_name, curr_level, text: str): ) elif curr_level < level: # remove the tail - for key, val in self.parents.items(): + for key in range(len(self.parents)): if key >= curr_level: self.parents[key] = None @@ -350,22 +383,18 @@ def add_header(self, element, docx_obj, doc, curr_name, curr_level, text: str): def add_listitem( self, - element, - docx_obj, - doc, - p_style_id, - p_level, - numid, - ilevel, + doc: DoclingDocument, + numid: int, + ilevel: int, text: str, - is_numbered=False, - ): - # is_numbered = is_numbered + is_numbered: bool = False, + ) -> None: enum_marker = "" level = self.get_level() + prev_indent = self.prev_indent() if self.prev_numid() is None: # Open new list - self.level_at_new_list = level # type: ignore + self.level_at_new_list = level self.parents[level] = doc.add_group( label=GroupLabel.LIST, name="list", parent=self.parents[level - 1] @@ -384,10 +413,13 @@ def add_listitem( ) elif ( - self.prev_numid() == numid and self.prev_indent() < ilevel + self.prev_numid() == numid + and self.level_at_new_list is not None + and prev_indent is not None + and prev_indent < ilevel ): # Open indented list for i in range( - self.level_at_new_list + self.prev_indent() + 1, + self.level_at_new_list + prev_indent + 1, self.level_at_new_list + ilevel + 1, ): # Determine if this is an unordered list or an ordered list. @@ -416,7 +448,12 @@ def add_listitem( text=text, ) - elif self.prev_numid() == numid and ilevel < self.prev_indent(): # Close list + elif ( + self.prev_numid() == numid + and self.level_at_new_list is not None + and prev_indent is not None + and ilevel < prev_indent + ): # Close list for k, v in self.parents.items(): if k > self.level_at_new_list + ilevel: self.parents[k] = None @@ -434,7 +471,7 @@ def add_listitem( ) self.listIter = 0 - elif self.prev_numid() == numid or self.prev_indent() == ilevel: + elif self.prev_numid() == numid or prev_indent == ilevel: # TODO: Set marker and enumerated arguments if this is an enumeration element. self.listIter += 1 if is_numbered: @@ -448,31 +485,16 @@ def add_listitem( ) return - def handle_tables(self, element, docx_obj, doc): - - # Function to check if a cell has a colspan (gridSpan) - def get_colspan(cell): - grid_span = cell._element.xpath("@w:gridSpan") - if grid_span: - return int(grid_span[0]) # Return the number of columns spanned - return 1 # Default is 1 (no colspan) - - # Function to check if a cell has a rowspan (vMerge) - def get_rowspan(cell): - v_merge = cell._element.xpath("@w:vMerge") - if v_merge: - return v_merge[ - 0 - ] # 'restart' indicates the beginning of a rowspan, others are continuation - return 1 - - table = docx.table.Table(element, docx_obj) - + def handle_tables( + self, + element: BaseOxmlElement, + docx_obj: DocxDocument, + doc: DoclingDocument, + ) -> None: + table: Table = Table(element, docx_obj) num_rows = len(table.rows) - num_cols = 0 - for row in table.rows: - # Calculate the max number of columns - num_cols = max(num_cols, sum(get_colspan(cell) for cell in row.cells)) + num_cols = len(table.columns) + _log.debug(f"Table grid with {num_rows} rows and {num_cols} columns") if num_rows == 1 and num_cols == 1: cell_element = table.rows[0].cells[0] @@ -481,59 +503,56 @@ def get_rowspan(cell): self.walk_linear(cell_element._element, docx_obj, doc) return - # Initialize the table grid - table_grid = [[None for _ in range(num_cols)] for _ in range(num_rows)] - - data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[]) - + data = TableData(num_rows=num_rows, num_cols=num_cols) + cell_set: set[CT_Tc] = set() for row_idx, row in enumerate(table.rows): + _log.debug(f"Row index {row_idx} with {len(row.cells)} populated cells") col_idx = 0 - for c, cell in enumerate(row.cells): - row_span = get_rowspan(cell) - col_span = get_colspan(cell) - - cell_text = cell.text - # In case cell doesn't return text via docx library: - if len(cell_text) == 0: - cell_xml = cell._element - - texts = [""] - for elem in cell_xml.iter(): - if elem.tag.endswith("t"): # tags that contain text - if elem.text: - texts.append(elem.text) - # Join the collected text - cell_text = " ".join(texts).strip() - - # Find the next available column in the grid - while table_grid[row_idx][col_idx] is not None: - col_idx += 1 - - # Fill the grid with the cell value, considering rowspan and colspan - for i in range(row_span if row_span == "restart" else 1): - for j in range(col_span): - table_grid[row_idx + i][col_idx + j] = "" - - cell = TableCell( - text=cell_text, - row_span=row_span, - col_span=col_span, - start_row_offset_idx=row_idx, - end_row_offset_idx=row_idx + row_span, + while col_idx < num_cols: + cell: _Cell = row.cells[col_idx] + _log.debug( + f" col {col_idx} grid_span {cell.grid_span} grid_cols_before {row.grid_cols_before}" + ) + if cell is None or cell._tc in cell_set: + _log.debug(f" skipped since repeated content") + col_idx += cell.grid_span + continue + else: + cell_set.add(cell._tc) + + spanned_idx = row_idx + spanned_tc: Optional[CT_Tc] = cell._tc + while spanned_tc == cell._tc: + spanned_idx += 1 + spanned_tc = ( + table.rows[spanned_idx].cells[col_idx]._tc + if spanned_idx < num_rows + else None + ) + _log.debug(f" spanned before row {spanned_idx}") + + table_cell = TableCell( + text=cell.text, + row_span=spanned_idx - row_idx, + col_span=cell.grid_span, + start_row_offset_idx=row.grid_cols_before + row_idx, + end_row_offset_idx=row.grid_cols_before + spanned_idx, start_col_offset_idx=col_idx, - end_col_offset_idx=col_idx + col_span, + end_col_offset_idx=col_idx + cell.grid_span, col_header=False, row_header=False, ) - - data.table_cells.append(cell) + data.table_cells.append(table_cell) + col_idx += cell.grid_span level = self.get_level() doc.add_table(data=data, parent=self.parents[level - 1]) return - def handle_pictures(self, element, docx_obj, drawing_blip, doc): - def get_docx_image(element, drawing_blip): + def handle_pictures( + self, docx_obj: DocxDocument, drawing_blip: Any, doc: DoclingDocument + ) -> None: + def get_docx_image(drawing_blip): rId = drawing_blip[0].get( "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed" ) @@ -546,7 +565,7 @@ def get_docx_image(element, drawing_blip): level = self.get_level() # Open the BytesIO object with PIL to create an Image try: - image_data = get_docx_image(element, drawing_blip) + image_data = get_docx_image(drawing_blip) image_bytes = BytesIO(image_data) pil_image = Image.open(image_bytes) doc.add_picture( diff --git a/tests/data/docx/word_tables.docx b/tests/data/docx/word_tables.docx new file mode 100644 index 0000000000000000000000000000000000000000..1513796e09a42730b23a5faf788dc96c60995469 GIT binary patch literal 16404 zcmeHuWpE}-uH|QDW^OaIncK|F%*@Qt<~B1kGuv%!Gcz+Y+ihm{dhVOqxpQZBH)8+3 zR}rO%%BrKR%92tkr6eZ_0*VR%2S5S<07AfK!Hks#5CHH63;;j@KmuzD+S)i7+c@be zx!V~#YSX$|TYbp`1*XUW0DnIJKgWOL5vWfXwdtWp6n;p2Mu`2TY;crUMg<(slSHF% z41wtZtnwB&*!JFm0xYi#6bo%dM9TEA!K51%=e$O8OgvJIOCz$DHl!^5Pzi zj}aExb^`qd|1hhs6?0oVmJ}-xJ#j;C+_)mNY({>DKbqPjAc`t6T7~3$BoltXD>~EG zL;D6u@&>6LrECKP4!lkdGE98!B8!)e7!jt?vg53pX`)nsUOj5)iRbZLc7kWJT#Uo2 zza!pd6gDPrikxZ`#Hka36SMr+p#h>ISN+oPYELCC_vW7k)H18MTer%Er1j4$G(vPh zu@h@)d6jCp>scQZ5JizdZCt{;t9S^Mv0K;x&O@01G%4O{k*m=*l!Cmq&=l+@E)mOh zs8}u!0HuTOjlNUEYC>YOBf#^DO|w@vM(_y4Zs0B*dL{4jq`GaEnk-7!PNc`HHt z5<5v5F037D;Yp`g^QFxVkQmrj$l$hMw*#AgaBuQ9LM%=>;vcS#7B-0u9{U)QGTw!< zPbGA}UkqbTL^(NUNK%)dWv{&WkmAjhlgco^U?HaI0a3;++!2J=GJ)j{ZNgiV%tR9t z-Lp}v-KTUC-^3VcL3u}wZADK{r97b=wQ#nVI!S>ydM z>;Q|G^@32nUU%CTCqtY6=F>3!yM7c<|K3T50{}or005-VL2WwyW&u-nx1p{@V`=ZSl}Wct;#eU9fUet*w(`l_Vn8qcMpEsk)scq|3FQ)BRz1 zY^SgW;c@W8H?fDFTy2lZx;|s3{>2pP(!2e|gs+f6{$>nOSUN3y!u05GT2cvJyV&Mxy`cDqrzr8biip}nio7H3U3fkRX>(ko2=BL(+BlC!}p3iKUv4`Z6OPLp>iwvLZc`HBw^}H&sQl zA&tC|NYWy{VlH#D4vtTkAK2!OL4HG@m^}PuhEKLhkW5gpWt<{oTU=-UOk@nK5&~5m zet{=JvZ*1NvMX*0<;fgPJ`F1iO?g$kZCKkhy!r9dTNmw(%!BtstrKZH&G>6woyc^w zQ=>$ZpuDw1&(Zw>G4vD?c^S92k`nGLA)Re&)hepcj$oT`x(1_6UEQ3;y0tV#y*-j&Rn8 zh8a26s*lYRQ0`njOtQhk3>qk;5SPO8Q=Lm^m5G$eqL{e_{gIJQzp0J4GXFiPFM2ex z{@8rxeJSPCR5(jxXoS;q8M0F5>)2wj9VN|^r{~S!8<9-UYASW?X$QzM2n06Johqx* zU2dlcuTzT!?^AMX>NBG|E=4^1E4BERHJ7`5wxqlN8g(zptKdB$QOQ0sUl50K%al?f zqYaFRwP?|XSkJeomsvyrj6n{;SX|o?t!GAIt7}I>oKui4^4k@QSL$%wCRbps_&a`J zSaZ+z(G6Yx9n-Vy^OI)11jOX*t$s||Nc9{_sW(u&Ypbx-US^{ zIrUY4i%z{ z2R-uqor&yu*c;^Yj@Zp5Z$eeKstBiyxAkgl0PD!IinO}-&HdfIa=ufnE7y7ogz#qi zDTa|^AdOfO4u(PQdr3PSQOOA+JUJA4NDud1n!79phw92B~0xUZPZbgsr6iS zIEn`l5iZMC@S!~T1h$pMFTd7{TF)M9SV}JE`2(5lv{!3>@f%N>%#lM|fLLti+31P>@LVmEZumYeoMGrMAiuefb6j+IkxS3QW`3d;Gao1({*U~CM3QtXj>PS zfEQ*}Ft;b!mNy=Y)GY(Dx{JRAHCcn$YKBi;<{OKWCa;)XBa1=or#x{C0(7RtRI5*?3~uy5VZ zTfq#ek5CMAcASc&Z37?QmA*ia5SVIvs!8=nU-p!;B&AUkG2c>SQ>%f#KsN1-&)Y?A zfUPbUHPW({#iNE=i9+xU%|fQ2Fr)_H^A%}o_eF_RzTF~49qRAY>!qUBw6GJ&ea&6~ zy=>P$CArZZi}Q{|SLUc30#RQI5I@`s+{KTM4kfnl=5yUulmlak3-ikgRwBrq5*pe@ zp;0HA@NY=vR9!46>4{@)gOXOV5(N!q5VLgf_hVL{mtvw^QZgsInRhDJYZ+%e&h z_#T87HOK@j;_F~F`*x_UptgVETc@b44Gg5HY0=0}SGveu6$j%LU+8UuvqWC+)HJyy z0^>5Q{;1xka|$X_tl~6|{%m7+4AI}>AVYVR&xpQk>1F>)u!3nSgtwetKAQr5c(W6M zuv@`h=gqYYBZ4I<6bjZ3T~}qwKd}94$CWN5II!kVli9@33f+Oyd65MV2a7(J7;B^kqdE45aCVltuF}$IuN(3i zLv_gcO2=ca^xR4bQs7`u{G|fM5n^0usZ*jEpt+~SP$%ejEL~2|F*PIP9SPw=h z_FZICHJ*VnWR}xwQ^g7Gu3MkdNuDN9-L@1~_v*3=~^PV%0$b>P` zXxDI9QXtu067Dib0JjO}T)-NgerFlsgX8HBk(3+XXN)go;Fss34$Sb)#F|ZO1}sce z1HtoO7MFt=u-RG9Y8$TB1J|OTf9CVR0A;^84^E`ezm%^wEMCga%sn<5r+!P2^lv?@ zPqPv~I<1F927Ye4<0$TdChZiZg4fauYND@c$;WhcwPmgM+ICj^8JjEnW$XNaef77U z?PIMVvk+Ey6p!_qS|1ymO|Qe6$b>yIT|o-s<`|Xk7}(CDliw0D5MO}(hpNaK-RRdnsl}^gprKC7eQp`0D7r6m-{c zy({=UPT{gU|P&0i$+xr>8yajL{LyYdfw> z!8h$_y7@_nHymaaNvw+-I%lKa1i;I-br%lfk%xy0oUDHc?18^QH&X3KY~W}3^cEZp z7sWk2*#4Rio5Po(X=8lFa|>($rIpIeno;W`6AE+-GU=g} zBz@VE9-owJI@^LDUX3BdyR33ul)H?x*e2l0B=5ei=b!gjE?YUs<#!$5J(>amR%p{8vz% zyK*fyqpG0o;UZi4%nLwerMlIAD5Ha`VRd3OPMteie)!&JP3yjjtmE_jg38MHb~R$ZrxXW|;f98bRn%CWtbvW|F?i)}c@=I(&soKD zkB2;pD4Ybrnc=yvA1s(vK*?uph5&`YmrRgnsRo;QNQGUj8&}lhM0wHq9`6braGNkS z?*A$SHk>+k{a!66F11^@LJ`9qS3)D7KDx>-z58_V9eq9VoaZQh)5utUL2ZYfMtg&| z=$UcS26qzGaU{A!dGcsaoJ)ngm}+@e_|`Zr%9xVj>xz_$?b{t|R@)KXb>g6Vy%_HY z!GH4P7L4lTZ$CXahR-+%E&vkf4^QsD!X1D1=l&7;0Q!tVetLxey^o5x(NEX^GYS{@ z?w{gC!|5d_uxBMfxVrEL5XR>%p@tg2-R`Uw6aPJ>*K+AlmEt(*%ep$*9p)M_=TV$9 zFVHeS$ zf5ZUPHN$b(P`&AHc|+V?xtQ)Bk#}f1SfyR!W7Z39UaY%Tb&V1UFaRs?Y$xkah6aey zE&leTY9;IxQcM`C`DV<3T07kH9MP><%0}kA>8yE&xzh&(6;Zg7e)BDh$gV(%!^3jl zkm!T-M|l*bey{`!BMGxhUAE$8tHLHE zl02biT>4R`7A-~!jVH}u&u(N6=RK#iZN4r@$WnhK=SkgkF_0Zc;@9z}gc_YG2Tt!O z8oyf6@N=70%aq5s_kpT+8_O*=xHfR(=<#(HnU-C7y)70B4_KE=uSEhU9B>`~+Q9|* zj&J9`-fH|7b3(R^YQk+AI5SnEHuw$_v>JL^E0b37_b-8!>4a6uvO~($!ne6c$dGd~ zVZda3)e6nvi;)*)!S?k6+$FX~fGDLXlGm#)+!-&^syd&{!h1&{lM3ppC^wH4JOSB^ z8DUwD(iYAVF~yzl-|vqO>qI*!ZW`Ddr8eSgCZoc33 zC%bUb?PkAS9@lMpf3V&4}-K^fc-M3#xP3- z;MFC^vxj}5mAs9Q=>?_2g85EO1ma}~235ZWK2*P}O*R_{_xOID#s5feAn8Zb^;_Nw zfIvYCGE!mLQguM3sh=8fKfGKU(F(U6YJI+)lsq`9G{D_hiYqdd;~p-S+>1ym24E4v zfK>0}=~?>u8z_r$nlGxV6)Sy$KJdvcaMp8tUhuyF-!dl9Ux+Sq1ub|3l2DKm&vkPLOEy@f+&~s0{-r2dI z?)=XhBy}UoMs?#XW&>pwg9bJMr-abvVA=ljDm&`kBtaDtH<5u@jJt+p zZ22@g!%q4I3S)Cc4e?FGhK62=8n(hykYDEy?%lK*dVT0>2Mfm9Rb%++Xoyi>U{((x zkqv(k=|z13e`$33Dh&I8L1e*cUQ?)SV`H<@19w-MM?Ns@NR{(Zbsc-xKWmGx`SVcm zY{{=$@ldHS?(C!p`j_@BEM@xkug(IjZes_SVZ}b+_7XE%sc!YTzSWAM zsN0rnYx&vC{Z=WdR$l~7I310lkT!`kQxmIS%$y|5Rdu1IZb3)KE9{61d>?5iLLdjs z2f9s-VGi5H6LKqhkL)dOaJ$N?V$Etb+N$fQ+H(`g;*dyeo5WEB>XRTupPf72JO8pP z<(Ex0NET_mA@i(^8&a|P61ogr>@dj!TxtS6o5hsOo3dw^A>pNEOeNM+a402gbEG;< zO}T3T>YmVjE7$mr3DXDYu8Fv-#Sv4myLoNY*wvJ%`rYA`G^=Y&p8td^Ei5cMonV-X z{QJoAhq6B{>2a@_3v|?Q+q3UU3iNqTnC3CnX4pV$wn4Z+oCrOy z{Kznu|L;S%%oz^m*)!$TGf#%WT!$IU?3PIk1(3o5b$j0u#n~+0jWt4D1LZw0f90f> zUyjAnU2b})qy#PSOz!g5?>w$k~B zcm4vxk03U9E?4GD0*U+(iu)f{=b^7PvMNuS9_yVq2Ru5-E6GL7Rf^gWiw!@$9WS2d zZO?&{lH9kV+lw|If*ZDnEa%hSOO$`tJ5FfZXmh{TI;Eoumc zTDH99ED$gUHK>?!U6OA!eRCjXmkoNjxs6>lu%2r7`80C>ZU$Q;t#9E$0RRlxKZU^^ zjh&p#ZA=~iW&-O~R&91!5xwZ9eCThrL@`FAh%4$scAJKY&e6-MpQS<*qysSg3U=JW z-#vm8%u1zyhPcLJq^B`CnlN+Wp7aiyPKd|~nN#%>g-qt@2?l&ts2tL3c7u-L zFCK_(f-e_~qwYV8^5cO^mSe;r7?lc6-aS%6uQjQMVz`YU$w$l)&QPJ7@|QvR8 z)0a@-3%u=nc5X{-%F+n7@MecFYnO3!ac{zhEXj6{9BMqo>eU)9rn=zOw>Ob0(Dykr zLBHw9h3t|o4=_Z1CLi!fE?kRRvLWWFB3Fq67lU!L_FM^ZXL5S8-SxQXG9qYXxv3TK zJ;B7aR-R4dfbA^r;0yweYkSJb*w=fxi=0K?U-2>*+j1S34So%}p0P8y z+WE>H>f^JG3gP!+9NAj^@ryCxZorq8Lk`^9vUECv&s^Iz_*u)Gud@59Ds#jeB3i^c z6k#Gb9;3kv`Et>cR9rfUn94>Wd(zw8S508dq7jw3?f7<;+gL>{I#cl>Wi-*kxvbZ=ygr1n+582O*H%&XT#VgTi@W`^6}mGoGhTPk9vh z@wVUjA19B%gPN6<&&{*rCIA58A8TkwCwD93Kc7{<@dYV(Dpz)E{6!shSN`_Kr8R>#?4xIycAW{6R4LZoY2y0wHE!K@fli=NBc3R0 zP+W`aoB$se_rg5I3KCkAdMBnsf7?WOR$%4AlzBUZdOZ1xgKio>D2MwewT3Bmf%^o) z_e6>|w9r{-6Gjlea`rK`3G{x1S3Rd_5>&Qq+#%WW0&w;oAY5SI-5BqF{S1o;FB|in zJ|y_Vrk+q@kikkOaK3teeq5PR?-1UewXTtTB8{CbiKzQP8p@%#y-~6qqHe+oSRph$ zD3}XrsW8VCd9cSOWZbGA)r)h5q*(W!ST`kbuAjhA)bEo|NIrX_&&oC!Hq!*?p^shd z_bACWeLZYW{H)@!;nmB|MV%+M*OR1@;aqNR!q@asw(dUe`Q<6UeC{~Lcey))Q4RuYp17iR+P$WCxW^SbUSPJPc$LzxZRdm3oA(8r%4%o`kF}d4i|BYZ zQ!<%Y$K4f8JxDz4Z#1xFOseA^qfpZ$4jHbbiAtQ3g)v?qj~5LDk*{{W&M@y5?H0ek zq@V7m1=O6Ps-q^DHVj|+9q}WFlYDn`(C~h$N79Ou?sg(Z=6kEudjK-zTE3cd&1aqUs3C4=EZzm^H?o-b`kG%a|?M zc>i4pYt&9?l%C*|m4!P>A#r_lwir_-ys_`0NW}g|0gHU~TUv{acKk!}hcvUb^}VD- z-xi8gBcaiQJYsMU5nEIvo@1T!>9uM}P*JOeSz~!nl*|0oRTVbJva+DUN~@Vz50|l^ zJ@%Eo&^taWh}?@EwS~~GT3^sWjTuy=VJtNEU>+!Q6zSA7dlI|Iu38VOpW@Fc^gxlH zvNY2hk*mBSP22YfP&Q3(#J)H-K0U6zXOkD7SI28`zj!@bx9`a!g~TSLggyfr)6|H9o>GFAk!Gi^o~yZJ&a6gKC@Mc3EEpck%( zv>8e|c@#B!3nRUh22K*jo+a89aV?zMPfW{@ISn>$hOLn8cZQ%;0qXBmm*x>N&eD7n zBs@D*_aY%gC!Py!*5R!K9ovWhHZS0LmJ+qi)5JY36p)j1I}McZ`V2XD(_tDK%#qbw>)iM4A<^bwe^gWVL3r25k+p9IB+zYKS3S0(;5hnmaL=WJ&l`KSo}R`Z}PmIMp88151!}qw{z-BbNHw-z8JtnOlONn{Xp$ zUPGFL%Pjn`A0o?#Yb=Ol!BB#0S8-hw-K`Y%t(Ma0OxrPI1#N_GK?wI298Ai!xjF2p3s zuy*zO?mDuC#<2CJ7RP_&^IKbuk=CfoEjpnbJ*Oh+fkgV{)0 zZQ%mqs~$@i+>*99kHhjtni`g?_{ykP1uhG|hG8o}=>+GpYXX+Bim+PW=&zQ}7^rIV z_4bTJu`0fD3MF^(Ow-k1E;EQLHmYiK16_Ay2qBDQblRkpt+L6m!*!Pa#>-|e{066C zU(_qB#Znuw9_>FHiDzK9Q6mT|!{-vhpE_g_B_ZyT(A_)?&%iFeOgOa+9R=BB1$Jh? z48787^fhEBu2N(S_!QB<7{mp#UXBn{qlDN5+66K}&aXLMju2X-1QD1<3DOUpD#Ld> z0Q$^cAn=}6Adup3r&+GR-ywg0rfhd@!{VGoDdYz_RRA9hnm|q<)nPdC{|ol{EEHM_ zClp#wR!`%S`68 z(Z-0ifON26VtcJ2TqnkMM$Jpk-9T@tmMttu=%M(SQRs?;#(H03vQUnU=extSS8#MF z&vmZkvks*D1gh_3kj9U+k&3T-(vm-i(w&7{%%My&>#9yIHGb&ZEhIV>(vF5JJlcUq zw9Z86bJe7{8e8__wlsYe#R;0~)lOZe4}d=(84TK%{}Ol8csI?*i>JDn3>;8p$BZa* z3aPCR)l4VgjyLPaFWHto|ULw9(GrgB3y1pX^@R*CBPvcyCQD-vlHrhM4(UfpE+tP zjZ#VYTcGv^eX~I}!Q25i>P1Y>fD;U!kJCSE(qKfyRh5h>k8@nJ7$zByqEF!F?2*Gh zgQVc8q<1qolQRoC`+TWg6-jHQR+#xH-$LwP8np_Mnl#S;n=F?OR3@#I{6Sn~CtN;1 z)xNw?<{yNVbG~U_0q0MI7{ci5=U9BUl7Eu8Op7&((#U@y!xAhu4RhlEAV$TXy;|h| z3;D}Scx{o@1dH;mSBR;AzGq>$3SS%Z5AbGOI;Kr$km{PDK61}Jl;p4Zn6^p2^Y%#DpysTQT`WT{)R$F6YvNJm0=m2# ztu%sh?1?_X*oK>Oqr#6^s?`P+YHqhWFc4>K_ys?I&887?SHsmda?MWVN`TvzD!rL; zg3|E3Z~Tx@#e03f&w3SjBk1^4>0Q07#ibbGp;xA3Rix|aQl>mQr# zw}6q;)Zmd)EK1{*;8>#lU0wx)yAPQ8)*5>zMxv++q}=M&YZN5WO;MpC^eo+TOe~NS zw4c@tIjS|@23wD*gI!}0K$ehV>Zi<CKtxlV& zpPf+y%6|N0nF1HpV30+ff9-8m0$uSBi7P9fR?W+Ke1}ORAqaV9(OmUOn+lA9+2s5d zGr7CA!mC=H^mE1Z8aDt*)gJFg9`eL<~+&OBJ%FG3lpC0o9lSw zW4Fr(cEjA6&PirBNy*Kmih$+3A5G*jXM0w z^b>iwnr#J1AhjdhIPDKtf|t`gsfeJomgZwW2~*~+mmQf$uZq7{?GPpX+1 ziJhbFB*%*mJ}Ajcp$J?ZLOii2@iwm1;i{%`*-fN`AYMjZ56U67R28t1nfRj{LntjL z>94~}dmqI~pDwC%(D~AK4q;qYh?h&t#aQfJ{x@s*3Lgem$)w_EIdz9BjrolccLj0U zp_S;1E|S`|_u9ey`9R`CwteR>JAPDMkta#~&E?}h2P|Z)zwTt%tc()~l@m0s4umNV zsl(04E>dw4q{d-Y7fo+TDP$T2<1JgA-LJyp?K2!?XUpvmI-653*w(8%4D#iL(qXCH zB!iy55HI>tU#zVg^NsLtp?I*9n>(TSlt+omwvFi8&9C|}kd;L5cb=Wju_q zcEDl8ZfV*hqI4jUPdBsrqmnmbm52Q1Yo7?nox32Kxc(8`IA`&@WD)a__7gZUChq2w zTqBL0SjZYlqNxH6d6#4tfe63X>7vU%;6+y;ney<1L&%x#Kn1b5W+GQpzW7&o6Mgeg zvN8u8io_i15+?rRqHxBL(dyA);hLJWV9yC*K0KS`C20ZEz19weLAnEH{LFPH`tegW+t+ZDWCTIJvPKSE;bbFmD_v6-7gwmrgu)!~~F1KOEn_2RrnI^m_ zPnd}qCs=rwgfRDqijBN3`a@%4i$x#FCL+bN;fuDDzXIFZ&lN~iZQ1#wk!UNMU~y>A z*ds`2$JxhY)^G$SVlUyIGD+d^OIz{`hzMF19h~a&Z~T9?_XnPR%h?6o?-^d#v5)Zj zq*Os9UMs%O`uxXKYO;-IG272OG@o%~)PID5T#XGB{}e{fm|eH&`GN}AdLi`)4)Q4E zV39OrQAbZ+r7F-8baZosl9A`@OCZzZ z1~R8-czRWVN0ZoRVRfTo4-Et1s?(ld95}1=K7+I&__UBv3>c>!7Tk7>P&l`Nj;|qp zrPr_JD^EqxZp}?wKC8mEn?t*Cph>)i;E0stW$bi;?`mmNeC3;yl z`*Fx1Oi_$&E7|R8{V^8Tky}ujX*G_2Ql)ZCPo-2(zXUeBckfeQ5tROB#Gtjgzd-+& z(-opN-HNukYX}vyZ0ZGg>d0Ox%8JcR9f4pSuIqlf{}`uxwZk5O|EvQchX4SO|M7!N zK1;Ne^bM?x{}$!ni(j?Lr$-FBP(9)NvdM<&V1~_7x?)XyHm*~#0$~>crQ*ehPflXn zX(*^rX4ar@i$Q|=QQ_Mr&T4UhnjNR=6RMI z2jaxrLFQs75srw~AcIi_hbx088JJx|uaf|QJQlfOY?=B4Yz;$gdCpNVzIS3?&`Nk}MJI$4IL{ib+1VsRk+LvrLyI zz4#U#pJ}`cHIV|yGR8mr6NT*DcHRS2VfzVBx|3)rlp&`YlwBR-P4rVlrWDU;gqlqT zehksqnJ#gq-AYkrPU)$7$ckZm@1Z$<0nU4iZFCr}R~n3mjs!JBX}E3UIQBR&GRu|r zTQe|#z-b|IyQUaFbS+S><%?QuQoODc;52?!-i}4#t$T4U_H@|cm~H%kA=oHDk9kFv zz83 zO{@fs^F1162JGkaXFbHfZxK-n5JtNEP4Pb^2{W2jYo}nlkV3o=y#^SHA1NfWCfER*caI_9om*z6C?UAdX29`a z;&EmFR*+?ylRZI1l|)hj+3!F^ti57reTRsL80mWvm}NvzSOw5zFVl+ayi^H)Wb(ncU!3MD^sc+F%r)Wfx6Sy_659hQF^P5;^_}&OT76A zs>#3JlnqNctJz~(P}?qN*yNI(FoGaAvvmAkw%w!z#_;HEwYVXSXw&>0!7C$dQsS0+ zHFN5`!G15W`|+F5!K488WHboyoiJ-%5s!j$JE7@bsrknie8ygeoh#utj2{$F+Q5)Z*7K&ZleAI z_E&t-ZG8WA7mKLz#k$WZu0R@;cL5aiD* z6(vRmHHqh-?YHDhfhLmL#i?;uYqf`{1%&OxyEQYqn3?S(b5vN;0+g>&#>W1qq)!Vdl3+JcGy*MlAbQ$N+dL&dNR?$=o_8GGFZ%{_`xbCw zV|bm?P0F+^{NN}OXrUBO(4XSAa{Vex7_L$DynyIvONLB~Vf4J9ev5z-qL2(uI}kW=TmmmO1#()8cU3e27Lp5SLO38d zg*sAPp@Y#~6fm_Ef!mPDh<*|P7OL)gkcn}qh&&@@JDk#KBdQZOw}uYGWyU99#IYwZ z!fSS?L0tm!h+zB!fP6K-C+cx{uQaffU2yt9Xa`NcJ?z!(*XP_|t+0u71pD;zeh&P9 zi7XH>&F50-KbKGc@qhkA{zEafoaDbU_}603KVbp!pGlE_D*gN`@UK;H|Af|k=EnV7 zP26AM|C)OFPcQ)B^|=uIza?V+mD69-@&3uH4*ma-ocC82e@$ihCyN%mzq9yrV#8nI zf8EUdC;Wl%AMpRZwfR@@UzfoD1iv%=9sHLC@n1Rob&d8<4%sY!=kPBpw!h;4)k^;p w4FIgO0RaDIv;PYJ*Lm~L@Gy=)!T&ys%1MHKipSpyi(vrWpEE5t_usDm7pd?I_5c6? literal 0 HcmV?d00001 diff --git a/tests/data/groundtruth/docling_v2/word_tables.docx.html b/tests/data/groundtruth/docling_v2/word_tables.docx.html new file mode 100644 index 000000000..30f6e8d35 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/word_tables.docx.html @@ -0,0 +1,75 @@ + + + + + + + Powered by Docling + + + +

Test with tables

+

A uniform table

+
Header 0.0Header 0.1Header 0.2
Cell 1.0Cell 1.1Cell 1.2
Cell 2.0Cell 2.1Cell 2.2
+

+

A non-uniform table with horizontal spans

+
Header 0.0Header 0.1Header 0.2
Cell 1.0Merged Cell 1.1 1.2
Cell 2.0Merged Cell 2.1 2.2
+

+

A non-uniform table with horizontal spans in inner columns

+
Header 0.0Header 0.1Header 0.2Header 0.3
Cell 1.0Merged Cell 1.1 1.2Cell 1.3
Cell 2.0Merged Cell 2.1 2.2Cell 2.3
+

+

A non-uniform table with vertical spans

+
Header 0.0Header 0.1Header 0.2
Cell 1.0Merged Cell 1.1 2.1Cell 1.2
Cell 2.0Cell 2.2
Cell 3.0Merged Cell 3.1 4.1Cell 3.2
Cell 4.0Cell 4.2
+

+

A non-uniform table with all kinds of spans and empty cells

+
Header 0.0Header 0.1Header 0.2
Cell 1.0Merged Cell 1.1 2.1Cell 1.2
Cell 2.0Cell 2.2
Cell 3.0Merged Cell 3.1 4.1Cell 3.2
Cell 4.0Cell 4.2Merged Cell 4.4 5.4
Cell 8.4
+

+

+ \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/word_tables.docx.itxt b/tests/data/groundtruth/docling_v2/word_tables.docx.itxt new file mode 100644 index 000000000..dd42eb0a2 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/word_tables.docx.itxt @@ -0,0 +1,19 @@ +item-0 at level 0: unspecified: group _root_ + item-1 at level 1: section: group header-0 + item-2 at level 2: section_header: Test with tables + item-3 at level 3: paragraph: A uniform table + item-4 at level 3: table with [3x3] + item-5 at level 3: paragraph: + item-6 at level 3: paragraph: A non-uniform table with horizontal spans + item-7 at level 3: table with [3x3] + item-8 at level 3: paragraph: + item-9 at level 3: paragraph: A non-uniform table with horizontal spans in inner columns + item-10 at level 3: table with [3x4] + item-11 at level 3: paragraph: + item-12 at level 3: paragraph: A non-uniform table with vertical spans + item-13 at level 3: table with [5x3] + item-14 at level 3: paragraph: + item-15 at level 3: paragraph: A non-uniform table with all kinds of spans and empty cells + item-16 at level 3: table with [9x5] + item-17 at level 3: paragraph: + item-18 at level 3: paragraph: \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/word_tables.docx.json b/tests/data/groundtruth/docling_v2/word_tables.docx.json new file mode 100644 index 000000000..957a83c80 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/word_tables.docx.json @@ -0,0 +1,2356 @@ +{ + "schema_name": "DoclingDocument", + "version": "1.0.0", + "name": "word_tables", + "origin": { + "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "binary_hash": 8379738677198259833, + "filename": "word_tables.docx" + }, + "furniture": { + "self_ref": "#/furniture", + "children": [], + "name": "_root_", + "label": "unspecified" + }, + "body": { + "self_ref": "#/body", + "children": [ + { + "$ref": "#/groups/0" + } + ], + "name": "_root_", + "label": "unspecified" + }, + "groups": [ + { + "self_ref": "#/groups/0", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/0" + } + ], + "name": "header-0", + "label": "section" + } + ], + "texts": [ + { + "self_ref": "#/texts/0", + "parent": { + "$ref": "#/groups/0" + }, + "children": [ + { + "$ref": "#/texts/1" + }, + { + "$ref": "#/tables/0" + }, + { + "$ref": "#/texts/2" + }, + { + "$ref": "#/texts/3" + }, + { + "$ref": "#/tables/1" + }, + { + "$ref": "#/texts/4" + }, + { + "$ref": "#/texts/5" + }, + { + "$ref": "#/tables/2" + }, + { + "$ref": "#/texts/6" + }, + { + "$ref": "#/texts/7" + }, + { + "$ref": "#/tables/3" + }, + { + "$ref": "#/texts/8" + }, + { + "$ref": "#/texts/9" + }, + { + "$ref": "#/tables/4" + }, + { + "$ref": "#/texts/10" + }, + { + "$ref": "#/texts/11" + } + ], + "label": "section_header", + "prov": [], + "orig": "Test with tables", + "text": "Test with tables", + "level": 1 + }, + { + "self_ref": "#/texts/1", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "A uniform table", + "text": "A uniform table" + }, + { + "self_ref": "#/texts/2", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/3", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "A non-uniform table with horizontal spans", + "text": "A non-uniform table with horizontal spans" + }, + { + "self_ref": "#/texts/4", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/5", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "A non-uniform table with horizontal spans in inner columns", + "text": "A non-uniform table with horizontal spans in inner columns" + }, + { + "self_ref": "#/texts/6", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/7", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "A non-uniform table with vertical spans", + "text": "A non-uniform table with vertical spans" + }, + { + "self_ref": "#/texts/8", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/9", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "A non-uniform table with all kinds of spans and empty cells", + "text": "A non-uniform table with all kinds of spans and empty cells" + }, + { + "self_ref": "#/texts/10", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/11", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + } + ], + "pictures": [], + "tables": [ + { + "self_ref": "#/tables/0", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "table", + "prov": [], + "captions": [], + "references": [], + "footnotes": [], + "data": { + "table_cells": [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Header 0.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Header 0.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Header 0.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 1.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Cell 1.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 1.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 2.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Cell 2.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 2.2", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + "num_rows": 3, + "num_cols": 3, + "grid": [ + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Header 0.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Header 0.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Header 0.2", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 1.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Cell 1.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 1.2", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 2.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Cell 2.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 2.2", + "column_header": false, + "row_header": false, + "row_section": false + } + ] + ] + } + }, + { + "self_ref": "#/tables/1", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "table", + "prov": [], + "captions": [], + "references": [], + "footnotes": [], + "data": { + "table_cells": [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Header 0.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Header 0.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Header 0.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 1.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 3, + "text": "Merged Cell 1.1 1.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 2.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 3, + "text": "Merged Cell 2.1 2.2", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + "num_rows": 3, + "num_cols": 3, + "grid": [ + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Header 0.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Header 0.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Header 0.2", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 1.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 3, + "text": "Merged Cell 1.1 1.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 3, + "text": "Merged Cell 1.1 1.2", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 2.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 3, + "text": "Merged Cell 2.1 2.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 3, + "text": "Merged Cell 2.1 2.2", + "column_header": false, + "row_header": false, + "row_section": false + } + ] + ] + } + }, + { + "self_ref": "#/tables/2", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "table", + "prov": [], + "captions": [], + "references": [], + "footnotes": [], + "data": { + "table_cells": [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Header 0.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Header 0.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Header 0.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "Header 0.3", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 1.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 3, + "text": "Merged Cell 1.1 1.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "Cell 1.3", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 2.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 3, + "text": "Merged Cell 2.1 2.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "Cell 2.3", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + "num_rows": 3, + "num_cols": 4, + "grid": [ + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Header 0.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Header 0.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Header 0.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "Header 0.3", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 1.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 3, + "text": "Merged Cell 1.1 1.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 3, + "text": "Merged Cell 1.1 1.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "Cell 1.3", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 2.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 3, + "text": "Merged Cell 2.1 2.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 3, + "text": "Merged Cell 2.1 2.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "Cell 2.3", + "column_header": false, + "row_header": false, + "row_section": false + } + ] + ] + } + }, + { + "self_ref": "#/tables/3", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "table", + "prov": [], + "captions": [], + "references": [], + "footnotes": [], + "data": { + "table_cells": [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Header 0.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Header 0.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Header 0.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 1.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Merged Cell 1.1 2.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 1.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 2.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 2.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 3.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 5, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Merged Cell 3.1 4.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 3.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 4.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 4.2", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + "num_rows": 5, + "num_cols": 3, + "grid": [ + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Header 0.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Header 0.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Header 0.2", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 1.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Merged Cell 1.1 2.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 1.2", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 2.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Merged Cell 1.1 2.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 2.2", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 3.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 5, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Merged Cell 3.1 4.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 3.2", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 4.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 5, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Merged Cell 3.1 4.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 4.2", + "column_header": false, + "row_header": false, + "row_section": false + } + ] + ] + } + }, + { + "self_ref": "#/tables/4", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "table", + "prov": [], + "captions": [], + "references": [], + "footnotes": [], + "data": { + "table_cells": [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Header 0.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Header 0.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Header 0.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 1.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Merged Cell 1.1 2.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 1.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 2.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 2.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 3.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 5, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Merged Cell 3.1 4.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 3.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 3, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 6, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 4.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 4.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 6, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "Merged Cell 4.4 5.4", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 5, + "start_row_offset_idx": 7, + "end_row_offset_idx": 8, + "start_col_offset_idx": 0, + "end_col_offset_idx": 5, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 8, + "end_row_offset_idx": 9, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 8, + "end_row_offset_idx": 9, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 8, + "end_row_offset_idx": 9, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 8, + "end_row_offset_idx": 9, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 8, + "end_row_offset_idx": 9, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "Cell 8.4", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + "num_rows": 9, + "num_cols": 5, + "grid": [ + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Header 0.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Header 0.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Header 0.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 1.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Merged Cell 1.1 2.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 1.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 2.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Merged Cell 1.1 2.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 2.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 3.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 5, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Merged Cell 3.1 4.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 3.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 3, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 6, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 4.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 5, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Merged Cell 3.1 4.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 4.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 3, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 6, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 6, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "Merged Cell 4.4 5.4", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 3, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 6, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 6, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "Merged Cell 4.4 5.4", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 5, + "start_row_offset_idx": 7, + "end_row_offset_idx": 8, + "start_col_offset_idx": 0, + "end_col_offset_idx": 5, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 5, + "start_row_offset_idx": 7, + "end_row_offset_idx": 8, + "start_col_offset_idx": 0, + "end_col_offset_idx": 5, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 5, + "start_row_offset_idx": 7, + "end_row_offset_idx": 8, + "start_col_offset_idx": 0, + "end_col_offset_idx": 5, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 5, + "start_row_offset_idx": 7, + "end_row_offset_idx": 8, + "start_col_offset_idx": 0, + "end_col_offset_idx": 5, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 5, + "start_row_offset_idx": 7, + "end_row_offset_idx": 8, + "start_col_offset_idx": 0, + "end_col_offset_idx": 5, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 8, + "end_row_offset_idx": 9, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 8, + "end_row_offset_idx": 9, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 8, + "end_row_offset_idx": 9, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 8, + "end_row_offset_idx": 9, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 8, + "end_row_offset_idx": 9, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "Cell 8.4", + "column_header": false, + "row_header": false, + "row_section": false + } + ] + ] + } + } + ], + "key_value_items": [], + "pages": {} +} \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/word_tables.docx.md b/tests/data/groundtruth/docling_v2/word_tables.docx.md new file mode 100644 index 000000000..90123c3e7 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/word_tables.docx.md @@ -0,0 +1,44 @@ +## Test with tables + +A uniform table + +| Header 0.0 | Header 0.1 | Header 0.2 | +|--------------|--------------|--------------| +| Cell 1.0 | Cell 1.1 | Cell 1.2 | +| Cell 2.0 | Cell 2.1 | Cell 2.2 | + +A non-uniform table with horizontal spans + +| Header 0.0 | Header 0.1 | Header 0.2 | +|--------------|---------------------|---------------------| +| Cell 1.0 | Merged Cell 1.1 1.2 | Merged Cell 1.1 1.2 | +| Cell 2.0 | Merged Cell 2.1 2.2 | Merged Cell 2.1 2.2 | + +A non-uniform table with horizontal spans in inner columns + +| Header 0.0 | Header 0.1 | Header 0.2 | Header 0.3 | +|--------------|---------------------|---------------------|--------------| +| Cell 1.0 | Merged Cell 1.1 1.2 | Merged Cell 1.1 1.2 | Cell 1.3 | +| Cell 2.0 | Merged Cell 2.1 2.2 | Merged Cell 2.1 2.2 | Cell 2.3 | + +A non-uniform table with vertical spans + +| Header 0.0 | Header 0.1 | Header 0.2 | +|--------------|---------------------|--------------| +| Cell 1.0 | Merged Cell 1.1 2.1 | Cell 1.2 | +| Cell 2.0 | Merged Cell 1.1 2.1 | Cell 2.2 | +| Cell 3.0 | Merged Cell 3.1 4.1 | Cell 3.2 | +| Cell 4.0 | Merged Cell 3.1 4.1 | Cell 4.2 | + +A non-uniform table with all kinds of spans and empty cells + +| Header 0.0 | Header 0.1 | Header 0.2 | | | +|--------------|---------------------|--------------|----|---------------------| +| Cell 1.0 | Merged Cell 1.1 2.1 | Cell 1.2 | | | +| Cell 2.0 | Merged Cell 1.1 2.1 | Cell 2.2 | | | +| Cell 3.0 | Merged Cell 3.1 4.1 | Cell 3.2 | | | +| Cell 4.0 | Merged Cell 3.1 4.1 | Cell 4.2 | | Merged Cell 4.4 5.4 | +| | | | | Merged Cell 4.4 5.4 | +| | | | | | +| | | | | | +| | | | | Cell 8.4 | \ No newline at end of file diff --git a/tests/test_backend_msword.py b/tests/test_backend_msword.py index 9edcb3e66..86bd837d9 100644 --- a/tests/test_backend_msword.py +++ b/tests/test_backend_msword.py @@ -69,7 +69,6 @@ def verify_export(pred_text: str, gtfile: str): with open(gtfile, "r") as fr: true_text = fr.read() - assert pred_text == true_text, "pred_itxt==true_itxt" return pred_text == true_text @@ -101,3 +100,7 @@ def test_e2e_docx_conversions(): pred_json: str = json.dumps(doc.export_to_dict(), indent=2) assert verify_export(pred_json, str(gt_path) + ".json"), "export to json" + + if docx_path.name == "word_tables.docx": + pred_html: str = doc.export_to_html() + assert verify_export(pred_html, str(gt_path) + ".html"), "export to html" diff --git a/word_tables.html b/word_tables.html new file mode 100644 index 000000000..30f6e8d35 --- /dev/null +++ b/word_tables.html @@ -0,0 +1,75 @@ + + + + + + + Powered by Docling + + + +

Test with tables

+

A uniform table

+
Header 0.0Header 0.1Header 0.2
Cell 1.0Cell 1.1Cell 1.2
Cell 2.0Cell 2.1Cell 2.2
+

+

A non-uniform table with horizontal spans

+
Header 0.0Header 0.1Header 0.2
Cell 1.0Merged Cell 1.1 1.2
Cell 2.0Merged Cell 2.1 2.2
+

+

A non-uniform table with horizontal spans in inner columns

+
Header 0.0Header 0.1Header 0.2Header 0.3
Cell 1.0Merged Cell 1.1 1.2Cell 1.3
Cell 2.0Merged Cell 2.1 2.2Cell 2.3
+

+

A non-uniform table with vertical spans

+
Header 0.0Header 0.1Header 0.2
Cell 1.0Merged Cell 1.1 2.1Cell 1.2
Cell 2.0Cell 2.2
Cell 3.0Merged Cell 3.1 4.1Cell 3.2
Cell 4.0Cell 4.2
+

+

A non-uniform table with all kinds of spans and empty cells

+
Header 0.0Header 0.1Header 0.2
Cell 1.0Merged Cell 1.1 2.1Cell 1.2
Cell 2.0Cell 2.2
Cell 3.0Merged Cell 3.1 4.1Cell 3.2
Cell 4.0Cell 4.2Merged Cell 4.4 5.4
Cell 8.4
+

+

+ \ No newline at end of file