Skip to content

Commit

Permalink
fix(markdown): add support for HTML content (#855)
Browse files Browse the repository at this point in the history
Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
  • Loading branch information
vagenas authored Feb 3, 2025
1 parent 6a76b49 commit 94751a7
Show file tree
Hide file tree
Showing 4 changed files with 142 additions and 5 deletions.
54 changes: 49 additions & 5 deletions docling/backend/md_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,16 @@
from marko import Markdown

from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.backend.html_backend import HTMLDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument

_log = logging.getLogger(__name__)

_MARKER_BODY = "DOCLING_DOC_MD_HTML_EXPORT"
_START_MARKER = f"#_#_{_MARKER_BODY}_START_#_#"
_STOP_MARKER = f"#_#_{_MARKER_BODY}_STOP_#_#"


class MarkdownDocumentBackend(DeclarativeDocumentBackend):
def shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
Expand Down Expand Up @@ -67,6 +72,7 @@ def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]
self.in_table = False
self.md_table_buffer: list[str] = []
self.inline_texts: list[str] = []
self._html_blocks: int = 0

try:
if isinstance(self.path_or_stream, BytesIO):
Expand Down Expand Up @@ -295,16 +301,18 @@ def traverse(node: marko.block.BlockElement):
self.md_table_buffer.append("")

elif isinstance(element, marko.block.HTMLBlock):
self._html_blocks += 1
self.process_inline_text(parent_element, doc)
self.close_table(doc)
_log.debug("HTML Block: {}".format(element))
if (
len(element.children) > 0
len(element.body) > 0
): # If Marko doesn't return any content for HTML block, skip it
snippet_text = str(element.children).strip()
doc.add_text(
label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
)
html_block = element.body.strip()

# wrap in markers to enable post-processing in convert()
text_to_add = f"{_START_MARKER}{html_block}{_STOP_MARKER}"
doc.add_code(parent=parent_element, text=text_to_add)
else:
if not isinstance(element, str):
self.close_table(doc)
Expand Down Expand Up @@ -360,6 +368,42 @@ def convert(self) -> DoclingDocument:
# Start iterating from the root of the AST
self.iterate_elements(parsed_ast, 0, doc, None)
self.process_inline_text(None, doc) # handle last hanging inline text

# if HTML blocks were detected, export to HTML and delegate to HTML backend
if self._html_blocks > 0:

# export to HTML
html_backend_cls = HTMLDocumentBackend
html_str = doc.export_to_html()

def _restore_original_html(txt, regex):
    """Remove every match of ``regex`` from ``txt`` and return the result.

    The number of removed matches must equal the number of HTML blocks
    counted in ``self._html_blocks``; any other count means the exported
    HTML does not line up with what was recorded, so the conversion is
    aborted.

    Raises:
        RuntimeError: if the substitution count differs from
            ``self._html_blocks``.
    """
    stripped, n_removed = re.subn(regex, "", txt)
    if n_removed == self._html_blocks:
        return stripped
    raise RuntimeError(
        "An internal error has occurred during Markdown conversion."
    )

# restore original HTML by removing previously added markers
for regex in [
rf"<pre>\s*<code>\s*{_START_MARKER}",
rf"{_STOP_MARKER}\s*</code>\s*</pre>",
]:
html_str = _restore_original_html(txt=html_str, regex=regex)
self._html_blocks = 0

# delegate to HTML backend
stream = BytesIO(bytes(html_str, encoding="utf-8"))
in_doc = InputDocument(
path_or_stream=stream,
format=InputFormat.HTML,
backend=html_backend_cls,
filename=self.file.name,
)
html_backend_obj = html_backend_cls(
in_doc=in_doc, path_or_stream=stream
)
doc = html_backend_obj.convert()
else:
raise RuntimeError(
f"Cannot convert md with {self.document_hash} because the backend failed to init."
Expand Down
25 changes: 25 additions & 0 deletions tests/data/groundtruth/docling_v2/mixed.md.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Title

Some text

## Famous ducks

Here is a table:

| Character | Name in German | Name in French | Name in Italian |
|----------------|------------------|------------------|-------------------|
| Scrooge McDuck | Dagobert Duck | Balthazar Picsou | Paperone |
| Huey | Tick | Riri | Qui |
| Dewey | Trick | Fifi | Quo |
| Louie | Track | Loulou | Qua |

And here is more HTML:

Some paragraph.

Now a div — almost there...

- foo
- bar

The end!
14 changes: 14 additions & 0 deletions tests/data/groundtruth/docling_v2/word_tables.docx.html
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,20 @@
table tr:nth-child(even) td{
background-color: LightGray;
}
math annotation {
display: none;
}
.formula-not-decoded {
background: repeating-linear-gradient(
45deg, /* Angle of the stripes */
LightGray, /* First color */
LightGray 10px, /* Length of the first color */
White 10px, /* Second color */
White 20px /* Length of the second color */
);
margin: 0;
text-align: center;
}
</style>
</head>
<h2>Test with tables</h2>
Expand Down
54 changes: 54 additions & 0 deletions tests/data/md/mixed.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# Title

Some text

## Famous ducks

Here is a table:

<table>
<tr>
<th>Character</th>
<th>Name in German</th>
<th>Name in French</th>
<th>Name in Italian</th>
</tr>
<tr>
<td>Scrooge McDuck</td>
<td>Dagobert Duck</td>
<td>Balthazar Picsou</td>
<td>Paperone</td>
</tr>
<tr>
<td>Huey</td>
<td>Tick</td>
<td>Riri</td>
<td>Qui</td>
</tr>
<tr>
<td>Dewey</td>
<td>Trick</td>
<td>Fifi</td>
<td>Quo</td>
</tr>
<tr>
<td>Louie</td>
<td>Track</td>
<td>Loulou</td>
<td>Qua</td>
</tr>
</table>

And here is more HTML:

<p>Some paragraph.</p>

<div>
<p>Now a div — almost there...</p>
<ul>
<li>foo</li>
<li>bar</li>
</ul>
</div>

The end!

0 comments on commit 94751a7

Please sign in to comment.