huggingface · mishig25 · Sep 21, 2023 · Sep 20, 2023 · Sep 20, 2023 · Sep 20, 2023
diff --git a/src/doc_builder/convert_md_to_mdx.py b/src/doc_builder/convert_md_to_mdx.py
@@ -22,7 +22,21 @@
 
 
 _re_doctest_flags = re.compile(r"^(>>>.*\S)(\s+)# doctest:\s+\+[A-Z_]+\s*$", flags=re.MULTILINE)
-_re_lt_html = re.compile(r"<(((!(DOCTYPE|--))|((\/\s*)?[a-z]+))[^>]*?)>", re.IGNORECASE)
+_re_lt_html = re.compile(
+ r"""# This regex is meant to detect any HTML tag or comment, but not standalone '<' characters.
+ <( # HTML tag with...
+ (
+ !DOCTYPE # ... !DOCTYPE
+ |
+ ((\/\s*)?[a-z]+) # ... or any regular opening/closing tag (i.e. an optional '/' followed by letters)
+ )
+ [^><]*? # ... followed by any characters other than '<' or '>', up to the closing '>'
+ )>
+ |
+ <(!--[^>]*?--)> # Or an HTML comment
+ """,
+ re.IGNORECASE | re.VERBOSE,
+)
 _re_lcub_svelte = re.compile(
  r"<(Question|Tip|Added|Changed|Deprecated|DocNotebookDropdown|CourseFloatingBanner|FrameworkSwitch|audio|PipelineIcon|PipelineTag)(((?!<(Question|Tip|Added|Changed|Deprecated|DocNotebookDropdown|CourseFloatingBanner|FrameworkSwitch|audio|PipelineIcon|PipelineTag)).)*)>|&amp;lcub;(#if|:else}|/if})",
  re.DOTALL,
@@ -80,11 +94,14 @@ def convert_special_chars(text):
  # source is a special tag, it can be standalone (html tag) or closing (doc tag)
 
  # Temporarily replace all valid HTML tags with LTHTML
- text = re.sub(_re_lt_html, r"LTHTML\1>", text)
+ # Substitute with '\1\5': group 1 captures a tag or DOCTYPE, group 5 captures a comment. Only one branch of the alternation matches, so the other group is empty.
+ text = re.sub(_re_lt_html, r"LTHTML\1\5>", text)
+
  # Encode remaining < symbols
  text = text.replace("<", "&amp;lt;")
  # Put back the HTML tags
  text = text.replace("LTHTML", "<")
+
  return text
 
 

diff --git a/tests/test_autodoc.py b/tests/test_autodoc.py
@@ -265,7 +265,7 @@ def test_document_object(self):
  model_output_doc = """
 <docstring><name>class transformers.utils.ModelOutput</name><anchor>transformers.utils.ModelOutput</anchor><source>"""
  model_output_doc += f"{self.test_source_link}"
- model_output_doc += """</source><parameters>""</parameters></docstring>
+ model_output_doc += """</source><parameters>[{"name": "*args", "val": ""}, {"name": "**kwargs", "val": ""}]</parameters></docstring>
 
 Base class for all model outputs as dataclass. Has a `__getitem__` that allows indexing by integer or slice (like a
 tuple) or strings (like a dictionary) that will ignore the `None` attributes. Otherwise behaves like a regular

diff --git a/tests/test_convert_md_to_mdx.py b/tests/test_convert_md_to_mdx.py
@@ -132,6 +132,13 @@ def test_convert_special_chars(self):
  "something &amp;lt;5MB something else -> here",
  )
 
+ # Regression test for https://github.com/huggingface/doc-builder/pull/398
+ # Each '<' in '10K<n<100K' must be escaped on its own, not matched as the start of a tag that runs up to the '>' of the next HTML tag.
+ self.assertEqual(
+ convert_special_chars("""10K<n<100K\n<Tip>\nThis is a tip.\n</Tip>"""),
+ "10K&amp;lt;n&amp;lt;100K\n<Tip>\nThis is a tip.\n</Tip>",
+ )
+
  def test_convert_img_links(self):
  page_info = {"package_name": "transformers", "version": "v4.10.0", "language": "fr"}