From b6be62acb933b324c5257dfc9a469338b40a98bb Mon Sep 17 00:00:00 2001
From: Wauplin <lucainp@gmail.com>
Date: Wed, 20 Sep 2023 15:40:55 +0200
Subject: [PATCH 1/5] Fix regex to catch HTML tags

---
 src/doc_builder/convert_md_to_mdx.py | 21 +++++++++++++++++++--
 tests/test_convert_md_to_mdx.py      |  7 +++++++
 2 files changed, 26 insertions(+), 2 deletions(-)
diff --git a/src/doc_builder/convert_md_to_mdx.py b/src/doc_builder/convert_md_to_mdx.py
index 357b3e92..72c5e5a6 100644
--- a/src/doc_builder/convert_md_to_mdx.py
+++ b/src/doc_builder/convert_md_to_mdx.py
@@ -22,7 +22,21 @@
 
 
 _re_doctest_flags = re.compile(r"^(>>>.*\S)(\s+)# doctest:\s+\+[A-Z_]+\s*$", flags=re.MULTILINE)
-_re_lt_html = re.compile(r"<(((!(DOCTYPE|--))|((\/\s*)?[a-z]+))[^>]*?)>", re.IGNORECASE)
+_re_lt_html = re.compile(
+    r"""
+    <(                   # HTML tag with...
+    (
+        !DOCTYPE         # ... !DOCTYPE
+    |
+        ((\/\s*)?[a-z]+) # ... or any regular tag (i.e. starts with [a-z])
+    )
+    [^><]*?              # ... followed by anything until next closing ">"
+    )>
+    |
+    <(!--[^>]*?--)>      # Or an HTML comment
+    """,
+    re.IGNORECASE | re.VERBOSE,
+)
 _re_lcub_svelte = re.compile(
     r"<(Question|Tip|Added|Changed|Deprecated|DocNotebookDropdown|CourseFloatingBanner|FrameworkSwitch|audio|PipelineIcon|PipelineTag)(((?!<(Question|Tip|Added|Changed|Deprecated|DocNotebookDropdown|CourseFloatingBanner|FrameworkSwitch|audio|PipelineIcon|PipelineTag)).)*)>|&amp;lcub;(#if|:else}|/if})",
     re.DOTALL,
@@ -80,11 +94,14 @@ def convert_special_chars(text):
     # source is a special tag, it can be standalone (html tag) or closing (doc tag)
 
     # Temporarily replace all valid HTML tags with LTHTML
-    text = re.sub(_re_lt_html, r"LTHTML\1>", text)
+    # The replacement is '\1\5' because the regex has two alternative capture groups (1: tag or DOCTYPE, 5: comment); only the one that matched is non-empty.
+    text = re.sub(_re_lt_html, r"LTHTML\1\5>", text)
+
     # Encode remaining < symbols
     text = text.replace("<", "&amp;lt;")
     # Put back the HTML tags
     text = text.replace("LTHTML", "<")
+
     return text
 
 
diff --git a/tests/test_convert_md_to_mdx.py b/tests/test_convert_md_to_mdx.py
index 87ac33a8..73de984c 100644
--- a/tests/test_convert_md_to_mdx.py
+++ b/tests/test_convert_md_to_mdx.py
@@ -132,6 +132,13 @@ def test_convert_special_chars(self):
             "something &amp;lt;5MB something else -> here",
         )
 
+        # Regression test for
+        # '10K<n<100K' must be caught correctly and not grouped with the next HTML tag.
+        self.assertEqual(
+            convert_special_chars("""10K<n<100K\n<Tip>\nThis is a tip.\n</Tip>"""),
+            "10K&amp;lt;n&amp;lt;100K\n<Tip>\nThis is a tip.\n</Tip>",
+        )
+
     def test_convert_img_links(self):
         page_info = {"package_name": "transformers", "version": "v4.10.0", "language": "fr"}
 

From b7be025b979e11f231813218627176ebca6e51b8 Mon Sep 17 00:00:00 2001
From: Wauplin <lucainp@gmail.com>
Date: Wed, 20 Sep 2023 15:49:13 +0200
Subject: [PATCH 2/5] Add explanatory comments to the HTML tag regex and its regression test

---
 src/doc_builder/convert_md_to_mdx.py | 2 +-
 tests/test_convert_md_to_mdx.py      | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/doc_builder/convert_md_to_mdx.py b/src/doc_builder/convert_md_to_mdx.py
index 72c5e5a6..bd6c4c45 100644
--- a/src/doc_builder/convert_md_to_mdx.py
+++ b/src/doc_builder/convert_md_to_mdx.py
@@ -23,7 +23,7 @@
 
 _re_doctest_flags = re.compile(r"^(>>>.*\S)(\s+)# doctest:\s+\+[A-Z_]+\s*$", flags=re.MULTILINE)
 _re_lt_html = re.compile(
-    r"""
+    r"""# This regex is meant to detect any HTML tag or comment, but not standalone '<' characters.
     <(                   # HTML tag with...
     (
         !DOCTYPE         # ... !DOCTYPE
diff --git a/tests/test_convert_md_to_mdx.py b/tests/test_convert_md_to_mdx.py
index 73de984c..1405bd05 100644
--- a/tests/test_convert_md_to_mdx.py
+++ b/tests/test_convert_md_to_mdx.py
@@ -132,7 +132,7 @@ def test_convert_special_chars(self):
             "something &amp;lt;5MB something else -> here",
         )
 
-        # Regression test for
+        # Regression test for https://github.com/huggingface/doc-builder/pull/398
         # '10K<n<100K' must be caught correctly and not grouped with the next HTML tag.
         self.assertEqual(
             convert_special_chars("""10K<n<100K\n<Tip>\nThis is a tip.\n</Tip>"""),

From ec14a49dc9916ec8f0e26148783bc22d72908bc2 Mon Sep 17 00:00:00 2001
From: Wauplin <lucainp@gmail.com>
Date: Wed, 20 Sep 2023 16:09:32 +0200
Subject: [PATCH 3/5] fix unrelated test

---
 tests/test_autodoc.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_autodoc.py b/tests/test_autodoc.py
index eba2353c..b2db46ee 100644
--- a/tests/test_autodoc.py
+++ b/tests/test_autodoc.py
@@ -265,7 +265,7 @@ def test_document_object(self):
         model_output_doc = """
 <docstring><name>class transformers.utils.ModelOutput</name><anchor>transformers.utils.ModelOutput</anchor><source>"""
         model_output_doc += f"{self.test_source_link}"
-        model_output_doc += """</source><parameters>""</parameters></docstring>
+        model_output_doc += """</source><parameters>[{"name": "*args", "val": ""}, {"name": "**kwargs", "val": ""}]</parameters></docstring>
 
 Base class for all model outputs as dataclass. Has a `__getitem__` that allows indexing by integer or slice (like a
 tuple) or strings (like a dictionary) that will ignore the `None` attributes. Otherwise behaves like a regular

From 8c6163b26990993adcfea537f6ec986185c55565 Mon Sep 17 00:00:00 2001
From: Mishig <mishig.davaadorj@coloradocollege.edu>
Date: Wed, 20 Sep 2023 16:31:44 +0200
Subject: [PATCH 4/5] temporary doc test

---
 .github/workflows/build_pr_documentation.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/build_pr_documentation.yml b/.github/workflows/build_pr_documentation.yml
index 019332a2..f29dea90 100644
--- a/.github/workflows/build_pr_documentation.yml
+++ b/.github/workflows/build_pr_documentation.yml
@@ -52,6 +52,7 @@ jobs:
         with:
           repository: 'huggingface/doc-builder'
           path: doc-builder
+          ref: fix-lt-html-regex
 
       - uses: actions/checkout@v2
         with:
@@ -102,7 +103,7 @@ jobs:
         run: |
           pip uninstall -y doc-builder
           cd doc-builder
-          git pull origin main
+          git pull origin fix-lt-html-regex
           pip install .
           cd ..
 

From ea5a296cd448f9cc5dfef83bfde439377a05330b Mon Sep 17 00:00:00 2001
From: Wauplin <lucainp@gmail.com>
Date: Wed, 20 Sep 2023 17:02:02 +0200
Subject: [PATCH 5/5] Revert "temporary doc test"

This reverts commit 8c6163b26990993adcfea537f6ec986185c55565.
---
 .github/workflows/build_pr_documentation.yml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/.github/workflows/build_pr_documentation.yml b/.github/workflows/build_pr_documentation.yml
index f29dea90..019332a2 100644
--- a/.github/workflows/build_pr_documentation.yml
+++ b/.github/workflows/build_pr_documentation.yml
@@ -52,7 +52,6 @@ jobs:
         with:
           repository: 'huggingface/doc-builder'
           path: doc-builder
-          ref: fix-lt-html-regex
 
       - uses: actions/checkout@v2
         with:
@@ -103,7 +102,7 @@ jobs:
         run: |
           pip uninstall -y doc-builder
           cd doc-builder
-          git pull origin fix-lt-html-regex
+          git pull origin main
           pip install .
           cd ..