From b6be62acb933b324c5257dfc9a469338b40a98bb Mon Sep 17 00:00:00 2001 From: Wauplin Date: Wed, 20 Sep 2023 15:40:55 +0200 Subject: [PATCH 1/5] Fix regex to catch HTML tags --- src/doc_builder/convert_md_to_mdx.py | 21 +++++++++++++++++++-- tests/test_convert_md_to_mdx.py | 7 +++++++ 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/src/doc_builder/convert_md_to_mdx.py b/src/doc_builder/convert_md_to_mdx.py index 357b3e92..72c5e5a6 100644 --- a/src/doc_builder/convert_md_to_mdx.py +++ b/src/doc_builder/convert_md_to_mdx.py @@ -22,7 +22,21 @@ _re_doctest_flags = re.compile(r"^(>>>.*\S)(\s+)# doctest:\s+\+[A-Z_]+\s*$", flags=re.MULTILINE) -_re_lt_html = re.compile(r"<(((!(DOCTYPE|--))|((\/\s*)?[a-z]+))[^>]*?)>", re.IGNORECASE) +_re_lt_html = re.compile( + r""" + <( # HTML tag with... + ( + !DOCTYPE # ... !DOCTYPE + | + ((\/\s*)?[a-z]+) # ... or any regular tag (i.e. starts with [a-z] + ) + [^><]*? # ... followed by anything until next closing ">" + )> + | + <(!--[^>]*?--)> # Or an HTML comment + """, + re.IGNORECASE | re.VERBOSE, +) _re_lcub_svelte = re.compile( r"<(Question|Tip|Added|Changed|Deprecated|DocNotebookDropdown|CourseFloatingBanner|FrameworkSwitch|audio|PipelineIcon|PipelineTag)(((?!<(Question|Tip|Added|Changed|Deprecated|DocNotebookDropdown|CourseFloatingBanner|FrameworkSwitch|audio|PipelineIcon|PipelineTag)).)*)>|&lcub;(#if|:else}|/if})", re.DOTALL, @@ -80,11 +94,14 @@ def convert_special_chars(text): # source is a special tag, it can be standalone (html tag) or closing (doc tag) # Temporarily replace all valid HTML tags with LTHTML - text = re.sub(_re_lt_html, r"LTHTML\1>", text) + # Replace with '\1\5' => 2 possible groups to catch the tag but in practice only one is not empty. + text = re.sub(_re_lt_html, r"LTHTML\1\5>", text) + # Encode remaining < symbols text = text.replace("<", "&lt;") # Put back the HTML tags text = text.replace("LTHTML", "<") + return text diff --git a/tests/test_convert_md_to_mdx.py b/tests/test_convert_md_to_mdx.py index 87ac33a8..73de984c 100644 --- a/tests/test_convert_md_to_mdx.py +++ b/tests/test_convert_md_to_mdx.py @@ -132,6 +132,13 @@ def test_convert_special_chars(self): "something &lt;5MB something else -> here", ) + # Regression test for + # '10K\nThis is a tip.\n"""), + "10K&lt;n&lt;100K\n\nThis is a tip.\n", + ) + def test_convert_img_links(self): page_info = {"package_name": "transformers", "version": "v4.10.0", "language": "fr"} From b7be025b979e11f231813218627176ebca6e51b8 Mon Sep 17 00:00:00 2001 From: Wauplin Date: Wed, 20 Sep 2023 15:49:13 +0200 Subject: [PATCH 2/5] comments --- src/doc_builder/convert_md_to_mdx.py | 2 +- tests/test_convert_md_to_mdx.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/doc_builder/convert_md_to_mdx.py b/src/doc_builder/convert_md_to_mdx.py index 72c5e5a6..bd6c4c45 100644 --- a/src/doc_builder/convert_md_to_mdx.py +++ b/src/doc_builder/convert_md_to_mdx.py @@ -23,7 +23,7 @@ _re_doctest_flags = re.compile(r"^(>>>.*\S)(\s+)# doctest:\s+\+[A-Z_]+\s*$", flags=re.MULTILINE) _re_lt_html = re.compile( - r""" + r"""# This regex is meant to detect any HTML tag or comment, but not standalone '<' characters. <( # HTML tag with... ( !DOCTYPE # ... !DOCTYPE diff --git a/tests/test_convert_md_to_mdx.py b/tests/test_convert_md_to_mdx.py index 73de984c..1405bd05 100644 --- a/tests/test_convert_md_to_mdx.py +++ b/tests/test_convert_md_to_mdx.py @@ -132,7 +132,7 @@ def test_convert_special_chars(self): "something &lt;5MB something else -> here", ) - # Regression test for + # Regression test for https://github.com/huggingface/doc-builder/pull/398 # '10K\nThis is a tip.\n"""), From ec14a49dc9916ec8f0e26148783bc22d72908bc2 Mon Sep 17 00:00:00 2001 From: Wauplin Date: Wed, 20 Sep 2023 16:09:32 +0200 Subject: [PATCH 3/5] fix unrelated test --- tests/test_autodoc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_autodoc.py b/tests/test_autodoc.py index eba2353c..b2db46ee 100644 --- a/tests/test_autodoc.py +++ b/tests/test_autodoc.py @@ -265,7 +265,7 @@ def test_document_object(self): model_output_doc = """ class transformers.utils.ModelOutputtransformers.utils.ModelOutput""" model_output_doc += f"{self.test_source_link}" - model_output_doc += """"" + model_output_doc += """[{"name": "*args", "val": ""}, {"name": "**kwargs", "val": ""}] Base class for all model outputs as dataclass. Has a `__getitem__` that allows indexing by integer or slice (like a tuple) or strings (like a dictionary) that will ignore the `None` attributes. Otherwise behaves like a regular From 8c6163b26990993adcfea537f6ec986185c55565 Mon Sep 17 00:00:00 2001 From: Mishig Date: Wed, 20 Sep 2023 16:31:44 +0200 Subject: [PATCH 4/5] temporary doc test --- .github/workflows/build_pr_documentation.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build_pr_documentation.yml b/.github/workflows/build_pr_documentation.yml index 019332a2..f29dea90 100644 --- a/.github/workflows/build_pr_documentation.yml +++ b/.github/workflows/build_pr_documentation.yml @@ -52,6 +52,7 @@ jobs: with: repository: 'huggingface/doc-builder' path: doc-builder + ref: fix-lt-html-regex - uses: actions/checkout@v2 with: @@ -102,7 +103,7 @@ jobs: run: | pip uninstall -y doc-builder cd doc-builder - git pull origin main + git pull origin fix-lt-html-regex pip install . cd .. From ea5a296cd448f9cc5dfef83bfde439377a05330b Mon Sep 17 00:00:00 2001 From: Wauplin Date: Wed, 20 Sep 2023 17:02:02 +0200 Subject: [PATCH 5/5] Revert "temporary doc test" This reverts commit 8c6163b26990993adcfea537f6ec986185c55565. --- .github/workflows/build_pr_documentation.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/build_pr_documentation.yml b/.github/workflows/build_pr_documentation.yml index f29dea90..019332a2 100644 --- a/.github/workflows/build_pr_documentation.yml +++ b/.github/workflows/build_pr_documentation.yml @@ -52,7 +52,6 @@ jobs: with: repository: 'huggingface/doc-builder' path: doc-builder - ref: fix-lt-html-regex - uses: actions/checkout@v2 with: @@ -103,7 +102,7 @@ jobs: run: | pip uninstall -y doc-builder cd doc-builder - git pull origin fix-lt-html-regex + git pull origin main pip install . cd ..