qodo-benchmark · ofir-frd · Dec 30, 2025 · Dec 30, 2025 · Dec 31, 2025 · Jan 21, 2026
diff --git a/api/core/rag/cleaner/clean_processor.py b/api/core/rag/cleaner/clean_processor.py
@@ -27,26 +27,44 @@ def clean(cls, text: str, process_rule: dict) -> str:
                     pattern = r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)"
                     text = re.sub(pattern, "", text)
 
-                    # Remove URL but keep Markdown image URLs
-                    # First, temporarily replace Markdown image URLs with a placeholder
-                    markdown_image_pattern = r"!\[.*?\]\((https?://[^\s)]+)\)"
-                    placeholders: list[str] = []
+                    # Remove URL but keep Markdown image URLs and link URLs
+                    # Replace the ENTIRE markdown link/image with a single placeholder to protect
+                    # the link text (which might also be a URL) from being removed
+                    markdown_link_pattern = r"\[([^\]]*)\]\((https?://[^)]+)\)"
+                    markdown_image_pattern = r"!\[.*?\]\((https?://[^)]+)\)"
+                    placeholders: list[tuple[str, str, str]] = []  # (type, text, url)
 
-                    def replace_with_placeholder(match, placeholders=placeholders):
+                    def replace_markdown_with_placeholder(match, placeholders=placeholders):
+                        link_type = "link"
+                        link_text = match.group(1)
+                        url = match.group(2)
+                        placeholder = f"__MARKDOWN_PLACEHOLDER_{len(placeholders)}__"
+                        placeholders.append((link_type, link_text, url))
+                        return placeholder
+
+                    def replace_image_with_placeholder(match, placeholders=placeholders):
+                        link_type = "image"
                         url = match.group(1)
-                        placeholder = f"__MARKDOWN_IMAGE_URL_{len(placeholders)}__"
-                        placeholders.append(url)
-                        return f"![image]({placeholder})"
+                        placeholder = f"__MARKDOWN_PLACEHOLDER_{len(placeholders)}__"
+                        placeholders.append((link_type, "image", url))
+                        return placeholder
 
-                    text = re.sub(markdown_image_pattern, replace_with_placeholder, text)
+                    # Protect markdown links first
+                    text = re.sub(markdown_link_pattern, replace_markdown_with_placeholder, text)
+                    # Then protect markdown images
+                    text = re.sub(markdown_image_pattern, replace_image_with_placeholder, text)
 
                     # Now remove all remaining URLs
-                    url_pattern = r"https?://[^\s)]+"
+                    url_pattern = r"https?://\S+"
                     text = re.sub(url_pattern, "", text)
 
-                    # Finally, restore the Markdown image URLs
-                    for i, url in enumerate(placeholders):
-                        text = text.replace(f"__MARKDOWN_IMAGE_URL_{i}__", url)
+                    # Restore the Markdown links and images
+                    for i, (link_type, text_or_alt, url) in enumerate(placeholders):
+                        placeholder = f"__MARKDOWN_PLACEHOLDER_{i}__"
+                        if link_type == "link":
+                            text = text.replace(placeholder, f"[{text_or_alt}]({url})")
+                        else:  # image
+                            text = text.replace(placeholder, f"![{text_or_alt}]({url})")
         return text
 
     def filter_string(self, text):

diff --git a/api/core/tools/utils/text_processing_utils.py b/api/core/tools/utils/text_processing_utils.py
@@ -4,13 +4,19 @@
 def remove_leading_symbols(text: str) -> str:
     """
     Remove leading punctuation or symbols from the given text.
+    Preserves markdown links like [text](url) at the start.
 
     Args:
         text (str): The input text to process.
 
     Returns:
         str: The text with leading punctuation or symbols removed.
     """
+    # Check if text starts with a markdown link - preserve it
+    markdown_link_pattern = r"^\[([^\]]+)\]\((https?://[^)]+)\)"
+    if re.match(markdown_link_pattern, text):
+        return text
+
     # Match Unicode ranges for punctuation and symbols
     # FIXME this pattern is confused quick fix for #11868 maybe refactor it later
     pattern = r'^[\[\]\u2000-\u2025\u2027-\u206F\u2E00-\u2E7F\u3000-\u300F\u3011-\u303F"#$%&\'()*+,./:;<=>?@^_`~]+'

diff --git a/api/tests/unit_tests/core/rag/cleaner/__init__.py b/api/tests/unit_tests/core/rag/cleaner/__init__.py