Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 31 additions & 13 deletions api/core/rag/cleaner/clean_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,26 +27,44 @@ def clean(cls, text: str, process_rule: dict) -> str:
pattern = r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)"
text = re.sub(pattern, "", text)

# Remove URL but keep Markdown image URLs
# First, temporarily replace Markdown image URLs with a placeholder
markdown_image_pattern = r"!\[.*?\]\((https?://[^\s)]+)\)"
placeholders: list[str] = []
# Remove URL but keep Markdown image URLs and link URLs
# Replace the ENTIRE markdown link/image with a single placeholder to protect
# the link text (which might also be a URL) from being removed
markdown_link_pattern = r"\[([^\]]*)\]\((https?://[^)]+)\)"
markdown_image_pattern = r"!\[.*?\]\((https?://[^)]+)\)"
placeholders: list[tuple[str, str, str]] = [] # (type, text, url)

def replace_with_placeholder(match, placeholders=placeholders):
def replace_markdown_with_placeholder(match, placeholders=placeholders):
link_type = "link"
link_text = match.group(1)
url = match.group(2)
placeholder = f"__MARKDOWN_PLACEHOLDER_{len(placeholders)}__"
placeholders.append((link_type, link_text, url))
return placeholder

def replace_image_with_placeholder(match, placeholders=placeholders):
link_type = "image"
url = match.group(1)
placeholder = f"__MARKDOWN_IMAGE_URL_{len(placeholders)}__"
placeholders.append(url)
return f"![image]({placeholder})"
placeholder = f"__MARKDOWN_PLACEHOLDER_{len(placeholders)}__"
placeholders.append((link_type, "image", url))
return placeholder

text = re.sub(markdown_image_pattern, replace_with_placeholder, text)
# Protect markdown links first
text = re.sub(markdown_link_pattern, replace_markdown_with_placeholder, text)
# Then protect markdown images
text = re.sub(markdown_image_pattern, replace_image_with_placeholder, text)

# Now remove all remaining URLs
url_pattern = r"https?://[^\s)]+"
url_pattern = r"https?://\S+"
text = re.sub(url_pattern, "", text)

# Finally, restore the Markdown image URLs
for i, url in enumerate(placeholders):
text = text.replace(f"__MARKDOWN_IMAGE_URL_{i}__", url)
# Restore the Markdown links and images
for i, (link_type, text_or_alt, url) in enumerate(placeholders):
placeholder = f"__MARKDOWN_PLACEHOLDER_{i}__"
if link_type == "link":
text = text.replace(placeholder, f"[{text_or_alt}]({url})")
else: # image
text = text.replace(placeholder, f"![{text_or_alt}]({url})")
return text

def filter_string(self, text):
Expand Down
6 changes: 6 additions & 0 deletions api/core/tools/utils/text_processing_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,19 @@
def remove_leading_symbols(text: str) -> str:
"""
Remove leading punctuation or symbols from the given text.
Preserves markdown links like [text](url) at the start.

Args:
text (str): The input text to process.

Returns:
str: The text with leading punctuation or symbols removed.
"""
# Check if text starts with a markdown link - preserve it
markdown_link_pattern = r"^\[([^\]]+)\]\((https?://[^)]+)\)"
if re.match(markdown_link_pattern, text):
return text

# Match Unicode ranges for punctuation and symbols
# FIXME this pattern is confused quick fix for #11868 maybe refactor it later
pattern = r'^[\[\]\u2000-\u2025\u2027-\u206F\u2E00-\u2E7F\u3000-\u300F\u3011-\u303F"#$%&\'()*+,./:;<=>?@^_`~]+'
Expand Down
Empty file.
Loading
Loading