diff --git a/api/core/rag/cleaner/clean_processor.py b/api/core/rag/cleaner/clean_processor.py index 9cb009035bfc9d..e182c35b9990ee 100644 --- a/api/core/rag/cleaner/clean_processor.py +++ b/api/core/rag/cleaner/clean_processor.py @@ -27,26 +27,44 @@ def clean(cls, text: str, process_rule: dict) -> str: pattern = r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)" text = re.sub(pattern, "", text) - # Remove URL but keep Markdown image URLs - # First, temporarily replace Markdown image URLs with a placeholder - markdown_image_pattern = r"!\[.*?\]\((https?://[^\s)]+)\)" - placeholders: list[str] = [] + # Remove URL but keep Markdown image URLs and link URLs + # Replace the ENTIRE markdown link/image with a single placeholder to protect + # the link text (which might also be a URL) from being removed + markdown_link_pattern = r"\[([^\]]*)\]\((https?://[^)]+)\)" + markdown_image_pattern = r"!\[.*?\]\((https?://[^)]+)\)" + placeholders: list[tuple[str, str, str]] = [] # (type, text, url) - def replace_with_placeholder(match, placeholders=placeholders): + def replace_markdown_with_placeholder(match, placeholders=placeholders): + link_type = "link" + link_text = match.group(1) + url = match.group(2) + placeholder = f"__MARKDOWN_PLACEHOLDER_{len(placeholders)}__" + placeholders.append((link_type, link_text, url)) + return placeholder + + def replace_image_with_placeholder(match, placeholders=placeholders): + link_type = "image" url = match.group(1) - placeholder = f"__MARKDOWN_IMAGE_URL_{len(placeholders)}__" - placeholders.append(url) - return f"![image]({placeholder})" + placeholder = f"__MARKDOWN_PLACEHOLDER_{len(placeholders)}__" + placeholders.append((link_type, "image", url)) + return placeholder - text = re.sub(markdown_image_pattern, replace_with_placeholder, text) + # Protect markdown links first + text = re.sub(markdown_link_pattern, replace_markdown_with_placeholder, text) + # Then protect markdown images + text = re.sub(markdown_image_pattern, 
replace_image_with_placeholder, text) # Now remove all remaining URLs - url_pattern = r"https?://[^\s)]+" + url_pattern = r"https?://\S+" text = re.sub(url_pattern, "", text) - # Finally, restore the Markdown image URLs - for i, url in enumerate(placeholders): - text = text.replace(f"__MARKDOWN_IMAGE_URL_{i}__", url) + # Restore the Markdown links and images + for i, (link_type, text_or_alt, url) in enumerate(placeholders): + placeholder = f"__MARKDOWN_PLACEHOLDER_{i}__" + if link_type == "link": + text = text.replace(placeholder, f"[{text_or_alt}]({url})") + else: # image + text = text.replace(placeholder, f"![{text_or_alt}]({url})") return text def filter_string(self, text): diff --git a/api/core/tools/utils/text_processing_utils.py b/api/core/tools/utils/text_processing_utils.py index 0f9a91a111f89d..4bfaa5e49bd345 100644 --- a/api/core/tools/utils/text_processing_utils.py +++ b/api/core/tools/utils/text_processing_utils.py @@ -4,6 +4,7 @@ def remove_leading_symbols(text: str) -> str: """ Remove leading punctuation or symbols from the given text. + Preserves markdown links like [text](url) at the start. Args: text (str): The input text to process. @@ -11,6 +12,11 @@ def remove_leading_symbols(text: str) -> str: Returns: str: The text with leading punctuation or symbols removed. 
""" + # Check if text starts with a markdown link - preserve it + markdown_link_pattern = r"^\[([^\]]+)\]\((https?://[^)]+)\)" + if re.match(markdown_link_pattern, text): + return text + # Match Unicode ranges for punctuation and symbols # FIXME this pattern is confused quick fix for #11868 maybe refactor it later pattern = r'^[\[\]\u2000-\u2025\u2027-\u206F\u2E00-\u2E7F\u3000-\u300F\u3011-\u303F"#$%&\'()*+,./:;<=>?@^_`~]+' diff --git a/api/tests/unit_tests/core/rag/cleaner/__init__.py b/api/tests/unit_tests/core/rag/cleaner/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/api/tests/unit_tests/core/rag/cleaner/test_clean_processor.py b/api/tests/unit_tests/core/rag/cleaner/test_clean_processor.py new file mode 100644 index 00000000000000..467194602abef6 --- /dev/null +++ b/api/tests/unit_tests/core/rag/cleaner/test_clean_processor.py @@ -0,0 +1,969 @@ +from core.rag.cleaner.clean_processor import CleanProcessor + + +class TestCleanProcessor: + """Test cases for CleanProcessor.clean method.""" + + def test_clean_default_removal_of_invalid_symbols(self): + """Test default cleaning removes invalid symbols.""" + # Test <| replacement + assert CleanProcessor.clean("text<|with<|invalid", None) == "text replacement + assert CleanProcessor.clean("text|>with|>invalid", None) == "text>with>invalid" + + # Test removal of control characters + text_with_control = "normal\x00text\x1fwith\x07control\x7fchars" + expected = "normaltextwithcontrolchars" + assert CleanProcessor.clean(text_with_control, None) == expected + + # Test U+FFFE removal + text_with_ufffe = "normal\ufffepadding" + expected = "normalpadding" + assert CleanProcessor.clean(text_with_ufffe, None) == expected + + def test_clean_with_none_process_rule(self): + """Test cleaning with None process_rule - only default cleaning applied.""" + text = "Hello<|World\x00" + expected = "Hello becomes >, control chars and U+FFFE are removed + assert CleanProcessor.clean(text, None) == "<<>>" + 
+ def test_clean_multiple_markdown_links_preserved(self): + """Test multiple markdown links are all preserved.""" + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_urls_emails", "enabled": True}]}} + + text = "[One](https://one.com) [Two](http://two.org) [Three](https://three.net)" + expected = "[One](https://one.com) [Two](http://two.org) [Three](https://three.net)" + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_markdown_link_text_as_url(self): + """Test markdown link where the link text itself is a URL.""" + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_urls_emails", "enabled": True}]}} + + # Link text that looks like URL should be preserved + text = "[https://text-url.com](https://actual-url.com)" + expected = "[https://text-url.com](https://actual-url.com)" + assert CleanProcessor.clean(text, process_rule) == expected + + # Text URL without markdown should be removed + text = "https://text-url.com https://actual-url.com" + expected = " " + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_complex_markdown_link_content(self): + """Test markdown links with complex content - known limitation with brackets in link text.""" + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_urls_emails", "enabled": True}]}} + + # Note: The regex pattern [^\]]* cannot handle ] within link text + # This is a known limitation - the pattern stops at the first ] + text = "[Text with [brackets] and (parens)](https://example.com)" + # Actual behavior: only matches up to first ], URL gets removed + expected = "[Text with [brackets] and (parens)](" + assert CleanProcessor.clean(text, process_rule) == expected + + # Test that properly formatted markdown links work + text = "[Text with (parens) and symbols](https://example.com)" + expected = "[Text with (parens) and symbols](https://example.com)" + assert CleanProcessor.clean(text, process_rule) == expected + + def 
test_clean_mixed_content_with_markdown_and_plain_urls(self): + """Test content with both markdown links and plain URLs.""" + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_urls_emails", "enabled": True}]}} + + text = "Visit [Google](https://google.com) or https://bing.com for search" + expected = "Visit [Google](https://google.com) or for search" + assert CleanProcessor.clean(text, process_rule) == expected + + text = "Image ![alt](https://img.com/pic.jpg) and link https://example.com" + expected = "Image ![alt](https://img.com/pic.jpg) and link " + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_urls_with_query_parameters(self): + """Test URL removal with query parameters.""" + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_urls_emails", "enabled": True}]}} + + text = "Visit https://example.com?param=value&foo=bar for details" + expected = "Visit for details" + assert CleanProcessor.clean(text, process_rule) == expected + + text = "[Link](https://example.com?param=value) text https://other.com?q=test" + expected = "[Link](https://example.com?param=value) text " + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_urls_with_fragments(self): + """Test URL removal with fragments.""" + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_urls_emails", "enabled": True}]}} + + text = "See https://example.com#section for more" + expected = "See for more" + assert CleanProcessor.clean(text, process_rule) == expected + + text = "[Link](https://example.com#anchor) and https://test.com#top" + expected = "[Link](https://example.com#anchor) and " + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_multiple_emails_in_text(self): + """Test removal of multiple email addresses.""" + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_urls_emails", "enabled": True}]}} + + text = "Contact alice@example.com or bob@test.org for help" 
+ expected = "Contact or for help" + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_email_with_special_characters(self): + """Test email removal with various valid characters.""" + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_urls_emails", "enabled": True}]}} + + text = "Email: user.name+tag@example.co.uk" + expected = "Email: " + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_http_vs_https_urls(self): + """Test removal of both HTTP and HTTPS URLs.""" + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_urls_emails", "enabled": True}]}} + + text = "Visit http://insecure.com and https://secure.com" + expected = "Visit and " + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_markdown_image_with_alt_text(self): + """Test markdown image preservation with various alt text.""" + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_urls_emails", "enabled": True}]}} + + text = "![Description of image](https://example.com/image.png)" + expected = "![Description of image](https://example.com/image.png)" + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_consecutive_markdown_links(self): + """Test consecutive markdown links without spaces.""" + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_urls_emails", "enabled": True}]}} + + text = "[First](https://first.com)[Second](https://second.com)[Third](https://third.com)" + expected = "[First](https://first.com)[Second](https://second.com)[Third](https://third.com)" + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_newlines_with_various_counts(self): + """Test newline reduction with different counts.""" + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_extra_spaces", "enabled": True}]}} + + text = "Line1\n\n\nLine2" + expected = "Line1\n\nLine2" + assert CleanProcessor.clean(text, process_rule) 
== expected + + text = "Line1\n\n\n\n\n\n\n\nLine2" + expected = "Line1\n\nLine2" + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_mixed_whitespace_characters(self): + """Test removal of various Unicode whitespace characters.""" + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_extra_spaces", "enabled": True}]}} + + text = "word1\u2000word2" + expected = "word1 word2" + assert CleanProcessor.clean(text, process_rule) == expected + + text = "word1\u3000word2" + expected = "word1 word2" + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_tabs_and_spaces(self): + """Test tab and space normalization.""" + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_extra_spaces", "enabled": True}]}} + + text = "word1\t\t\tword2" + expected = "word1 word2" + assert CleanProcessor.clean(text, process_rule) == expected + + text = "word1 word2" + expected = "word1 word2" + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_combination_of_rules_complex(self): + """Test complex combination of all rules.""" + process_rule = { + "rules": { + "pre_processing_rules": [ + {"id": "remove_extra_spaces", "enabled": True}, + {"id": "remove_urls_emails", "enabled": True}, + ] + } + } + + text = "Contact\n\n\n\n me@example.com \n\nat [Site](https://site.com) or https://other.com" + expected = "Contact\n\n \n\nat [Site](https://site.com) or " + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_url_at_end_of_sentence(self): + """Test URL removal at sentence boundaries.""" + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_urls_emails", "enabled": True}]}} + + text = "Visit our site at https://example.com." + expected = "Visit our site at ." 
+ assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_url_in_parentheses(self): + """Test URL removal when surrounded by parentheses.""" + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_urls_emails", "enabled": True}]}} + + text = "See documentation (https://docs.example.com) for details" + expected = "See documentation () for details" + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_markdown_link_empty_text(self): + """Test markdown link with empty link text.""" + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_urls_emails", "enabled": True}]}} + + text = "[](https://example.com)" + expected = "[](https://example.com)" + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_markdown_link_with_numbers(self): + """Test markdown link text containing numbers.""" + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_urls_emails", "enabled": True}]}} + + text = "[Link 123](https://example.com)" + expected = "[Link 123](https://example.com)" + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_markdown_link_with_special_chars(self): + """Test markdown link text with special characters.""" + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_urls_emails", "enabled": True}]}} + + text = "[Link: special!](https://example.com)" + expected = "[Link: special!](https://example.com)" + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_url_with_port(self): + """Test URL removal with port numbers.""" + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_urls_emails", "enabled": True}]}} + + text = "Connect to https://example.com:8080 for service" + expected = "Connect to for service" + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_url_with_username(self): + """Test URL removal with username in URL.""" + process_rule = {"rules": 
{"pre_processing_rules": [{"id": "remove_urls_emails", "enabled": True}]}} + + text = "Access https://user@example.com/path" + expected = "Access " + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_long_url_with_path(self): + """Test removal of long URLs with paths.""" + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_urls_emails", "enabled": True}]}} + + text = "Download from https://example.com/path/to/resource/file.zip today" + expected = "Download from today" + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_multiple_spaces_between_words(self): + """Test multiple space reduction between words.""" + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_extra_spaces", "enabled": True}]}} + + text = "word1 word2" + expected = "word1 word2" + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_newlines_at_start(self): + """Test newlines at the start of text.""" + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_extra_spaces", "enabled": True}]}} + + text = "\n\n\n\nContent" + expected = "\n\nContent" + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_newlines_at_end(self): + """Test newlines at the end of text.""" + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_extra_spaces", "enabled": True}]}} + + text = "Content\n\n\n\n" + expected = "Content\n\n" + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_only_newlines(self): + """Test text containing only newlines.""" + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_extra_spaces", "enabled": True}]}} + + text = "\n\n\n\n\n" + expected = "\n\n" + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_only_spaces(self): + """Test text containing only spaces.""" + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_extra_spaces", "enabled": True}]}} + + 
text = " " + expected = " " + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_email_in_sentence(self): + """Test email removal within a sentence.""" + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_urls_emails", "enabled": True}]}} + + text = "Please contact john.doe@company.com for assistance." + expected = "Please contact for assistance." + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_multiple_urls_consecutive(self): + """Test multiple consecutive URLs.""" + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_urls_emails", "enabled": True}]}} + + text = "Links: https://one.com https://two.com https://three.com" + expected = "Links: " + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_url_with_hyphen_in_domain(self): + """Test URL with hyphens in domain name.""" + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_urls_emails", "enabled": True}]}} + + text = "Visit https://my-example-site.com today" + expected = "Visit today" + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_url_with_subdomain(self): + """Test URL with subdomains.""" + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_urls_emails", "enabled": True}]}} + + text = "Go to https://api.staging.example.com/endpoint" + expected = "Go to " + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_markdown_image_without_alt(self): + """Test markdown image without alt text.""" + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_urls_emails", "enabled": True}]}} + + text = "![](https://example.com/image.png)" + expected = "![](https://example.com/image.png)" + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_markdown_nested_in_text(self): + """Test markdown links nested within larger text blocks.""" + process_rule = {"rules": {"pre_processing_rules": 
[{"id": "remove_urls_emails", "enabled": True}]}} + + text = "Before [link](https://example.com) middle https://other.com after" + expected = "Before [link](https://example.com) middle after" + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_invalid_symbols_with_text(self): + """Test invalid symbol removal mixed with normal text.""" + text = "Normal<|text|>with\x00invalid\x1fchars" + expected = "Normalwithinvalidchars" + assert CleanProcessor.clean(text, None) == expected + + def test_clean_unicode_fffe_multiple(self): + """Test multiple U+FFFE character removal.""" + text = "text\ufffemore\ufffedata\ufffe" + expected = "textmoredata" + assert CleanProcessor.clean(text, None) == expected + + def test_clean_all_control_characters(self): + """Test removal of various control characters.""" + text = "text\x00\x01\x02\x03\x04\x05\x06\x07end" + expected = "textend" + assert CleanProcessor.clean(text, None) == expected + + def test_clean_combined_invalid_symbols(self): + """Test combination of different invalid symbols.""" + text = "<|test|>\x00content\ufffe<|more|>" + expected = "content" + assert CleanProcessor.clean(text, None) == expected + + def test_clean_markdown_link_and_image_together(self): + """Test markdown link followed by markdown image.""" + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_urls_emails", "enabled": True}]}} + + text = "[Link](https://link.com) ![Image](https://image.com/pic.jpg)" + expected = "[Link](https://link.com) ![Image](https://image.com/pic.jpg)" + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_email_with_numbers(self): + """Test email with numbers in address.""" + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_urls_emails", "enabled": True}]}} + + text = "Contact user123@example456.com" + expected = "Contact " + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_url_ending_with_slash(self): + """Test URL 
that ends with a slash.""" + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_urls_emails", "enabled": True}]}} + + text = "Visit https://example.com/ for info" + expected = "Visit for info" + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_url_with_encoded_characters(self): + """Test URL with percent-encoded characters.""" + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_urls_emails", "enabled": True}]}} + + text = "Search https://example.com/search?q=hello%20world" + expected = "Search " + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_markdown_link_with_title(self): + """Test markdown link (title attribute not in URL part).""" + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_urls_emails", "enabled": True}]}} + + text = "[Link text](https://example.com)" + expected = "[Link text](https://example.com)" + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_spaces_with_newlines_mixed(self): + """Test mixed spaces and newlines normalization.""" + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_extra_spaces", "enabled": True}]}} + + text = "Line1\n\n\n \n\n Line2" + expected = "Line1\n\n \n\n Line2" + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_real_world_text_example_1(self): + """Test real-world text example with multiple elements.""" + process_rule = { + "rules": { + "pre_processing_rules": [ + {"id": "remove_extra_spaces", "enabled": True}, + {"id": "remove_urls_emails", "enabled": True}, + ] + } + } + + text = """ + For more information, visit [our website](https://example.com) or + email us at support@example.com. + + + You can also check https://blog.example.com for updates. + """ + expected = """ + For more information, visit [our website](https://example.com) or + email us at . + + You can also check for updates. 
+ """ + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_real_world_text_example_2(self): + """Test another real-world scenario.""" + process_rule = { + "rules": { + "pre_processing_rules": [ + {"id": "remove_extra_spaces", "enabled": True}, + {"id": "remove_urls_emails", "enabled": True}, + ] + } + } + + text = "Contact: john@company.com or visit https://company.com" + expected = "Contact: or visit " + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_markdown_with_inline_code(self): + """Test markdown links alongside inline code (not affected).""" + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_urls_emails", "enabled": True}]}} + + text = "Use [docs](https://docs.com) or see `https://example.com`" + expected = "Use [docs](https://docs.com) or see ``" + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_line_breaks_preservation(self): + """Test that single and double line breaks are preserved correctly.""" + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_extra_spaces", "enabled": True}]}} + + text = "Line1\nLine2\n\nLine3" + expected = "Line1\nLine2\n\nLine3" + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_url_followed_by_punctuation(self): + """Test URL followed immediately by punctuation.""" + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_urls_emails", "enabled": True}]}} + + text = "Visit https://example.com, https://other.com; or https://third.com!" + expected = "Visit , ; or !" + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_email_followed_by_punctuation(self): + """Test email followed immediately by punctuation.""" + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_urls_emails", "enabled": True}]}} + + text = "Contact user@example.com, admin@test.com; or support@help.com." + expected = "Contact , ; or ." 
+ assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_multiple_markdown_images(self): + """Test multiple markdown images in sequence.""" + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_urls_emails", "enabled": True}]}} + + text = "![img1](https://a.com/1.jpg) ![img2](https://b.com/2.jpg) ![img3](https://c.com/3.jpg)" + expected = "![img1](https://a.com/1.jpg) ![img2](https://b.com/2.jpg) ![img3](https://c.com/3.jpg)" + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_form_feed_and_carriage_return(self): + """Test form feed and carriage return handling.""" + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_extra_spaces", "enabled": True}]}} + + text = "word1\f\f\rword2" + expected = "word1 word2" + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_non_breaking_space(self): + """Test non-breaking space handling.""" + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_extra_spaces", "enabled": True}]}} + + text = "word1\u00a0\u00a0word2" + expected = "word1 word2" + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_ideographic_space(self): + """Test ideographic space (U+3000) handling.""" + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_extra_spaces", "enabled": True}]}} + + text = "word1\u3000\u3000\u3000word2" + expected = "word1 word2" + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_url_with_tld_variations(self): + """Test URL removal with various TLDs.""" + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_urls_emails", "enabled": True}]}} + + text = "Visit https://example.co.uk or https://site.io or https://page.dev" + expected = "Visit or or " + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_markdown_link_multiline(self): + """Test markdown links across lines.""" + process_rule = {"rules": 
{"pre_processing_rules": [{"id": "remove_urls_emails", "enabled": True}]}} + + text = "[Link](https://example.com)\nNext line" + expected = "[Link](https://example.com)\nNext line" + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_email_at_start_of_text(self): + """Test email at the very beginning of text.""" + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_urls_emails", "enabled": True}]}} + + text = "user@example.com is the contact" + expected = " is the contact" + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_url_at_start_of_text(self): + """Test URL at the very beginning of text.""" + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_urls_emails", "enabled": True}]}} + + text = "https://example.com is the website" + expected = " is the website" + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_mixed_newlines_and_content(self): + """Test complex mix of newlines and content.""" + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_extra_spaces", "enabled": True}]}} + + text = "Para1\n\n\nPara2\n\n\n\nPara3" + expected = "Para1\n\nPara2\n\nPara3" + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_whitespace_only_lines(self): + """Test lines containing only whitespace.""" + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_extra_spaces", "enabled": True}]}} + + text = "Line1\n \n \nLine2" + expected = "Line1\n \n \nLine2" + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_url_in_quotes(self): + """Test URL within quotes.""" + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_urls_emails", "enabled": True}]}} + + text = 'Visit "https://example.com" for more' + expected = 'Visit "" for more' + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_email_in_quotes(self): + """Test email within quotes.""" + 
process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_urls_emails", "enabled": True}]}} + + text = 'Email "contact@example.com" for help' + expected = 'Email "" for help' + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_markdown_link_in_list(self): + """Test markdown links in list format.""" + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_urls_emails", "enabled": True}]}} + + text = "- [Link1](https://one.com)\n- [Link2](https://two.com)" + expected = "- [Link1](https://one.com)\n- [Link2](https://two.com)" + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_url_with_authentication(self): + """Test URL with username:password format.""" + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_urls_emails", "enabled": True}]}} + + text = "Access https://user:pass@example.com/resource" + expected = "Access " + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_multiple_control_chars_consecutive(self): + """Test consecutive control characters.""" + text = "text\x00\x01\x02\x03\x04more" + expected = "textmore" + assert CleanProcessor.clean(text, None) == expected + + def test_clean_special_bracket_sequences(self): + """Test special bracket replacement patterns.""" + text = "test<|bracket|>end" + expected = "testend" + assert CleanProcessor.clean(text, None) == expected + + def test_clean_repeated_invalid_patterns(self): + """Test repeated invalid symbol patterns.""" + text = "<|<|<|test|>|>|>" + expected = "<<>>" + assert CleanProcessor.clean(text, None) == expected + + def test_clean_email_with_subdomain(self): + """Test email with subdomain in domain part.""" + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_urls_emails", "enabled": True}]}} + + text = "Contact user@mail.example.com" + expected = "Contact " + assert CleanProcessor.clean(text, process_rule) == expected + + def 
test_clean_url_with_multiple_query_params(self): + """Test URL with multiple query parameters.""" + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_urls_emails", "enabled": True}]}} + + text = "Search https://example.com?q=test&page=1&sort=desc" + expected = "Search " + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_markdown_image_with_title(self): + """Test markdown image (title not in main pattern).""" + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_urls_emails", "enabled": True}]}} + + text = "![alt text](https://example.com/img.png)" + expected = "![alt text](https://example.com/img.png)" + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_complex_whitespace_pattern(self): + """Test complex whitespace pattern.""" + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_extra_spaces", "enabled": True}]}} + + text = "word1\t \t \tword2" + expected = "word1 word2" + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_newline_after_url(self): + """Test newline immediately after URL.""" + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_urls_emails", "enabled": True}]}} + + text = "Visit https://example.com\nfor more info" + expected = "Visit \nfor more info" + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_newline_after_email(self): + """Test newline immediately after email.""" + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_urls_emails", "enabled": True}]}} + + text = "Contact admin@example.com\nfor support" + expected = "Contact \nfor support" + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_markdown_link_with_dash_in_text(self): + """Test markdown link with dashes in link text.""" + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_urls_emails", "enabled": True}]}} + + text = "[My-Link-Text](https://example.com)" 
+ expected = "[My-Link-Text](https://example.com)" + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_markdown_link_with_underscore(self): + """Test markdown link with underscores in text.""" + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_urls_emails", "enabled": True}]}} + + text = "[Link_With_Underscores](https://example.com)" + expected = "[Link_With_Underscores](https://example.com)" + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_url_localhost(self): + """Test localhost URL removal.""" + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_urls_emails", "enabled": True}]}} + + text = "Test at http://localhost:3000/api" + expected = "Test at " + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_url_ip_address(self): + """Test IP address URL removal.""" + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_urls_emails", "enabled": True}]}} + + text = "Connect to http://192.168.1.1:8080" + expected = "Connect to " + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_file_protocol_url(self): + """Test that file:// protocol is not removed (only http/https).""" + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_urls_emails", "enabled": True}]}} + + text = "Open file://path/to/file.txt" + expected = "Open file://path/to/file.txt" + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_ftp_protocol_url(self): + """Test that ftp:// protocol is not removed (only http/https).""" + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_urls_emails", "enabled": True}]}} + + text = "Download from ftp://ftp.example.com/file" + expected = "Download from ftp://ftp.example.com/file" + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_mixed_valid_invalid_symbols(self): + """Test mixture of valid and invalid symbols.""" + text =
"Normal<|text with|>valid and\x00invalid" + expected = "Normalvalid and invalid" + assert CleanProcessor.clean(text, None) == expected + + def test_clean_unicode_replacement_char(self): + """Test removal of the U+FFFE Unicode noncharacter.""" + text = "start\ufffeend" + expected = "startend" + assert CleanProcessor.clean(text, None) == expected + + def test_clean_text_with_emoji_preserved(self): + """Test that emoji characters are preserved.""" + text = "Hello 😀 World 🌍" + expected = "Hello 😀 World 🌍" + assert CleanProcessor.clean(text, None) == expected + + def test_clean_chinese_characters_preserved(self): + """Test that Chinese characters are preserved.""" + text = "你好世界" + expected = "你好世界" + assert CleanProcessor.clean(text, None) == expected + + def test_clean_arabic_characters_preserved(self): + """Test that Arabic characters are preserved.""" + text = "مرحبا بالعالم" + expected = "مرحبا بالعالم" + assert CleanProcessor.clean(text, None) == expected + + def test_clean_cyrillic_characters_preserved(self): + """Test that Cyrillic characters are preserved.""" + text = "Привет мир" + expected = "Привет мир" + assert CleanProcessor.clean(text, None) == expected + + def test_clean_mixed_languages(self): + """Test mixed language content.""" + text = "Hello 你好 مرحبا Привет" + expected = "Hello 你好 مرحبا Привет" + assert CleanProcessor.clean(text, None) == expected + + def test_clean_url_with_hash_in_query(self): + """Test URL with a fragment after the query string.""" + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_urls_emails", "enabled": True}]}} + + text = "Visit https://example.com?id=123#section" + expected = "Visit " + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_email_with_dots_in_local_part(self): + """Test email with multiple dots in local part.""" + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_urls_emails", "enabled": True}]}} + + text = "Contact first.middle.last@example.com" + expected =
"Contact " + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_url_data_uri(self): + """Test that data: URI is not removed (only http/https).""" + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_urls_emails", "enabled": True}]}} + + text = "Image data:image/png;base64,ABC123" + expected = "Image data:image/png;base64,ABC123" + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_markdown_reference_link(self): + """Test reference-style markdown link (not matched by the inline link pattern, so the URL is removed).""" + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_urls_emails", "enabled": True}]}} + + text = "[Link][1]\n\n[1]: https://example.com" + expected = "[Link][1]\n\n[1]: " + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_url_with_equals_in_fragment(self): + """Test URL with equals sign in fragment.""" + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_urls_emails", "enabled": True}]}} + + text = "Link https://example.com#key=value here" + expected = "Link here" + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_multiple_at_symbols(self): + """Test text with multiple @ symbols (not email).""" + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_urls_emails", "enabled": True}]}} + + text = "Mention @user @another in text" + expected = "Mention @user @another in text" + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_url_without_protocol_not_removed(self): + """Test that URLs without protocol are not removed.""" + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_urls_emails", "enabled": True}]}} + + text = "Visit example.com or www.example.com" + expected = "Visit example.com or www.example.com" + assert CleanProcessor.clean(text, process_rule) == expected + + def test_clean_partial_email_not_removed(self): + """Test that partial email patterns are not
removed.""" + process_rule = {"rules": {"pre_processing_rules": [{"id": "remove_urls_emails", "enabled": True}]}} + + text = "Username is user@ or @domain" + expected = "Username is user@ or @domain" + assert CleanProcessor.clean(text, process_rule) == expected diff --git a/api/tests/unit_tests/utils/test_text_processing.py b/api/tests/unit_tests/utils/test_text_processing.py index 11e017464adc4b..bf61162a66bf8e 100644 --- a/api/tests/unit_tests/utils/test_text_processing.py +++ b/api/tests/unit_tests/utils/test_text_processing.py @@ -15,6 +15,11 @@ ("", ""), (" ", " "), ("【测试】", "【测试】"), + # Markdown link preservation - should be preserved if text starts with a markdown link + ("[Google](https://google.com) is a search engine", "[Google](https://google.com) is a search engine"), + ("[Example](http://example.com) some text", "[Example](http://example.com) some text"), + # Leading symbols before markdown link are removed, including the opening bracket [ + ("@[Test](https://example.com)", "Test](https://example.com)"), ], ) def test_remove_leading_symbols(input_text, expected_output):