|
2 | 2 | # pylint: disable=too-many-lines |
3 | 3 | # Justification: Search service orchestrates multiple complex search paths |
4 | 4 |
|
| 5 | +import re |
5 | 6 | import time |
6 | 7 | from collections.abc import Callable |
7 | 8 | from functools import wraps |
|
42 | 43 |
|
# Module-level logger for the search service (project logging helper).
logger = get_logger("services.search")
|
| 46 | +# Pre-compiled regex pattern for HTML tag detection (optimized for performance) |
| 47 | +# Matches common HTML tags: table, div, p, b, strong, em, i, a, ul, ol, li, h1-h6, code, pre, blockquote, img, br, hr |
| 48 | +_HTML_TAG_PATTERN = re.compile( |
| 49 | + r"<(?:table|div|p|b|strong|em|i|a\s|ul|ol|li|h[1-6]|code|pre|blockquote|img\s|br|hr)", |
| 50 | + re.IGNORECASE, |
| 51 | +) |
| 52 | + |
45 | 53 | T = TypeVar("T") |
46 | 54 | P = ParamSpec("P") |
47 | 55 |
|
@@ -352,37 +360,102 @@ def _generate_document_metadata( |
352 | 360 |
|
353 | 361 | def _clean_generated_answer(self, answer: str) -> str: |
354 | 362 | """ |
355 | | - Clean generated answer by removing artifacts and duplicates. |
| 363 | + Clean generated answer by removing artifacts and duplicates while preserving Markdown. |
356 | 364 |
|
357 | 365 | Removes: |
358 | 366 | - " AND " artifacts from query rewriting |
359 | | - - Duplicate consecutive words |
| 367 | + - Duplicate consecutive words (except in Markdown headers) |
360 | 368 | - Leading/trailing whitespace |
| 369 | +
|
| 370 | + Converts: |
| 371 | + - HTML formatting to Markdown (tables, bold, italic, links, lists, etc.) |
| 372 | +
|
| 373 | + Preserves: |
| 374 | + - Markdown headers (##, ###, etc.) |
| 375 | + - Markdown formatting (bold, italic, lists, code blocks, etc.) |
| 376 | +
|
| 377 | + This ensures proper rendering in the React frontend which uses ReactMarkdown. |
| 378 | + Issue #655: Support all HTML formatting types, not just tables. |
361 | 379 | """ |
362 | 380 | # pylint: disable=import-outside-toplevel |
363 | | - # Justification: Lazy import to avoid loading re module unless needed |
364 | | - import re |
| 381 | + # Justification: Lazy import to avoid loading html2text unless needed |
| 382 | + import html2text |
365 | 383 |
|
366 | 384 | cleaned = answer.strip() |
367 | 385 |
|
368 | | - # Remove " AND " artifacts that come from query rewriting |
| 386 | + # Convert HTML to Markdown if HTML tags detected |
| 387 | + # Optimized: Use single pre-compiled regex pattern instead of 20+ searches |
| 388 | + if "<" in cleaned and ">" in cleaned and _HTML_TAG_PATTERN.search(cleaned): |
| 389 | + try: |
| 390 | + # Configure html2text for clean Markdown conversion |
| 391 | + h = html2text.HTML2Text() |
| 392 | + h.body_width = 0 # Don't wrap lines |
| 393 | + h.unicode_snob = True # Use Unicode characters |
| 394 | + h.ignore_links = False # Keep links |
| 395 | + h.ignore_images = False # Keep images |
| 396 | + h.ignore_emphasis = False # Keep bold/italic |
| 397 | + h.skip_internal_links = False # Keep all links |
| 398 | + h.inline_links = True # Use inline link format [text](url) |
| 399 | + h.protect_links = True # Don't mangle URLs |
| 400 | + h.wrap_links = False # Don't wrap links |
| 401 | + h.wrap_lists = False # Don't wrap lists |
| 402 | + |
| 403 | + # Convert HTML to Markdown |
| 404 | + cleaned = h.handle(cleaned) |
| 405 | + except Exception as e: |
| 406 | + # If HTML conversion fails, log warning and continue with original text |
| 407 | + logger.warning("Failed to convert HTML to Markdown: %s", e) |
| 408 | + # Continue with original cleaned text (HTML tags will remain but ReactMarkdown handles them safely) |
| 409 | + |
| 410 | + # STEP 1: Protect Markdown headers and formatting before cleaning |
| 411 | + # Extract and protect Markdown headers (##, ###, etc.) |
| 412 | + markdown_header_pattern = re.compile(r"^(#{1,6}\s+.+)$", re.MULTILINE) |
| 413 | + markdown_headers = markdown_header_pattern.findall(cleaned) |
| 414 | + header_placeholders = {} |
| 415 | + |
| 416 | + # Replace all occurrences of each header to handle duplicates correctly |
| 417 | + for i, header in enumerate(markdown_headers): |
| 418 | + placeholder = f"__MDHEADER_{i}__" |
| 419 | + header_placeholders[placeholder] = header |
| 420 | + # Replace all occurrences of this header (not just first) |
| 421 | + cleaned = cleaned.replace(header, placeholder) |
| 422 | + |
| 423 | + # STEP 2: Remove " AND " artifacts that come from query rewriting |
369 | 424 | # Handle both middle "AND" and trailing "AND" |
370 | 425 | cleaned = re.sub(r"\s+AND\s+", " ", cleaned) # Middle ANDs |
371 | 426 | cleaned = re.sub(r"\s+AND$", "", cleaned) # Trailing AND |
372 | 427 |
|
373 | | - # Remove duplicate consecutive words |
374 | | - words = cleaned.split() |
375 | | - deduplicated_words = [] |
376 | | - prev_word = None |
| 428 | + # STEP 3: Remove duplicate consecutive words (but NOT in protected headers) |
| 429 | + lines = cleaned.split("\n") |
| 430 | + processed_lines = [] |
| 431 | + |
| 432 | + for line in lines: |
| 433 | + # Skip deduplication for lines with header placeholders |
| 434 | + if any(placeholder in line for placeholder in header_placeholders): |
| 435 | + processed_lines.append(line) |
| 436 | + continue |
| 437 | + |
| 438 | + # Process regular lines |
| 439 | + words = line.split() |
| 440 | + deduplicated_words = [] |
| 441 | + prev_word = None |
| 442 | + |
| 443 | + for word in words: |
| 444 | + if not prev_word or word.lower() != prev_word.lower(): |
| 445 | + deduplicated_words.append(word) |
| 446 | + prev_word = word # Always update prev_word for next iteration |
| 447 | + |
| 448 | + processed_lines.append(" ".join(deduplicated_words)) |
| 449 | + |
| 450 | + result = "\n".join(processed_lines) |
377 | 451 |
|
378 | | - for word in words: |
379 | | - if not prev_word or word.lower() != prev_word.lower(): |
380 | | - deduplicated_words.append(word) |
381 | | - prev_word = word |
| 452 | + # STEP 4: Restore Markdown headers |
| 453 | + for placeholder, header in header_placeholders.items(): |
| 454 | + result = result.replace(placeholder, header) |
382 | 455 |
|
383 | | - # Join back and clean up any multiple spaces |
384 | | - result = " ".join(deduplicated_words) |
385 | | - result = re.sub(r"\s+", " ", result).strip() |
| 456 | + # STEP 5: Clean up any multiple spaces (but preserve newlines) |
| 457 | + result = re.sub(r" +", " ", result) # Multiple spaces to single space |
| 458 | + result = result.strip() |
386 | 459 |
|
387 | 460 | return result |
388 | 461 |
|
|
0 commit comments