fix(cot): address code quality issues from PR review

manavgup · manavgup · commit 25243ef836d5 · 2025-10-26T11:03:08.000-04:00
Implements critical code quality improvements from PR #490 review:

1. **ReDoS Protection (Security)**:
   - Added MAX_REGEX_INPUT_LENGTH constant (10KB limit, measured in characters)
   - Length checks before all regex operations in:
     - _parse_xml_tags
     - _parse_json_structure
     - _parse_final_answer_marker
   - Responses longer than the limit are truncated to it (with a warning) before matching
   - Mitigates regex denial of service (ReDoS) attacks

2. **Pre-compiled Regex Patterns (Performance)**:
   - XML_ANSWER_PATTERN for <answer> tags
   - JSON_ANSWER_PATTERN for JSON structures
   - FINAL_ANSWER_PATTERN for "Final Answer:" markers
   - Improves performance by compiling patterns once

3. **Specific Exception Handling**:
   - Changed the generic `except Exception` to specific types
   - Catches LLMProviderError, ValidationError, PydanticValidationError
   - On the final retry, wraps non-LLMProviderError exceptions in LLMProviderError (an LLMProviderError is re-raised unchanged)
   - Maintains retry logic with proper exception chaining

4. **Production Logging**:
   - Changed verbose logger.info to logger.debug
   - Applies to answer_synthesizer.py and chain_of_thought_service.py
   - Reduces production log noise

Related: #490
diff --git a/backend/rag_solution/services/chain_of_thought_service.py b/backend/rag_solution/services/chain_of_thought_service.py
@@ -1,5 +1,7 @@
 """Chain of Thought (CoT) service for enhanced RAG search quality."""
 
+import json
+import re
 import time
 from typing import TYPE_CHECKING, Any
 from uuid import UUID
@@ -37,6 +39,14 @@
 
 logger = get_logger(__name__)
 
+# Security: Maximum input length (in characters, as measured by len()) for regex operations to prevent ReDoS attacks
+MAX_REGEX_INPUT_LENGTH = 10 * 1024  # 10KB
+
+# Pre-compiled regex patterns for better performance
+XML_ANSWER_PATTERN = re.compile(r"<answer>(.*?)</answer>", re.DOTALL | re.IGNORECASE)
+JSON_ANSWER_PATTERN = re.compile(r"\{[^{}]*\"answer\"[^{}]*\}", re.DOTALL)
+FINAL_ANSWER_PATTERN = re.compile(r"final\s+answer:\s*(.+)", re.DOTALL | re.IGNORECASE)
+
 
 class ChainOfThoughtService:
     """Service for Chain of Thought reasoning in RAG search."""
@@ -299,9 +309,12 @@ def _parse_xml_tags(self, llm_response: str) -> str | None:
         Returns:
             Extracted answer or None if not found
         """
-        import re
+        # ReDoS protection: Limit input length for regex operations
+        if len(llm_response) > MAX_REGEX_INPUT_LENGTH:
+            logger.warning("LLM response exceeds %d chars, truncating for ReDoS protection", MAX_REGEX_INPUT_LENGTH)
+            llm_response = llm_response[:MAX_REGEX_INPUT_LENGTH]
 
-        answer_match = re.search(r"<answer>(.*?)</answer>", llm_response, re.DOTALL | re.IGNORECASE)
+        answer_match = XML_ANSWER_PATTERN.search(llm_response)
         if answer_match:
             return answer_match.group(1).strip()
 
@@ -325,12 +338,14 @@ def _parse_json_structure(self, llm_response: str) -> str | None:
         Returns:
             Extracted answer or None if not found
         """
-        import json
-        import re
+        # ReDoS protection: Limit input length for regex operations
+        if len(llm_response) > MAX_REGEX_INPUT_LENGTH:
+            logger.warning("LLM response exceeds %d chars, truncating for ReDoS protection", MAX_REGEX_INPUT_LENGTH)
+            llm_response = llm_response[:MAX_REGEX_INPUT_LENGTH]
 
         try:
             # Try to find JSON object
-            json_match = re.search(r"\{[^{}]*\"answer\"[^{}]*\}", llm_response, re.DOTALL)
+            json_match = JSON_ANSWER_PATTERN.search(llm_response)
             if json_match:
                 data = json.loads(json_match.group(0))
                 if "answer" in data:
@@ -349,10 +364,13 @@ def _parse_final_answer_marker(self, llm_response: str) -> str | None:
         Returns:
             Extracted answer or None if not found
         """
-        import re
+        # ReDoS protection: Limit input length for regex operations
+        if len(llm_response) > MAX_REGEX_INPUT_LENGTH:
+            logger.warning("LLM response exceeds %d chars, truncating for ReDoS protection", MAX_REGEX_INPUT_LENGTH)
+            llm_response = llm_response[:MAX_REGEX_INPUT_LENGTH]
 
         # Try "Final Answer:" marker
-        final_match = re.search(r"final\s+answer:\s*(.+)", llm_response, re.DOTALL | re.IGNORECASE)
+        final_match = FINAL_ANSWER_PATTERN.search(llm_response)
         if final_match:
             return final_match.group(1).strip()
 
@@ -600,10 +618,15 @@ def _generate_llm_response_with_retry(
                     logger.info("Waiting %ds before retry (exponential backoff)...", delay)
                     time.sleep(delay)
 
-            except Exception as exc:
+            except (LLMProviderError, ValidationError, PydanticValidationError) as exc:
                 logger.error("Attempt %d/%d failed: %s", attempt + 1, max_retries, exc)
                 if attempt == max_retries - 1:
-                    raise
+                    # Wrap in LLMProviderError as documented in the method signature
+                    if isinstance(exc, LLMProviderError):
+                        raise
+                    raise LLMProviderError(
+                        f"LLM response generation failed after {max_retries} attempts: {exc}"
+                    ) from exc
 
                 # Exponential backoff before retry
                 delay = 2**attempt  # 1s, 2s, 4s for attempts 0, 1, 2