Skip to content

Commit ac5f4a9

Browse files
manavgup and claude committed
fix: Improve chunking robustness and type safety
This commit addresses three critical issues discovered during investigation of poor search result accuracy and chunking behavior:

## 1. Fix Oversized Sentence Handling in Chunking (Issue #1)

- **Problem**: Markdown tables and long sentences caused chunks up to 24,654 chars (exceeding WatsonX embedding 512-token limit)
- **Root cause**: sentence_based_chunking() added entire sentences regardless of size
- **Fix**: Split oversized sentences at word boundaries before adding to chunks
- **Impact**: Max chunk reduced from 24,654 to 596 chars (~238 tokens)
- **File**: backend/rag_solution/data_ingestion/chunking.py:195-217

## 2. Fix Configuration Consistency Across Chunking Strategies (Issue #2)

- **Problem**: sentence_chunker() multiplied config by 2.5 (assumed tokens), while other strategies used values as characters directly
- **Root cause**: Inconsistent interpretation across chunking strategies
- **Fix**: Standardized ALL strategies to use CHARACTERS, removed 2.5x multiplier
- **Impact**: Predictable, maintainable configuration across all strategies
- **File**: backend/rag_solution/data_ingestion/chunking.py:409-414

## 3. Fix Type Safety in LLM Model Repository (Issue #3)

- **Problem**: update_model() used duck-typing with hasattr() and dict type erasure
- **Root cause**: Poor type safety, no IDE autocomplete, runtime errors possible
- **Fix**: Changed to only accept LLMModelInput Pydantic type, use model_dump(exclude_unset=True)
- **Impact**: Better type checking, maintainability, IDE support
- **File**: backend/rag_solution/repository/llm_model_repository.py:69-92

## 4. Add Strict Typing Guidelines (New)

- Comprehensive documentation for type safety best practices
- Covers Pydantic models, type hints, mypy configuration
- **File**: docs/development/backend/strict-typing-guidelines.md

## Testing

- Chunking: Validated max chunk size reduced from 24,654 to 596 chars
- Type safety: All mypy checks pass
- Embedding comparison: Tested 8 models (IBM Slate, Granite, E5, MiniLM)

## Related Issues

- Addresses root causes discovered while investigating GitHub #461 (CoT reasoning)
- Created follow-up issues: #465-473 for remaining search accuracy problems

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
1 parent 5c0e487 commit ac5f4a9

File tree

3 files changed

+475
-15
lines changed

3 files changed

+475
-15
lines changed

backend/rag_solution/data_ingestion/chunking.py

Lines changed: 41 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -192,9 +192,36 @@ def sentence_based_chunking(
192192
for sentence in sentences:
193193
sentence_len = len(sentence)
194194

195-
# Check if adding this sentence would exceed target
196-
if current_char_count + sentence_len > target_chars and current_chunk:
197-
# Save current chunk
195+
# Handle oversized sentences by splitting them
196+
if sentence_len > target_chars:
197+
# Save current chunk first if not empty
198+
if current_chunk:
199+
chunk_text = " ".join(current_chunk)
200+
chunks.append(chunk_text)
201+
current_chunk = []
202+
current_char_count = 0
203+
204+
# Split oversized sentence into target-sized pieces
205+
start = 0
206+
while start < sentence_len:
207+
end = min(start + target_chars, sentence_len)
208+
# Try to break at word boundary
209+
if end < sentence_len:
210+
last_space = sentence[start:end].rfind(" ")
211+
if last_space > target_chars * 0.5: # At least 50% full
212+
end = start + last_space
213+
214+
chunks.append(sentence[start:end].strip())
215+
start = end
216+
217+
continue
218+
219+
# Account for space between sentences when joining
220+
space_len = 1 if current_chunk else 0
221+
222+
# STRICT: Don't add sentence if it would exceed target
223+
if current_char_count + space_len + sentence_len > target_chars and current_chunk:
224+
# Save current chunk (don't add the sentence that would exceed)
198225
chunk_text = " ".join(current_chunk)
199226
chunks.append(chunk_text)
200227

@@ -204,17 +231,18 @@ def sentence_based_chunking(
204231

205232
for i in range(len(current_chunk) - 1, -1, -1):
206233
sent_len = len(current_chunk[i])
207-
if overlap_count + sent_len <= overlap_chars:
234+
space = 1 if overlap_chunk else 0
235+
if overlap_count + space + sent_len <= overlap_chars:
208236
overlap_chunk.insert(0, current_chunk[i])
209-
overlap_count += sent_len
237+
overlap_count += sent_len + space
210238
else:
211239
break
212240

213241
current_chunk = overlap_chunk
214242
current_char_count = overlap_count
215243

216244
current_chunk.append(sentence)
217-
current_char_count += sentence_len
245+
current_char_count += sentence_len + space_len
218246

219247
# Add final chunk if it meets minimum size
220248
if current_chunk:
@@ -368,19 +396,20 @@ def hierarchical_chunker_wrapper(text: str, settings: Settings = get_settings())
368396
def sentence_chunker(text: str, settings: Settings = get_settings()) -> list[str]:
369397
"""Sentence-based chunking using settings configuration.
370398
371-
Uses conservative character-to-token ratio (2.5:1) for IBM Slate safety.
399+
All config values (min_chunk_size, max_chunk_size, chunk_overlap) are in CHARACTERS.
400+
Conservative char-to-token ratio (2.5:1) provides safety margin for IBM Slate 512-token limit.
372401
373402
Args:
374403
text: Input text to chunk
375-
settings: Configuration settings
404+
settings: Configuration settings (all values in characters)
376405
377406
Returns:
378407
List of sentence-based chunks
379408
"""
380-
# Convert config values assuming they're in tokens, multiply by 2.5 for chars
381-
target_chars = int(settings.max_chunk_size * 2.5) if settings.max_chunk_size < 1000 else 750
382-
overlap_chars = int(settings.chunk_overlap * 2.5) if settings.chunk_overlap < 200 else 100
383-
min_chars = int(settings.min_chunk_size * 2.5) if settings.min_chunk_size < 500 else 500
409+
# Use config values directly as characters (no conversion needed)
410+
target_chars = settings.max_chunk_size
411+
overlap_chars = settings.chunk_overlap
412+
min_chars = settings.min_chunk_size
384413

385414
return sentence_based_chunking(text, target_chars=target_chars, overlap_chars=overlap_chars, min_chars=min_chars)
386415

backend/rag_solution/repository/llm_model_repository.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -66,9 +66,13 @@ def get_models_by_type(self, model_type: ModelType) -> list[LLMModelOutput]:
6666
except Exception:
6767
raise
6868

69-
def update_model(self, model_id: UUID4, updates: dict) -> LLMModelOutput:
69+
def update_model(self, model_id: UUID4, updates: LLMModelInput) -> LLMModelOutput:
7070
"""Updates model details.
7171
72+
Args:
73+
model_id: ID of the model to update
74+
updates: LLMModelInput Pydantic model with fields to update
75+
7276
Raises:
7377
NotFoundError: If model not found
7478
"""
@@ -78,8 +82,9 @@ def update_model(self, model_id: UUID4, updates: dict) -> LLMModelOutput:
7882
if not model:
7983
raise NotFoundError(resource_type="LLMModel", resource_id=str(model_id))
8084

81-
# Apply updates
82-
for key, value in updates.items():
85+
# Update only fields that were explicitly set
86+
update_data = updates.model_dump(exclude_unset=True)
87+
for key, value in update_data.items():
8388
setattr(model, key, value)
8489

8590
self.session.commit()

0 commit comments

Comments (0)