Skip to content

Commit ac5f4a9

Browse files
manavgup and claude committed
fix: Improve chunking robustness and type safety
This commit addresses three critical issues discovered during investigation of poor search result accuracy and chunking behavior:

## 1. Fix Oversized Sentence Handling in Chunking (Issue #1)

- **Problem**: Markdown tables and long sentences caused chunks up to 24,654 chars (exceeding WatsonX embedding 512-token limit)
- **Root cause**: sentence_based_chunking() added entire sentences regardless of size
- **Fix**: Split oversized sentences at word boundaries before adding to chunks
- **Impact**: Max chunk reduced from 24,654 to 596 chars (~238 tokens)
- **File**: backend/rag_solution/data_ingestion/chunking.py:195-217

## 2. Fix Configuration Consistency Across Chunking Strategies (Issue #2)

- **Problem**: sentence_chunker() multiplied config by 2.5 (assumed tokens), while other strategies used values as characters directly
- **Root cause**: Inconsistent interpretation across chunking strategies
- **Fix**: Standardized ALL strategies to use CHARACTERS, removed 2.5x multiplier
- **Impact**: Predictable, maintainable configuration across all strategies
- **File**: backend/rag_solution/data_ingestion/chunking.py:409-414

## 3. Fix Type Safety in LLM Model Repository (Issue #3)

- **Problem**: update_model() used duck-typing with hasattr() and dict type erasure
- **Root cause**: Poor type safety, no IDE autocomplete, runtime errors possible
- **Fix**: Changed to only accept LLMModelInput Pydantic type, use model_dump(exclude_unset=True)
- **Impact**: Better type checking, maintainability, IDE support
- **File**: backend/rag_solution/repository/llm_model_repository.py:69-92

## 4. Add Strict Typing Guidelines (New)

- Comprehensive documentation for type safety best practices
- Covers Pydantic models, type hints, mypy configuration
- **File**: docs/development/backend/strict-typing-guidelines.md

## Testing

- Chunking: Validated max chunk size reduced from 24,654 to 596 chars
- Type safety: All mypy checks pass
- Embedding comparison: Tested 8 models (IBM Slate, Granite, E5, MiniLM)

## Related Issues

- Addresses root causes discovered while investigating GitHub #461 (CoT reasoning)
- Created follow-up issues: #465-473 for remaining search accuracy problems

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
1 parent 5c0e487 commit ac5f4a9

File tree

3 files changed

+475
-15
lines changed

3 files changed

+475
-15
lines changed

backend/rag_solution/data_ingestion/chunking.py

Lines changed: 41 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -192,9 +192,36 @@ def sentence_based_chunking(
192192
for sentence in sentences:
193193
sentence_len = len(sentence)
194194

195-
# Check if adding this sentence would exceed target
196-
if current_char_count + sentence_len > target_chars and current_chunk:
197-
# Save current chunk
195+
# Handle oversized sentences by splitting them
196+
if sentence_len > target_chars:
197+
# Save current chunk first if not empty
198+
if current_chunk:
199+
chunk_text = " ".join(current_chunk)
200+
chunks.append(chunk_text)
201+
current_chunk = []
202+
current_char_count = 0
203+
204+
# Split oversized sentence into target-sized pieces
205+
start = 0
206+
while start < sentence_len:
207+
end = min(start + target_chars, sentence_len)
208+
# Try to break at word boundary
209+
if end < sentence_len:
210+
last_space = sentence[start:end].rfind(" ")
211+
if last_space > target_chars * 0.5: # At least 50% full
212+
end = start + last_space
213+
214+
chunks.append(sentence[start:end].strip())
215+
start = end
216+
217+
continue
218+
219+
# Account for space between sentences when joining
220+
space_len = 1 if current_chunk else 0
221+
222+
# STRICT: Don't add sentence if it would exceed target
223+
if current_char_count + space_len + sentence_len > target_chars and current_chunk:
224+
# Save current chunk (don't add the sentence that would exceed)
198225
chunk_text = " ".join(current_chunk)
199226
chunks.append(chunk_text)
200227

@@ -204,17 +231,18 @@ def sentence_based_chunking(
204231

205232
for i in range(len(current_chunk) - 1, -1, -1):
206233
sent_len = len(current_chunk[i])
207-
if overlap_count + sent_len <= overlap_chars:
234+
space = 1 if overlap_chunk else 0
235+
if overlap_count + space + sent_len <= overlap_chars:
208236
overlap_chunk.insert(0, current_chunk[i])
209-
overlap_count += sent_len
237+
overlap_count += sent_len + space
210238
else:
211239
break
212240

213241
current_chunk = overlap_chunk
214242
current_char_count = overlap_count
215243

216244
current_chunk.append(sentence)
217-
current_char_count += sentence_len
245+
current_char_count += sentence_len + space_len
218246

219247
# Add final chunk if it meets minimum size
220248
if current_chunk:
@@ -368,19 +396,20 @@ def hierarchical_chunker_wrapper(text: str, settings: Settings = get_settings())
368396
def sentence_chunker(text: str, settings: Settings = get_settings()) -> list[str]:
369397
"""Sentence-based chunking using settings configuration.
370398
371-
Uses conservative character-to-token ratio (2.5:1) for IBM Slate safety.
399+
All config values (min_chunk_size, max_chunk_size, chunk_overlap) are in CHARACTERS.
400+
Conservative char-to-token ratio (2.5:1) provides safety margin for IBM Slate 512-token limit.
372401
373402
Args:
374403
text: Input text to chunk
375-
settings: Configuration settings
404+
settings: Configuration settings (all values in characters)
376405
377406
Returns:
378407
List of sentence-based chunks
379408
"""
380-
# Convert config values assuming they're in tokens, multiply by 2.5 for chars
381-
target_chars = int(settings.max_chunk_size * 2.5) if settings.max_chunk_size < 1000 else 750
382-
overlap_chars = int(settings.chunk_overlap * 2.5) if settings.chunk_overlap < 200 else 100
383-
min_chars = int(settings.min_chunk_size * 2.5) if settings.min_chunk_size < 500 else 500
409+
# Use config values directly as characters (no conversion needed)
410+
target_chars = settings.max_chunk_size
411+
overlap_chars = settings.chunk_overlap
412+
min_chars = settings.min_chunk_size
384413

385414
return sentence_based_chunking(text, target_chars=target_chars, overlap_chars=overlap_chars, min_chars=min_chars)
386415

backend/rag_solution/repository/llm_model_repository.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -66,9 +66,13 @@ def get_models_by_type(self, model_type: ModelType) -> list[LLMModelOutput]:
6666
except Exception:
6767
raise
6868

69-
def update_model(self, model_id: UUID4, updates: dict) -> LLMModelOutput:
69+
def update_model(self, model_id: UUID4, updates: LLMModelInput) -> LLMModelOutput:
7070
"""Updates model details.
7171
72+
Args:
73+
model_id: ID of the model to update
74+
updates: LLMModelInput Pydantic model with fields to update
75+
7276
Raises:
7377
NotFoundError: If model not found
7478
"""
@@ -78,8 +82,9 @@ def update_model(self, model_id: UUID4, updates: dict) -> LLMModelOutput:
7882
if not model:
7983
raise NotFoundError(resource_type="LLMModel", resource_id=str(model_id))
8084

81-
# Apply updates
82-
for key, value in updates.items():
85+
# Update only fields that were explicitly set
86+
update_data = updates.model_dump(exclude_unset=True)
87+
for key, value in update_data.items():
8388
setattr(model, key, value)
8489

8590
self.session.commit()

0 commit comments

Comments (0)