42 changes: 36 additions & 6 deletions src/kit/pr_review/line_ref_fixer.py
@@ -2,10 +2,13 @@

import bisect
import re
from typing import Dict, List, Tuple
from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union

from .diff_parser import DiffParser

if TYPE_CHECKING:
from .diff_parser import FileDiff


class LineRefFixer:
"""Utility to validate and auto-fix file:line references in an AI review comment."""
@@ -15,8 +18,22 @@ class LineRefFixer:
REF_PATTERN = re.compile(r"([\w./+-]+\.[a-zA-Z0-9]{1,10}):(\d+)(?:-(\d+))?")

@classmethod
def _build_valid_line_map(cls, diff_text: str) -> Dict[str, set[int]]:
diff_files = DiffParser.parse_diff(diff_text)
def _build_valid_line_map(
cls,
diff_text_or_parsed: Union[str, Dict[str, "FileDiff"]],
) -> Dict[str, set[int]]:
"""Build map of valid line numbers from diff.

Args:
diff_text_or_parsed: Either raw diff text (str) or pre-parsed diff dict.
Passing a pre-parsed diff avoids redundant parsing when the caller
has already parsed the diff.
"""
if isinstance(diff_text_or_parsed, str):
diff_files = DiffParser.parse_diff(diff_text_or_parsed)
else:
diff_files = diff_text_or_parsed

valid: Dict[str, set[int]] = {}
for filename, fd in diff_files.items():
line_set: set[int] = set()
@@ -31,12 +48,25 @@ def _build_valid_line_map(cls, diff_text: str) -> Dict[str, set[int]]:
return valid

@classmethod
def fix_comment(cls, comment: str, diff_text: str) -> Tuple[str, List[Tuple[str, int, int]]]:
def fix_comment(
cls,
comment: str,
diff_text: str,
parsed_diff: Optional[Dict[str, "FileDiff"]] = None,
) -> Tuple[str, List[Tuple[str, int, int]]]:
"""Return (fixed_comment, fixes).

fixes list items are (filename, old_line, new_line).
Args:
comment: The review comment text to fix.
diff_text: Raw diff text (used if parsed_diff not provided).
parsed_diff: Pre-parsed diff dict. If provided, avoids re-parsing
the diff, which saves ~0.1ms per call.

Returns:
Tuple of (fixed_comment, fixes) where fixes is a list of
(filename, old_line, new_line) tuples.
"""
valid_map = cls._build_valid_line_map(diff_text)
valid_map = cls._build_valid_line_map(parsed_diff if parsed_diff else diff_text)
# Convert sets to sorted lists once for O(log n) lookups
sorted_lines_cache: Dict[str, List[int]] = {}
fixes: List[Tuple[str, int, int]] = []
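A minimal usage sketch of the new fast path: parse the diff once, then hand the parsed result to fix_comment so it skips re-parsing. The diff file name, comment text, and package-level import paths are assumptions for illustration; the signatures mirror the diff above.

    from kit.pr_review.diff_parser import DiffParser
    from kit.pr_review.line_ref_fixer import LineRefFixer

    with open("pr.diff") as f:  # hypothetical diff file
        diff_text = f.read()

    # Parse once, then reuse the result for every comment that needs fixing.
    parsed = DiffParser.parse_diff(diff_text)

    comment = "Possible off-by-one in src/app.py:12"  # hypothetical review comment
    fixed_comment, fixes = LineRefFixer.fix_comment(comment, diff_text, parsed_diff=parsed)
    for filename, old_line, new_line in fixes:
        print(f"{filename}: {old_line} -> {new_line}")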
13 changes: 10 additions & 3 deletions src/kit/pr_review/reviewer.py
@@ -394,7 +394,9 @@ def review_pr(self, pr_input: str) -> str:
if validation.metrics.get("line_reference_errors", 0) > 0:
from .line_ref_fixer import LineRefFixer

analysis, fixes = LineRefFixer.fix_comment(analysis, pr_diff)
# Use cached parsed diff to avoid re-parsing
cached_parsed = self.get_parsed_diff(owner, repo, pr_number)
analysis, fixes = LineRefFixer.fix_comment(analysis, pr_diff, parsed_diff=cached_parsed)
if fixes and not quiet:
print(
f"🔧 Auto-fixed {len(fixes) // (2 if any(f[1] != f[2] for f in fixes) else 1)} line reference(s)"
@@ -443,7 +445,11 @@ def review_pr(self, pr_input: str) -> str:
if validation.metrics.get("line_reference_errors", 0) > 0:
from .line_ref_fixer import LineRefFixer

analysis, fixes = LineRefFixer.fix_comment(analysis, pr_diff)
# Use cached parsed diff to avoid re-parsing
cached_parsed = self.get_parsed_diff(owner, repo, pr_number)
analysis, fixes = LineRefFixer.fix_comment(
analysis, pr_diff, parsed_diff=cached_parsed
)
if fixes and not quiet:
print(
f"🔧 Auto-fixed {len(fixes) // (2 if any(f[1] != f[2] for f in fixes) else 1)} line reference(s)"
@@ -605,7 +611,8 @@ def review_local_diff(self, diff_spec: str, repo_path: str = ".") -> str:
if validation.metrics.get("line_reference_errors", 0) > 0:
from .line_ref_fixer import LineRefFixer

analysis, fixes = LineRefFixer.fix_comment(analysis, diff_content)
# Reuse already-parsed diff to avoid re-parsing
analysis, fixes = LineRefFixer.fix_comment(analysis, diff_content, parsed_diff=parsed_diff)
if fixes and not quiet:
is_different = [f[1] != f[2] for f in fixes]
divisor = 2 if any(is_different) else 1
79 changes: 63 additions & 16 deletions src/kit/vector_searcher.py
@@ -327,25 +327,72 @@ def __init__(self, repo, embed_fn, backend: Optional[VectorDBBackend] = None, pe
self.chunk_metadatas: List[Dict[str, Any]] = []
self.chunk_embeddings: List[List[float]] = []

def build_index(self, chunk_by: str = "symbols"):
def build_index(self, chunk_by: str = "symbols", parallel: bool = True, max_workers: Optional[int] = None):
"""Build the vector index from repository files.

Args:
chunk_by: Chunking strategy - "symbols" or "lines"
parallel: Whether to process files in parallel (default True)
max_workers: Max parallel workers. Defaults to min(4, cpu_count).
May also be set via the KIT_INDEXER_MAX_WORKERS env var
(consulted only when this argument is None).
"""
self.chunk_metadatas = []
chunk_codes: List[str] = []

for file in self.repo.get_file_tree():
if file["is_dir"]:
continue
path = file["path"]
if chunk_by == "symbols":
chunks = self.repo.chunk_file_by_symbols(path)
for chunk in chunks:
code = chunk["code"]
self.chunk_metadatas.append({"file": path, **chunk})
chunk_codes.append(code)
else:
chunks = self.repo.chunk_file_by_lines(path, max_lines=50)
for code in chunks:
self.chunk_metadatas.append({"file": path, "code": code})
chunk_codes.append(code)
files_to_process = [f["path"] for f in self.repo.get_file_tree() if not f["is_dir"]]

if parallel and len(files_to_process) > 1:
# Parallel processing for better performance on multi-core systems
from concurrent.futures import ThreadPoolExecutor, as_completed

if max_workers is None:
import os as _os

env_workers = _os.environ.get("KIT_INDEXER_MAX_WORKERS")
if env_workers:
try:
max_workers = int(env_workers)
except ValueError:
max_workers = None
if max_workers is None:
cpu_count = _os.cpu_count() or 4
max_workers = min(4, cpu_count)

def process_file(path: str) -> List[Dict[str, Any]]:
"""Process a single file and return its chunks."""
if chunk_by == "symbols":
chunks = self.repo.chunk_file_by_symbols(path)
return [{"file": path, **chunk} for chunk in chunks]
else:
chunks = self.repo.chunk_file_by_lines(path, max_lines=50)
return [{"file": path, "code": code} for code in chunks]

with ThreadPoolExecutor(max_workers=max_workers) as executor:
futures = {executor.submit(process_file, path): path for path in files_to_process}
for future in as_completed(futures):
try:
file_chunks = future.result()
for chunk in file_chunks:
code = chunk.get("code", "")
self.chunk_metadatas.append(chunk)
chunk_codes.append(code)
except Exception:
# Skip files that fail to process
pass
else:
# Sequential processing (fallback or single file)
for path in files_to_process:
if chunk_by == "symbols":
chunks = self.repo.chunk_file_by_symbols(path)
for chunk in chunks:
code = chunk["code"]
self.chunk_metadatas.append({"file": path, **chunk})
chunk_codes.append(code)
else:
chunks = self.repo.chunk_file_by_lines(path, max_lines=50)
for code in chunks:
self.chunk_metadatas.append({"file": path, "code": code})
chunk_codes.append(code)

# Embed in batch (attempt). Fallback to per-item if embed_fn doesn't support list input.
if chunk_codes:
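A hedged sketch of driving the new parallel indexing path. Only the parameters visible in this diff are exercised; `searcher` is assumed to be an already-constructed VectorSearcher instance, since its constructor signature is truncated in the hunk header above.

    import os

    # Optional cap via the environment; build_index reads this only when the
    # max_workers argument is left as None.
    os.environ["KIT_INDEXER_MAX_WORKERS"] = "2"

    searcher.build_index(chunk_by="symbols", parallel=True)   # threaded path
    searcher.build_index(chunk_by="lines", parallel=False)    # sequential fallback
    searcher.build_index(chunk_by="symbols", max_workers=8)   # explicit value wins over the env var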