diff --git a/src/kit/code_searcher.py b/src/kit/code_searcher.py
index 32112163..6371b8c0 100644
--- a/src/kit/code_searcher.py
+++ b/src/kit/code_searcher.py
@@ -1,5 +1,7 @@
 from __future__ import annotations
 
+import logging
+import os
 import re
 from dataclasses import dataclass
 from pathlib import Path
@@ -35,16 +37,81 @@ def __init__(self, repo_path: str) -> None:
         self._gitignore_spec = self._load_gitignore()  # Load gitignore spec
 
     def _load_gitignore(self):
-        """Loads .gitignore rules from the repository root."""
-        gitignore_path = self.repo_path / ".gitignore"
-        if gitignore_path.exists():
-            try:
-                with open(gitignore_path, "r", encoding="utf-8") as f:
-                    return pathspec.PathSpec.from_lines("gitwildmatch", f)
-            except Exception as e:
-                # Log this error if logging is set up, or print
-                print(f"Warning: Could not load .gitignore: {e}")
-        return None
+        """Load every .gitignore in the repository tree into one PathSpec.
+
+        Patterns from nested .gitignore files are rewritten relative to the
+        repository root and scoped to the directory that contains them.
+        Files are merged shallowest-first because gitwildmatch matching is
+        "last matching pattern wins", so rules from deeper directories
+        (including ``!`` negations) override those of their parents.
+
+        Returns:
+            A ``pathspec.PathSpec`` built from all readable .gitignore files,
+            or ``None`` when there are no usable patterns.
+        """
+        gitignore_files = []
+        for dirpath, dirnames, filenames in os.walk(self.repo_path):
+            # Prune in place so os.walk never descends into .git at all.
+            if ".git" in dirnames:
+                dirnames.remove(".git")
+            if ".gitignore" in filenames:
+                gitignore_files.append(Path(dirpath) / ".gitignore")
+
+        # Shallowest first: later patterns win, so deeper files take precedence.
+        gitignore_files.sort(key=lambda p: len(p.parts))
+
+        all_patterns = []
+        for gitignore_path in gitignore_files:
+            try:
+                rel_base = gitignore_path.parent.relative_to(self.repo_path)
+            except ValueError:
+                # .gitignore outside the repo (shouldn't happen, but be safe)
+                continue
+
+            try:
+                with open(gitignore_path, "r", encoding="utf-8") as f:
+                    lines = f.readlines()
+            except (OSError, UnicodeDecodeError) as e:
+                # Keep going so one unreadable file does not disable the rest.
+                logging.warning("Could not load %s: %s", gitignore_path, e)
+                continue
+
+            # Empty for the root .gitignore, e.g. "frontend/src" otherwise.
+            prefix = rel_base.as_posix() if rel_base.parts else ""
+            for line in lines:
+                pattern = line.strip()
+                if not pattern or pattern.startswith("#"):
+                    continue
+                if not prefix:
+                    # Root patterns are already relative to the repo root.
+                    all_patterns.append(pattern)
+                    continue
+
+                # "!" has to stay in front of the rewritten pattern to negate.
+                negated = pattern.startswith("!")
+                body = pattern[1:] if negated else pattern
+                if not body.strip("/"):
+                    continue  # a bare "!" or "/" matches nothing
+                if body.startswith("/"):
+                    # Leading slash: anchored to the .gitignore's directory.
+                    scoped = f"{prefix}{body}"
+                elif "/" in body.rstrip("/"):
+                    # Inner slash: also relative to the .gitignore's directory.
+                    scoped = f"{prefix}/{body}"
+                else:
+                    # No slash: matches at any depth below that directory.
+                    scoped = f"{prefix}/**/{body}"
+                all_patterns.append(f"!{scoped}" if negated else scoped)
+
+        if not all_patterns:
+            return None
+
+        try:
+            return pathspec.PathSpec.from_lines("gitwildmatch", all_patterns)
+        except Exception as e:
+            # Best effort: a malformed pattern must not make __init__ fail.
+            logging.warning("Could not compile .gitignore patterns: %s", e)
+            return None
 
     def _should_ignore(self, file: Path) -> bool:
         """Checks if a file should be ignored based on .gitignore rules."""
diff --git a/src/kit/repo_mapper.py b/src/kit/repo_mapper.py
index 8d0dee55..6fd9aa76 100644
--- a/src/kit/repo_mapper.py
+++ b/src/kit/repo_mapper.py
@@ -24,11 +24,85 @@ def __init__(self, repo_path: str) -> None:
         self._gitignore_spec = self._load_gitignore()
 
     def _load_gitignore(self):
-        gitignore_path = self.repo_path / ".gitignore"
-        if gitignore_path.exists():
-            with open(gitignore_path) as f:
-                return pathspec.PathSpec.from_lines("gitwildmatch", f)
-        return None
+        """Load every .gitignore in the repository tree into one PathSpec.
+
+        Patterns from nested .gitignore files are rewritten relative to the
+        repository root and scoped to the directory that contains them.
+        Files are merged shallowest-first because gitwildmatch matching is
+        "last matching pattern wins", so rules from deeper directories
+        (including ``!`` negations) override those of their parents.
+
+        Returns:
+            A ``pathspec.PathSpec`` built from all readable .gitignore files,
+            or ``None`` when there are no usable patterns.
+        """
+        # Imported locally so the method is self-contained.
+        import logging
+        import os
+
+        gitignore_files = []
+        for dirpath, dirnames, filenames in os.walk(self.repo_path):
+            # Prune in place so os.walk never descends into .git at all.
+            if ".git" in dirnames:
+                dirnames.remove(".git")
+            if ".gitignore" in filenames:
+                gitignore_files.append(Path(dirpath) / ".gitignore")
+
+        # Shallowest first: later patterns win, so deeper files take precedence.
+        gitignore_files.sort(key=lambda p: len(p.parts))
+
+        all_patterns = []
+        for gitignore_path in gitignore_files:
+            try:
+                rel_base = gitignore_path.parent.relative_to(self.repo_path)
+            except ValueError:
+                # .gitignore outside the repo (shouldn't happen, but be safe)
+                continue
+
+            try:
+                with open(gitignore_path, "r", encoding="utf-8") as f:
+                    lines = f.readlines()
+            except (OSError, UnicodeDecodeError) as e:
+                # Keep going so one unreadable file does not disable the rest.
+                logging.warning("Could not load %s: %s", gitignore_path, e)
+                continue
+
+            # Empty for the root .gitignore, e.g. "frontend/src" otherwise.
+            prefix = rel_base.as_posix() if rel_base.parts else ""
+            for line in lines:
+                pattern = line.strip()
+                if not pattern or pattern.startswith("#"):
+                    continue
+                if not prefix:
+                    # Root patterns are already relative to the repo root.
+                    all_patterns.append(pattern)
+                    continue
+
+                # "!" has to stay in front of the rewritten pattern to negate.
+                negated = pattern.startswith("!")
+                body = pattern[1:] if negated else pattern
+                if not body.strip("/"):
+                    continue  # a bare "!" or "/" matches nothing
+                if body.startswith("/"):
+                    # Leading slash: anchored to the .gitignore's directory.
+                    scoped = f"{prefix}{body}"
+                elif "/" in body.rstrip("/"):
+                    # Inner slash: also relative to the .gitignore's directory.
+                    scoped = f"{prefix}/{body}"
+                else:
+                    # No slash: matches at any depth below that directory.
+                    scoped = f"{prefix}/**/{body}"
+                all_patterns.append(f"!{scoped}" if negated else scoped)
+
+        if not all_patterns:
+            return None
+
+        try:
+            return pathspec.PathSpec.from_lines("gitwildmatch", all_patterns)
+        except Exception as e:
+            # Best effort: a malformed pattern must not make __init__ fail.
+            logging.warning("Could not compile .gitignore patterns: %s", e)
+            return None
 
     def _should_ignore(self, file: Path) -> bool:
         # Handle potential symlink resolution mismatches
diff --git a/tests/integration/test_humanlayer_repo.py b/tests/integration/test_humanlayer_repo.py
new file mode 100644
index 00000000..712b50cb
--- /dev/null
+++ b/tests/integration/test_humanlayer_repo.py
@@ -0,0 +1,55 @@
+import os
+import subprocess
+from pathlib import Path
+
+import pytest
+
+from kit.repo_mapper import RepoMapper
+
+# Checkout to test against; set HUMANLAYER_REPO to override the default path.
+HUMANLAYER_REPO = Path(os.environ.get("HUMANLAYER_REPO", "/home/selman/dev/humanlayer"))
+
+
+@pytest.mark.integration
+@pytest.mark.skipif(
+    not HUMANLAYER_REPO.exists(),
+    reason="Requires humanlayer repository",
+)
+def test_humanlayer_repo_gitignore():
+    """Integration test: Verify fix works on actual humanlayer repo."""
+
+    # Get git's file count (check=True: a failing git must not look like one file)
+    result = subprocess.run(
+        ["git", "ls-files"],
+        cwd=HUMANLAYER_REPO,
+        capture_output=True,
+        text=True,
+        check=True,
+    )
+    git_files = set(result.stdout.splitlines())
+    git_count = len(git_files)
+    assert git_count > 0, "git ls-files returned no tracked files"
+
+    # Get kit's file count
+    mapper = RepoMapper(str(HUMANLAYER_REPO))
+    tree = mapper.get_file_tree()
+    kit_count = len(tree)
+    kit_paths = {item["path"] for item in tree}
+
+    # Should be approximately equal (within 10% tolerance for build artifacts)
+    tolerance = 0.1
+    assert abs(kit_count - git_count) / git_count < tolerance, (
+        f"Kit returned {kit_count} files, Git tracks {git_count} files"
+    )
+
+    # Should be well under token limit (assuming ~100 chars per file path)
+    estimated_tokens = kit_count * 100
+    assert estimated_tokens < 25000, (
+        f"Estimated {estimated_tokens} tokens (exceeds 25k limit)"
+    )
+
+    # Verify no node_modules files included
+    node_modules_files = [p for p in kit_paths if "node_modules" in p]
+    assert len(node_modules_files) == 0, (
+        f"Found {len(node_modules_files)} node_modules files (should be 0)"
+    )
diff --git a/tests/test_gitignore.py b/tests/test_gitignore.py
new file mode 100644
index 00000000..2583ef03
--- /dev/null
+++ b/tests/test_gitignore.py
@@ -0,0 +1,180 @@
+import tempfile
+from pathlib import Path
+
+from kit.repo_mapper import RepoMapper
+
+
+def test_root_gitignore_only():
+    """Test basic root .gitignore works as before."""
+    with tempfile.TemporaryDirectory() as tmpdir:
+        repo = Path(tmpdir)
+
+        # Create root .gitignore
+        (repo / ".gitignore").write_text("*.pyc\n__pycache__/\n")
+
+        # Create test files
+        (repo / "test.py").touch()
+        (repo / "test.pyc").touch()
+        (repo / "__pycache__").mkdir()
+        (repo / "__pycache__" / "test.pyc").touch()
+
+        mapper = RepoMapper(str(repo))
+        tree = mapper.get_file_tree()
+
+        # Should only include test.py, not .pyc or __pycache__
+        paths = [item["path"] for item in tree]
+        assert "test.py" in paths
+        assert "test.pyc" not in paths
+        assert "__pycache__/test.pyc" not in paths
+
+
+def test_subdirectory_gitignore():
+    """Test subdirectory .gitignore files are respected."""
+    with tempfile.TemporaryDirectory() as tmpdir:
+        repo = Path(tmpdir)
+
+        # Create subdirectory with its own .gitignore
+        subdir = repo / "frontend"
+        subdir.mkdir()
+        (subdir / ".gitignore").write_text("node_modules/\n*.log\n")
+
+        # Create test files
+        (subdir / "app.js").touch()
+        (subdir / "debug.log").touch()
+        node_modules = subdir / "node_modules"
+        node_modules.mkdir()
+        (node_modules / "package.json").touch()
+
+        mapper = RepoMapper(str(repo))
+        tree = mapper.get_file_tree()
+
+        # Should include app.js but not debug.log or node_modules
+        paths = [item["path"] for item in tree]
+        assert "frontend/app.js" in paths
+        assert "frontend/debug.log" not in paths
+        assert "frontend/node_modules/package.json" not in paths
+
+
+def test_nested_gitignore_precedence():
+    """Test deeper .gitignore files override shallower ones."""
+    with tempfile.TemporaryDirectory() as tmpdir:
+        repo = Path(tmpdir)
+
+        # Root .gitignore ignores *.tmp
+        (repo / ".gitignore").write_text("*.tmp\n")
+
+        # Subdirectory .gitignore allows *.tmp (negation)
+        subdir = repo / "special"
+        subdir.mkdir()
+        (subdir / ".gitignore").write_text("!*.tmp\n")
+
+        # Create test files
+        (repo / "root.tmp").touch()
+        (subdir / "special.tmp").touch()
+
+        mapper = RepoMapper(str(repo))
+        tree = mapper.get_file_tree()
+
+        # Root .tmp should be ignored, but special/ .tmp should be included
+        paths = [item["path"] for item in tree]
+        assert "root.tmp" not in paths
+        assert "special/special.tmp" in paths  # Negation pattern
+
+
+def test_multiple_subdirectory_gitignores():
+    """Test multiple subdirectories each with .gitignore files."""
+    with tempfile.TemporaryDirectory() as tmpdir:
+        repo = Path(tmpdir)
+
+        # Frontend with node_modules
+        frontend = repo / "frontend"
+        frontend.mkdir()
+        (frontend / ".gitignore").write_text("node_modules/\n")
+        (frontend / "app.js").touch()
+        fe_nm = frontend / "node_modules"
+        fe_nm.mkdir()
+        (fe_nm / "react.js").touch()
+
+        # Backend with venv
+        backend = repo / "backend"
+        backend.mkdir()
+        (backend / ".gitignore").write_text("venv/\n__pycache__/\n")
+        (backend / "main.py").touch()
+        be_venv = backend / "venv"
+        be_venv.mkdir()
+        (be_venv / "python").touch()
+
+        mapper = RepoMapper(str(repo))
+        tree = mapper.get_file_tree()
+
+        paths = [item["path"] for item in tree]
+
+        # Should include source files
+        assert "frontend/app.js" in paths
+        assert "backend/main.py" in paths
+
+        # Should exclude ignored directories
+        assert "frontend/node_modules/react.js" not in paths
+        assert "backend/venv/python" not in paths
+
+
+def test_no_gitignore_files():
+    """Test repository with no .gitignore files."""
+    with tempfile.TemporaryDirectory() as tmpdir:
+        repo = Path(tmpdir)
+
+        # Create files without .gitignore
+        (repo / "test.py").touch()
+        subdir = repo / "src"
+        subdir.mkdir()
+        (subdir / "main.py").touch()
+
+        mapper = RepoMapper(str(repo))
+        tree = mapper.get_file_tree()
+
+        # All files should be included
+        paths = [item["path"] for item in tree]
+        assert "test.py" in paths
+        assert "src/main.py" in paths
+
+
+def test_subdirectory_pattern_applies_at_any_depth():
+    """Test slash-less nested patterns match at any depth below their directory."""
+    with tempfile.TemporaryDirectory() as tmpdir:
+        repo = Path(tmpdir)
+
+        # frontend/.gitignore ignores *.log everywhere under frontend/
+        nested = repo / "frontend" / "src"
+        nested.mkdir(parents=True)
+        (repo / "frontend" / ".gitignore").write_text("*.log\n")
+        (nested / "deep.log").touch()
+        (nested / "index.js").touch()
+
+        # The same file name outside frontend/ is not covered by that .gitignore
+        (repo / "other.log").touch()
+
+        mapper = RepoMapper(str(repo))
+        paths = [item["path"] for item in mapper.get_file_tree()]
+
+        assert "frontend/src/index.js" in paths
+        assert "frontend/src/deep.log" not in paths
+        assert "other.log" in paths
+
+
+def test_anchored_subdirectory_pattern():
+    """Test a leading-slash pattern only matches next to its .gitignore."""
+    with tempfile.TemporaryDirectory() as tmpdir:
+        repo = Path(tmpdir)
+
+        pkg = repo / "pkg"
+        (pkg / "build").mkdir(parents=True)
+        (pkg / "src" / "build").mkdir(parents=True)
+        (pkg / ".gitignore").write_text("/build/\n")
+        (pkg / "build" / "out.js").touch()
+        (pkg / "src" / "build" / "keep.js").touch()
+
+        mapper = RepoMapper(str(repo))
+        paths = [item["path"] for item in mapper.get_file_tree()]
+
+        assert "pkg/build/out.js" not in paths
+        assert "pkg/src/build/keep.js" in paths