# Patch summary: make gitingest skip files matched by .gitignore by default.
#
# - Adds `pathspec` as a dependency (pyproject.toml, requirements.txt, and two
#   hook dependency lists in .pre-commit-config.yaml) and documents the new
#   default in README.md.
# - cli.py: adds an `--include-gitignored` flag (off by default) and passes it
#   from main() to _async_main(), which forwards it to its ingestion call
#   (callee name is outside the hunk; presumably ingest_async).
# - entrypoint.py: ingest() and ingest_async() gain `include_gitignored=False`.
#   When it is False, the result of load_gitignore_patterns(query.local_path)
#   is merged into query.ignore_patterns.
# - utils/ignore_patterns.py: new load_gitignore_patterns(root) walks `root`
#   with os.walk, reads each .gitignore as UTF-8, skips blank and `#` lines,
#   prefixes each pattern with the .gitignore's directory relative to `root`,
#   and keeps a leading "!" on negated patterns. Returns a Set[str].
# - utils/ingestion_utils.py: _should_include/_should_exclude replace the
#   fnmatch loop with PathSpec.from_lines("gitwildmatch", ...).match_file(...).
# - tests: adds tests/test_gitignore_feature.py and updates the expectations of
#   three scenarios in tests/test_ingestion.py for the new matcher.
#
# NOTE(review): this copy of the patch appears to have its line breaks and
#   indentation collapsed; regenerate it from the branch before applying.
# NOTE(review): in ingest_async the .gitignore files are loaded before the
#   `if query.url:` branch. If the remote clone happens inside or after that
#   branch (not visible in this hunk), query.local_path will not exist yet and
#   no .gitignore rules are collected for remote sources - confirm ordering.
# NOTE(review): for a .gitignore located at `root`, rel_dir is "." and a
#   pattern starting with "/" is built as os.path.join(".", name) == "./name";
#   verify that pathspec still matches the intended root-level entry.
# NOTE(review): non-anchored patterns from nested .gitignore files become
#   "subdir/pattern"; under gitignore rules a pattern containing a slash is
#   anchored, so "*.log" in sub/.gitignore would presumably stop matching
#   sub/deeper/x.log - TODO confirm and consider "subdir/**/pattern".
# NOTE(review): patterns are accumulated in a set, so file order is lost, while
#   gitignore negation ("!") depends on pattern order - verify negated rules.
# NOTE(review): PathSpec.from_lines(...) is executed on every call to
#   _should_include/_should_exclude; consider compiling the spec once per query.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c8dce118..aa40e20d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -105,6 +105,7 @@ repos: starlette>=0.40.0, tiktoken, tomli, + pathspec, uvicorn>=0.11.7, ] - id: pylint @@ -124,6 +125,7 @@ repos: starlette>=0.40.0, tiktoken, tomli, + pathspec, uvicorn>=0.11.7, ] diff --git a/README.md b/README.md index 90d2d51b..3e29673c 100644 --- a/README.md +++ b/README.md @@ -109,6 +109,9 @@ export GITHUB_TOKEN=github_pat_... gitingest https://github.com/username/private-repo ``` +By default, files listed in `.gitignore` are skipped. Use `--include-gitignored` if you +need those files in the digest. + By default, the digest is written to a text file (`digest.txt`) in your current working directory. You can customize the output in two ways: - Use `--output/-o ` to write to a specific file. diff --git a/pyproject.toml b/pyproject.toml index f6d39290..d71abc98 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,6 +13,7 @@ dependencies = [ "starlette>=0.40.0", # Vulnerable to https://osv.dev/vulnerability/GHSA-f96h-pmfr-66vw "tiktoken>=0.7.0", # Support for o200k_base encoding "tomli", + "pathspec>=0.12.1", "typing_extensions; python_version < '3.10'", "uvicorn>=0.11.7", # Vulnerable to https://osv.dev/vulnerability/PYSEC-2020-150 ] diff --git a/requirements.txt b/requirements.txt index aa8ff03b..bb2956a5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,6 @@ click>=8.0.0 fastapi[standard]>=0.109.1 # Vulnerable to https://osv.dev/vulnerability/PYSEC-2024-38 +pathspec>=0.12.1 pydantic python-dotenv slowapi diff --git a/src/gitingest/cli.py b/src/gitingest/cli.py index fb4e584e..1fb8a785 100644 --- a/src/gitingest/cli.py +++ b/src/gitingest/cli.py @@ -44,6 +44,12 @@ ), ) @click.option("--branch", "-b", default=None, help="Branch to clone and ingest") +@click.option( + "--include-gitignored", + is_flag=True, + default=False, + help="Include files matched by .gitignore", +) 
@click.option( "--token", "-t", @@ -61,6 +67,7 @@ def main( exclude_pattern: Tuple[str, ...], include_pattern: Tuple[str, ...], branch: Optional[str], + include_gitignored: bool, token: Optional[str], ): """ @@ -83,11 +90,12 @@ def main( Glob patterns for including files in the output. branch : str, optional Specific branch to ingest (defaults to the repository's default). + include_gitignored : bool + If provided, include files normally ignored by .gitignore. token: str, optional GitHub personal-access token (PAT). Needed when *source* refers to a **private** repository. Can also be set via the ``GITHUB_TOKEN`` env var. """ - asyncio.run( _async_main( source=source, @@ -96,6 +104,7 @@ def main( exclude_pattern=exclude_pattern, include_pattern=include_pattern, branch=branch, + include_gitignored=include_gitignored, token=token, ) ) @@ -108,6 +117,7 @@ async def _async_main( exclude_pattern: Tuple[str, ...], include_pattern: Tuple[str, ...], branch: Optional[str], + include_gitignored: bool, token: Optional[str], ) -> None: """ @@ -132,6 +142,8 @@ async def _async_main( Glob patterns for including files in the output. branch : str, optional Specific branch to ingest (defaults to the repository's default). + include_gitignored : bool + If provided, include files normally ignored by .gitignore. token: str, optional GitHub personal-access token (PAT). Needed when *source* refers to a **private** repository. Can also be set via the ``GITHUB_TOKEN`` env var. 
@@ -160,6 +172,7 @@ async def _async_main( exclude_patterns=exclude_patterns, branch=branch, output=output_target, + include_gitignored=include_gitignored, token=token, ) diff --git a/src/gitingest/entrypoint.py b/src/gitingest/entrypoint.py index f9e65dde..5ccf6a5e 100644 --- a/src/gitingest/entrypoint.py +++ b/src/gitingest/entrypoint.py @@ -11,6 +11,7 @@ from gitingest.config import TMP_BASE_PATH from gitingest.ingestion import ingest_query from gitingest.query_parsing import IngestionQuery, parse_query +from gitingest.utils.ignore_patterns import load_gitignore_patterns async def ingest_async( @@ -19,6 +20,7 @@ async def ingest_async( include_patterns: Optional[Union[str, Set[str]]] = None, exclude_patterns: Optional[Union[str, Set[str]]] = None, branch: Optional[str] = None, + include_gitignored: bool = False, token: Optional[str] = None, output: Optional[str] = None, ) -> Tuple[str, str, str]: @@ -42,6 +44,8 @@ async def ingest_async( Pattern or set of patterns specifying which files to exclude. If `None`, no files are excluded. branch : str, optional The branch to clone and ingest. If `None`, the default branch is used. + include_gitignored : bool + If ``True``, include files ignored by ``.gitignore``. Defaults to ``False``. token : str, optional GitHub personal-access token (PAT). Needed when *source* refers to a **private** repository. Can also be set via the ``GITHUB_TOKEN`` env var. 
@@ -76,6 +80,10 @@ async def ingest_async( token=token, ) + if not include_gitignored: + gitignore_patterns = load_gitignore_patterns(query.local_path) + query.ignore_patterns.update(gitignore_patterns) + if query.url: selected_branch = branch if branch else query.branch # prioritize branch argument query.branch = selected_branch @@ -117,6 +125,7 @@ def ingest( include_patterns: Optional[Union[str, Set[str]]] = None, exclude_patterns: Optional[Union[str, Set[str]]] = None, branch: Optional[str] = None, + include_gitignored: bool = False, token: Optional[str] = None, output: Optional[str] = None, ) -> Tuple[str, str, str]: @@ -140,6 +149,8 @@ def ingest( Pattern or set of patterns specifying which files to exclude. If `None`, no files are excluded. branch : str, optional The branch to clone and ingest. If `None`, the default branch is used. + include_gitignored : bool + If ``True``, include files ignored by ``.gitignore``. Defaults to ``False``. token : str, optional GitHub personal-access token (PAT). Needed when *source* refers to a **private** repository. Can also be set via the ``GITHUB_TOKEN`` env var. @@ -165,6 +176,7 @@ def ingest( include_patterns=include_patterns, exclude_patterns=exclude_patterns, branch=branch, + include_gitignored=include_gitignored, token=token, output=output, ) diff --git a/src/gitingest/utils/ignore_patterns.py b/src/gitingest/utils/ignore_patterns.py index 8928c66d..7f4d6454 100644 --- a/src/gitingest/utils/ignore_patterns.py +++ b/src/gitingest/utils/ignore_patterns.py @@ -1,5 +1,7 @@ """Default ignore patterns for Gitingest.""" +import os +from pathlib import Path from typing import Set DEFAULT_IGNORE_PATTERNS: Set[str] = { @@ -160,3 +162,47 @@ # Gitingest "digest.txt", } + + +def load_gitignore_patterns(root: Path) -> Set[str]: + """ + Recursively load ignore patterns from all .gitignore files under the given root directory. + + Parameters + ---------- + root : Path + The root directory to search for .gitignore files. 
+ + Returns + ------- + Set[str] + A set of ignore patterns extracted from all .gitignore files found under the root directory. + """ + patterns: Set[str] = set() + for dirpath, _, filenames in os.walk(root): + if ".gitignore" not in filenames: + continue + + gitignore_path = Path(dirpath) / ".gitignore" + with gitignore_path.open("r", encoding="utf-8") as f: + for line in f: + stripped = line.strip() + + if not stripped or stripped.startswith("#"): + continue + + negated = stripped.startswith("!") + if negated: + stripped = stripped[1:] + + rel_dir = os.path.relpath(dirpath, root) + if stripped.startswith("/"): + pattern_body = os.path.join(rel_dir, stripped.lstrip("/")) + else: + pattern_body = os.path.join(rel_dir, stripped) if rel_dir != "." else stripped + + pattern_body = pattern_body.replace("\\", "/") + pattern = f"!{pattern_body}" if negated else pattern_body + patterns.add(pattern) + + return patterns diff --git a/src/gitingest/utils/ingestion_utils.py b/src/gitingest/utils/ingestion_utils.py index 9ce2ae72..924490a9 100644 --- a/src/gitingest/utils/ingestion_utils.py +++ b/src/gitingest/utils/ingestion_utils.py @@ -1,9 +1,10 @@ """Utility functions for the ingestion process.""" -from fnmatch import fnmatch from pathlib import Path from typing import Set +from pathspec import PathSpec + def _should_include(path: Path, base_path: Path, include_patterns: Set[str]) -> bool: """ @@ -38,10 +39,8 @@ def _should_include(path: Path, base_path: Path, include_patterns: Set[str]) -> if path.is_dir(): return True - for pattern in include_patterns: - if fnmatch(rel_str, pattern): - return True - return False + spec = PathSpec.from_lines("gitwildmatch", include_patterns) + return spec.match_file(rel_str) def _should_exclude(path: Path, base_path: Path, ignore_patterns: Set[str]) -> bool: @@ -73,7 +72,5 @@ def _should_exclude(path: Path, base_path: Path, ignore_patterns: Set[str]) -> b return True rel_str = str(rel_path) - for pattern in ignore_patterns: - if pattern 
and fnmatch(rel_str, pattern): - return True - return False + spec = PathSpec.from_lines("gitwildmatch", ignore_patterns) + return spec.match_file(rel_str) diff --git a/tests/test_gitignore_feature.py b/tests/test_gitignore_feature.py new file mode 100644 index 00000000..fcbc74f7 --- /dev/null +++ b/tests/test_gitignore_feature.py @@ -0,0 +1,73 @@ +""" +Tests for the gitignore functionality in Gitingest. +""" + +from pathlib import Path + +import pytest + +from gitingest.entrypoint import ingest_async +from gitingest.utils.ignore_patterns import load_gitignore_patterns + + +@pytest.fixture(name="repo_path") +def repo_fixture(tmp_path: Path) -> Path: + """ + Create a temporary repository structure with: + - A .gitignore that excludes 'exclude.txt' + - 'include.txt' (should be processed) + - 'exclude.txt' (should be skipped when gitignore rules are respected) + """ + # Create a .gitignore file that excludes 'exclude.txt' + gitignore_file = tmp_path / ".gitignore" + gitignore_file.write_text("exclude.txt\n") + + # Create a file that should be included + include_file = tmp_path / "include.txt" + include_file.write_text("This file should be included.") + + # Create a file that should be excluded + exclude_file = tmp_path / "exclude.txt" + exclude_file.write_text("This file should be excluded.") + + return tmp_path + + +def test_load_gitignore_patterns(tmp_path: Path): + """ + Test that load_gitignore_patterns() correctly loads patterns from a .gitignore file. 
+ """ + gitignore = tmp_path / ".gitignore" + # Write some sample patterns with a comment line included + gitignore.write_text("exclude.txt\n*.log\n# a comment\n") + + patterns = load_gitignore_patterns(tmp_path) + + # Check that the expected patterns are loaded + assert "exclude.txt" in patterns + assert "*.log" in patterns + # Ensure that comment lines are not added + for pattern in patterns: + assert not pattern.startswith("#") + + +@pytest.mark.asyncio +async def test_ingest_with_gitignore(repo_path: Path): + """ + Integration test for ingest_async() respecting .gitignore rules. + + When ``include_gitignored`` is ``False`` (default), the content of 'exclude.txt' should be omitted. + When ``include_gitignored`` is ``True``, both files should be present. + """ + # Run ingestion with the gitignore functionality enabled. + _, _, content_with_ignore = await ingest_async(source=str(repo_path)) + # 'exclude.txt' should be skipped. + assert "This file should be excluded." not in content_with_ignore + # 'include.txt' should be processed. + assert "This file should be included." in content_with_ignore + + # Run ingestion with the gitignore functionality disabled. + _, _, content_without_ignore = await ingest_async(source=str(repo_path), include_gitignored=True) + # Now both files should be present. + assert "This file should be excluded." in content_without_ignore + assert "This file should be included." 
in content_without_ignore diff --git a/tests/test_ingestion.py b/tests/test_ingestion.py index 3d829b4a..363749fd 100644 --- a/tests/test_ingestion.py +++ b/tests/test_ingestion.py @@ -84,10 +84,10 @@ class PatternScenario(TypedDict): "*/file_dir2.txt", }, "ignore_patterns": {*()}, - "expected_num_files": 3, - "expected_content": {"file1.txt", "file2.py", "dir2/file_dir2.txt"}, - "expected_structure": {"test_repo/", "dir2/"}, - "expected_not_structure": {"src/", "subdir/", "dir1/"}, + "expected_num_files": 4, + "expected_content": {"file1.txt", "file2.py", "dir1/file_dir1.txt", "dir2/file_dir2.txt"}, + "expected_structure": {"test_repo/", "dir1/", "dir2/"}, + "expected_not_structure": {"src/", "subdir/"}, } ), id="include-wildcard-directory", @@ -114,9 +114,10 @@ class PatternScenario(TypedDict): { "include_patterns": {"**/file_dir2.txt", "src/**/*.py"}, "ignore_patterns": {*()}, - "expected_num_files": 2, + "expected_num_files": 3, "expected_content": { "dir2/file_dir2.txt", + "src/subfile2.py", "src/subdir/file_subdir.py", }, "expected_structure": {"test_repo/", "dir2/", "src/", "subdir/"}, @@ -169,12 +170,11 @@ class PatternScenario(TypedDict): { "include_patterns": {*()}, "ignore_patterns": {"src/**/*.py"}, - "expected_num_files": 7, + "expected_num_files": 6, "expected_content": { "file1.txt", "file2.py", "src/subfile1.txt", - "src/subfile2.py", "src/subdir/file_subdir.txt", "dir1/file_dir1.txt", "dir2/file_dir2.txt",