From 2f1bd53ecfa7e296447c8787b4525b92242122ab Mon Sep 17 00:00:00 2001 From: Sendi John Date: Fri, 27 Jun 2025 13:59:10 +0100 Subject: [PATCH 1/9] docs: Fix CLI help text accuracy - Add stdout documentation to --output option help text - Update default filename to 'digest.txt' consistently - Enhance docstrings with comprehensive usage examples - Improve GitHub token documentation with environment variable support - Fix inconsistencies between help text and actual CLI behavior --- current_help.txt | 36 + src/gitingest/cli.py | 27 +- src/static/llm.txt | 4 +- test.txt | 5928 ++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 5989 insertions(+), 6 deletions(-) create mode 100644 current_help.txt create mode 100644 test.txt diff --git a/current_help.txt b/current_help.txt new file mode 100644 index 00000000..0477c796 --- /dev/null +++ b/current_help.txt @@ -0,0 +1,36 @@ +Usage: gitingest [OPTIONS] [SOURCE] + + Main entry point for the CLI. This function is called when the CLI is run as + a script. + + It calls the async main function to run the command. + + Parameters ---------- source : str A directory path or a Git repository + URL. output : str, optional The path where the output file will be + written. If not specified, the output will be written to a file named + `.txt` in the current directory. Use '-' to output to stdout. + max_size : int Maximum file size (in bytes) to consider. exclude_pattern + : Tuple[str, ...] Glob patterns for pruning the file set. + include_pattern : Tuple[str, ...] Glob patterns for including files in + the output. branch : str, optional Specific branch to ingest (defaults + to the repository's default). include_gitignored : bool If provided, + include files normally ignored by .gitignore. token: str, optional + GitHub personal-access token (PAT). Needed when *source* refers to a + **private** repository. Can also be set via the ``GITHUB_TOKEN`` env var. + +Options: + -o, --output TEXT Output file path (default: .txt in + current directory) + -s, --max-size INTEGER Maximum file size to process in bytes + -e, --exclude-pattern TEXT Patterns to exclude. Handles Python's arbitrary + subset of Unix shell-style wildcards. See: + https://docs.python.org/3/library/fnmatch.html + -i, --include-pattern TEXT Patterns to include. Handles Python's arbitrary + subset of Unix shell-style wildcards. See: + https://docs.python.org/3/library/fnmatch.html + -b, --branch TEXT Branch to clone and ingest + --include-gitignored Include files matched by .gitignore + -t, --token TEXT GitHub personal access token for accessing + private repositories. If omitted, the CLI will + look for the GITHUB_TOKEN environment variable. + --help Show this message and exit. diff --git a/src/gitingest/cli.py b/src/gitingest/cli.py index 1fb8a785..8573acf6 100644 --- a/src/gitingest/cli.py +++ b/src/gitingest/cli.py @@ -17,7 +17,7 @@ "--output", "-o", default=None, - help="Output file path (default: .txt in current directory)", + help="Output file path (default: digest.txt in current directory). Use '-' for stdout.", ) @click.option( "--max-size", @@ -81,7 +81,7 @@ def main( A directory path or a Git repository URL. output : str, optional The path where the output file will be written. If not specified, the output will be written - to a file named `.txt` in the current directory. Use '-' to output to stdout. + to a file named `digest.txt` in the current directory. Use '-' to output to stdout. max_size : int Maximum file size (in bytes) to consider. exclude_pattern : Tuple[str, ...] 
@@ -95,6 +95,25 @@ def main( token: str, optional GitHub personal-access token (PAT). Needed when *source* refers to a **private** repository. Can also be set via the ``GITHUB_TOKEN`` env var. + + Examples + -------- + Basic usage: + $ gitingest . + $ gitingest /path/to/repo + $ gitingest https://github.com/user/repo + + Output to stdout: + $ gitingest . -o - + $ gitingest https://github.com/user/repo --output - + + With filtering: + $ gitingest . -i "*.py" -e "*.log" + $ gitingest . --include-pattern "*.js" --exclude-pattern "node_modules/*" + + Private repositories: + $ gitingest https://github.com/user/private-repo -t ghp_token + $ GITHUB_TOKEN=ghp_token gitingest https://github.com/user/private-repo """ asyncio.run( _async_main( @@ -133,7 +152,7 @@ async def _async_main( A directory path or a Git repository URL. output : str, optional The path where the output file will be written. If not specified, the output will be written - to a file named `.txt` in the current directory. Use '-' to output to stdout. + to a file named `digest.txt` in the current directory. Use '-' to output to stdout. max_size : int Maximum file size (in bytes) to consider. exclude_pattern : Tuple[str, ...] @@ -193,4 +212,4 @@ async def _async_main( if __name__ == "__main__": - main() + main() \ No newline at end of file diff --git a/src/static/llm.txt b/src/static/llm.txt index a307e2e8..bf09c404 100644 --- a/src/static/llm.txt +++ b/src/static/llm.txt @@ -176,7 +176,7 @@ gitingest https://github.com/user/private-repo -t $GITHUB_TOKEN -o - # Specific branch analysis (short flag) gitingest https://github.com/user/repo -b main -o - -# Save to file (default: .txt in current directory) +# Save to file (default: digest.txt in current directory) gitingest https://github.com/user/repo -o my_analysis.txt # Ultra-concise example for small files only @@ -184,7 +184,7 @@ gitingest https://github.com/user/repo -i "*.py" -s 51200 -o - ``` **Key Parameters for AI Agents**: -- `-o` / `--output`: Stream to STDOUT with `-` (default saves to `.txt`) +- `-o` / `--output`: Stream to STDOUT with `-` (default saves to `digest.txt`) - `-s` / `--max-size`: Maximum file size in bytes to process (default: no limit) - `-i` / `--include-pattern`: Include files matching Unix shell-style wildcards - `-e` / `--exclude-pattern`: Exclude files matching Unix shell-style wildcards diff --git a/test.txt b/test.txt new file mode 100644 index 00000000..0e2d7c2c --- /dev/null +++ b/test.txt @@ -0,0 +1,5928 @@ +Directory structure: +└── gitingest/ + ├── src/ + │ ├── gitingest/ + │ │ ├── __init__.py + │ │ ├── cli.py + │ │ ├── cloning.py + │ │ ├── config.py + │ │ ├── entrypoint.py + │ │ ├── ingestion.py + │ │ ├── output_formatters.py + │ │ ├── query_parsing.py + │ │ ├── schemas/ + │ │ │ ├── __init__.py + │ │ │ ├── filesystem_schema.py + │ │ │ └── ingestion_schema.py + │ │ └── utils/ + │ │ ├── __init__.py + │ │ ├── exceptions.py + │ │ ├── file_utils.py + │ │ ├── git_utils.py + │ │ ├── ignore_patterns.py + │ │ ├── ingestion_utils.py + │ │ ├── notebook_utils.py + │ │ ├── os_utils.py + │ │ ├── path_utils.py + │ │ ├── query_parser_utils.py + │ │ └── timeout_wrapper.py + │ └── server/ + │ ├── __init__.py + │ ├── main.py + │ ├── query_processor.py + │ ├── server_config.py + │ ├── server_utils.py + │ └── routers/ + │ ├── __init__.py + │ ├── download.py + │ ├── dynamic.py + │ └── index.py + └── tests/ + ├── __init__.py + ├── conftest.py + ├── test_cli.py + ├── test_flow_integration.py + ├── test_git_utils.py + ├── test_gitignore_feature.py + ├── test_ingestion.py 
+ ├── test_notebook_utils.py + ├── test_repository_clone.py + └── query_parser/ + ├── __init__.py + ├── test_git_host_agnostic.py + └── test_query_parser.py + +================================================ +FILE: src/gitingest/__init__.py +================================================ +"""Gitingest: A package for ingesting data from Git repositories.""" + +from gitingest.cloning import clone_repo +from gitingest.entrypoint import ingest, ingest_async +from gitingest.ingestion import ingest_query +from gitingest.query_parsing import parse_query + +__all__ = ["ingest_query", "clone_repo", "parse_query", "ingest", "ingest_async"] + + + +================================================ +FILE: src/gitingest/cli.py +================================================ +"""Command-line interface for the Gitingest package.""" + +# pylint: disable=no-value-for-parameter + +import asyncio +from typing import Optional, Tuple + +import click + +from gitingest.config import MAX_FILE_SIZE, OUTPUT_FILE_NAME +from gitingest.entrypoint import ingest_async + + +@click.command() +@click.argument("source", type=str, default=".") +@click.option( + "--output", + "-o", + default=None, + help="Output file path (default: digest.txt in current directory). Use '-' for stdout.", +) +@click.option( + "--max-size", + "-s", + default=MAX_FILE_SIZE, + help="Maximum file size to process in bytes", +) +@click.option( + "--exclude-pattern", + "-e", + multiple=True, + help=( + "Patterns to exclude. Handles Python's arbitrary subset of Unix shell-style " + "wildcards. See: https://docs.python.org/3/library/fnmatch.html" + ), +) +@click.option( + "--include-pattern", + "-i", + multiple=True, + help=( + "Patterns to include. Handles Python's arbitrary subset of Unix shell-style " + "wildcards. See: https://docs.python.org/3/library/fnmatch.html" + ), +) +@click.option("--branch", "-b", default=None, help="Branch to clone and ingest") +@click.option( + "--include-gitignored", + is_flag=True, + default=False, + help="Include files matched by .gitignore", +) +@click.option( + "--token", + "-t", + envvar="GITHUB_TOKEN", + default=None, + help=( + "GitHub personal access token for accessing private repositories. " + "If omitted, the CLI will look for the GITHUB_TOKEN environment variable." + ), +) +def main( + source: str, + output: Optional[str], + max_size: int, + exclude_pattern: Tuple[str, ...], + include_pattern: Tuple[str, ...], + branch: Optional[str], + include_gitignored: bool, + token: Optional[str], +): + """ + Main entry point for the CLI. This function is called when the CLI is run as a script. + + It calls the async main function to run the command. + + Parameters + ---------- + source : str + A directory path or a Git repository URL. + output : str, optional + The path where the output file will be written. If not specified, the output will be written + to a file named `digest.txt` in the current directory. Use '-' to output to stdout. + max_size : int + Maximum file size (in bytes) to consider. + exclude_pattern : Tuple[str, ...] + Glob patterns for pruning the file set. + include_pattern : Tuple[str, ...] + Glob patterns for including files in the output. + branch : str, optional + Specific branch to ingest (defaults to the repository's default). + include_gitignored : bool + If provided, include files normally ignored by .gitignore. + token: str, optional + GitHub personal-access token (PAT). Needed when *source* refers to a + **private** repository. Can also be set via the ``GITHUB_TOKEN`` env var. 
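+
+    Notes
+    -----
+    The ``--token`` option falls back to the ``GITHUB_TOKEN`` environment variable
+    (wired up via click's ``envvar``), and any runtime error is converted into
+    ``click.Abort`` so the process exits with a non-zero status.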
+ + Examples + -------- + Basic usage: + $ gitingest . + $ gitingest /path/to/repo + $ gitingest https://github.com/user/repo + + Output to stdout: + $ gitingest . -o - + $ gitingest https://github.com/user/repo --output - + + With filtering: + $ gitingest . -i "*.py" -e "*.log" + $ gitingest . --include-pattern "*.js" --exclude-pattern "node_modules/*" + + Private repositories: + $ gitingest https://github.com/user/private-repo -t ghp_token + $ GITHUB_TOKEN=ghp_token gitingest https://github.com/user/private-repo + """ + asyncio.run( + _async_main( + source=source, + output=output, + max_size=max_size, + exclude_pattern=exclude_pattern, + include_pattern=include_pattern, + branch=branch, + include_gitignored=include_gitignored, + token=token, + ) + ) + + +async def _async_main( + source: str, + output: Optional[str], + max_size: int, + exclude_pattern: Tuple[str, ...], + include_pattern: Tuple[str, ...], + branch: Optional[str], + include_gitignored: bool, + token: Optional[str], +) -> None: + """ + Analyze a directory or repository and create a text dump of its contents. + + This command analyzes the contents of a specified source directory or repository, applies custom include and + exclude patterns, and generates a text summary of the analysis which is then written to an output file + or printed to stdout. + + Parameters + ---------- + source : str + A directory path or a Git repository URL. + output : str, optional + The path where the output file will be written. If not specified, the output will be written + to a file named `digest.txt` in the current directory. Use '-' to output to stdout. + max_size : int + Maximum file size (in bytes) to consider. + exclude_pattern : Tuple[str, ...] + Glob patterns for pruning the file set. + include_pattern : Tuple[str, ...] + Glob patterns for including files in the output. + branch : str, optional + Specific branch to ingest (defaults to the repository's default). + include_gitignored : bool + If provided, include files normally ignored by .gitignore. + token: str, optional + GitHub personal-access token (PAT). Needed when *source* refers to a + **private** repository. Can also be set via the ``GITHUB_TOKEN`` env var. + + Raises + ------ + Abort + If there is an error during the execution of the command, this exception is raised to abort the process. + """ + try: + # Normalise pattern containers (the ingest layer expects sets) + exclude_patterns = set(exclude_pattern) + include_patterns = set(include_pattern) + + output_target = output if output is not None else OUTPUT_FILE_NAME + + if output_target == "-": + click.echo("Analyzing source, preparing output for stdout...", err=True) + else: + click.echo(f"Analyzing source, output will be written to '{output_target}'...", err=True) + + summary, _, _ = await ingest_async( + source=source, + max_file_size=max_size, + include_patterns=include_patterns, + exclude_patterns=exclude_patterns, + branch=branch, + output=output_target, + include_gitignored=include_gitignored, + token=token, + ) + + if output_target == "-": # stdout + click.echo("\n--- Summary ---", err=True) + click.echo(summary, err=True) + click.echo("--- End Summary ---", err=True) + click.echo("Analysis complete! Output sent to stdout.", err=True) + else: # file + click.echo(f"Analysis complete! 
Output written to: {output_target}")
+        click.echo("\nSummary:")
+        click.echo(summary)
+
+    except Exception as exc:
+        # Convert any exception into Click.Abort so that exit status is non-zero
+        click.echo(f"Error: {exc}", err=True)
+        raise click.Abort() from exc
+
+
+if __name__ == "__main__":
+    main()
+
+
+================================================
+FILE: src/gitingest/cloning.py
+================================================
+"""This module contains functions for cloning a Git repository to a local path."""
+
+from pathlib import Path
+from typing import Optional
+
+from gitingest.config import DEFAULT_TIMEOUT
+from gitingest.schemas import CloneConfig
+from gitingest.utils.git_utils import (
+    check_repo_exists,
+    create_git_auth_header,
+    create_git_command,
+    ensure_git_installed,
+    run_command,
+    validate_github_token,
+)
+from gitingest.utils.os_utils import ensure_directory
+from gitingest.utils.timeout_wrapper import async_timeout
+
+
+@async_timeout(DEFAULT_TIMEOUT)
+async def clone_repo(config: CloneConfig, token: Optional[str] = None) -> None:
+    """
+    Clone a repository to a local path based on the provided configuration.
+
+    This function handles the process of cloning a Git repository to the local file system.
+    It can clone a specific branch or commit if provided, and it raises exceptions if
+    any errors occur during the cloning process.
+
+    Parameters
+    ----------
+    config : CloneConfig
+        The configuration for cloning the repository.
+    token : str, optional
+        GitHub personal-access token (PAT). Needed when *source* refers to a
+        **private** repository. Can also be set via the ``GITHUB_TOKEN`` env var.
+        Must start with 'github_pat_' or 'ghp_' for GitHub repositories.
+
+    Raises
+    ------
+    ValueError
+        If the repository is not found, if the provided URL is invalid, or if the token format is invalid.
+    """
+    # Extract and validate query parameters
+    url: str = config.url
+    local_path: str = config.local_path
+    commit: Optional[str] = config.commit
+    branch: Optional[str] = config.branch
+    partial_clone: bool = config.subpath != "/"
+
+    # Validate token if provided
+    if token and url.startswith("https://github.com"):
+        validate_github_token(token)
+
+    # Create parent directory if it doesn't exist
+    await ensure_directory(Path(local_path).parent)
+
+    # Check if the repository exists
+    if not await check_repo_exists(url, token=token):
+        raise ValueError("Repository not found. Make sure it is public or that you have provided a valid token.")
+
+    clone_cmd = ["git"]
+    if token and url.startswith("https://github.com"):
+        clone_cmd += ["-c", create_git_auth_header(token)]
+
+    clone_cmd += ["clone", "--single-branch"]
+    # TODO: Re-enable --recurse-submodules when submodule support is needed
+
+    if partial_clone:
+        clone_cmd += ["--filter=blob:none", "--sparse"]
+
+    if not commit:
+        clone_cmd += ["--depth=1"]
+        if branch and branch.lower() not in ("main", "master"):
+            clone_cmd += ["--branch", branch]
+
+    clone_cmd += [url, local_path]
+
+    # Clone the repository
+    await ensure_git_installed()
+    await run_command(*clone_cmd)
+
+    # Checkout the subpath if it is a partial clone
+    if partial_clone:
+        subpath = config.subpath.lstrip("/")
+        if config.blob:
+            # When ingesting from a file url (blob/branch/path/file.txt), we need to remove the file name.
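+            # e.g. a blob subpath "src/cli.py" becomes "src" (illustrative path), since sparse-checkout targets directories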
+ subpath = str(Path(subpath).parent.as_posix()) + + checkout_cmd = create_git_command(["git"], local_path, url, token) + await run_command(*checkout_cmd, "sparse-checkout", "set", subpath) + + # Checkout the commit if it is provided + if commit: + checkout_cmd = create_git_command(["git"], local_path, url, token) + await run_command(*checkout_cmd, "checkout", commit) + + + +================================================ +FILE: src/gitingest/config.py +================================================ +"""Configuration file for the project.""" + +import tempfile +from pathlib import Path + +MAX_FILE_SIZE = 10 * 1024 * 1024 # 10 MB +MAX_DIRECTORY_DEPTH = 20 # Maximum depth of directory traversal +MAX_FILES = 10_000 # Maximum number of files to process +MAX_TOTAL_SIZE_BYTES = 500 * 1024 * 1024 # 500 MB +DEFAULT_TIMEOUT = 60 # seconds + +OUTPUT_FILE_NAME = "digest.txt" + +TMP_BASE_PATH = Path(tempfile.gettempdir()) / "gitingest" + + + +================================================ +FILE: src/gitingest/entrypoint.py +================================================ +"""Main entry point for ingesting a source and processing its contents.""" + +import asyncio +import inspect +import os +import shutil +import sys +from typing import Optional, Set, Tuple, Union + +from gitingest.cloning import clone_repo +from gitingest.config import TMP_BASE_PATH +from gitingest.ingestion import ingest_query +from gitingest.query_parsing import IngestionQuery, parse_query +from gitingest.utils.ignore_patterns import load_gitignore_patterns + + +async def ingest_async( + source: str, + max_file_size: int = 10 * 1024 * 1024, # 10 MB + include_patterns: Optional[Union[str, Set[str]]] = None, + exclude_patterns: Optional[Union[str, Set[str]]] = None, + branch: Optional[str] = None, + include_gitignored: bool = False, + token: Optional[str] = None, + output: Optional[str] = None, +) -> Tuple[str, str, str]: + """ + Main entry point for ingesting a source and processing its contents. + + This function analyzes a source (URL or local path), clones the corresponding repository (if applicable), + and processes its files according to the specified query parameters. It returns a summary, a tree-like + structure of the files, and the content of the files. The results can optionally be written to an output file. + + Parameters + ---------- + source : str + The source to analyze, which can be a URL (for a Git repository) or a local directory path. + max_file_size : int + Maximum allowed file size for file ingestion. Files larger than this size are ignored, by default + 10*1024*1024 (10 MB). + include_patterns : Union[str, Set[str]], optional + Pattern or set of patterns specifying which files to include. If `None`, all files are included. + exclude_patterns : Union[str, Set[str]], optional + Pattern or set of patterns specifying which files to exclude. If `None`, no files are excluded. + branch : str, optional + The branch to clone and ingest. If `None`, the default branch is used. + include_gitignored : bool + If ``True``, include files ignored by ``.gitignore``. Defaults to ``False``. + token : str, optional + GitHub personal-access token (PAT). Needed when *source* refers to a + **private** repository. Can also be set via the ``GITHUB_TOKEN`` env var. + output : str, optional + File path where the summary and content should be written. If `None`, the results are not written to a file. + + Returns + ------- + Tuple[str, str, str] + A tuple containing: + - A summary string of the analyzed repository or directory. 
+ - A tree-like string representation of the file structure. + - The content of the files in the repository or directory. + + Raises + ------ + TypeError + If `clone_repo` does not return a coroutine, or if the `source` is of an unsupported type. + """ + repo_cloned = False + + if not token: + token = os.getenv("GITHUB_TOKEN") + + try: + query: IngestionQuery = await parse_query( + source=source, + max_file_size=max_file_size, + from_web=False, + include_patterns=include_patterns, + ignore_patterns=exclude_patterns, + token=token, + ) + + if not include_gitignored: + gitignore_patterns = load_gitignore_patterns(query.local_path) + query.ignore_patterns.update(gitignore_patterns) + + if query.url: + selected_branch = branch if branch else query.branch # prioritize branch argument + query.branch = selected_branch + + clone_config = query.extract_clone_config() + clone_coroutine = clone_repo(clone_config, token=token) + + if inspect.iscoroutine(clone_coroutine): + if asyncio.get_event_loop().is_running(): + await clone_coroutine + else: + asyncio.run(clone_coroutine) + else: + raise TypeError("clone_repo did not return a coroutine as expected.") + + repo_cloned = True + + summary, tree, content = ingest_query(query) + + if output == "-": + loop = asyncio.get_running_loop() + output_data = tree + "\n" + content + await loop.run_in_executor(None, sys.stdout.write, output_data) + await loop.run_in_executor(None, sys.stdout.flush) + elif output is not None: + with open(output, "w", encoding="utf-8") as f: + f.write(tree + "\n" + content) + + return summary, tree, content + finally: + # Clean up the temporary directory if it was created + if repo_cloned: + shutil.rmtree(TMP_BASE_PATH, ignore_errors=True) + + +def ingest( + source: str, + max_file_size: int = 10 * 1024 * 1024, # 10 MB + include_patterns: Optional[Union[str, Set[str]]] = None, + exclude_patterns: Optional[Union[str, Set[str]]] = None, + branch: Optional[str] = None, + include_gitignored: bool = False, + token: Optional[str] = None, + output: Optional[str] = None, +) -> Tuple[str, str, str]: + """ + Synchronous version of ingest_async. + + This function analyzes a source (URL or local path), clones the corresponding repository (if applicable), + and processes its files according to the specified query parameters. It returns a summary, a tree-like + structure of the files, and the content of the files. The results can optionally be written to an output file. + + Parameters + ---------- + source : str + The source to analyze, which can be a URL (for a Git repository) or a local directory path. + max_file_size : int + Maximum allowed file size for file ingestion. Files larger than this size are ignored, by default + 10*1024*1024 (10 MB). + include_patterns : Union[str, Set[str]], optional + Pattern or set of patterns specifying which files to include. If `None`, all files are included. + exclude_patterns : Union[str, Set[str]], optional + Pattern or set of patterns specifying which files to exclude. If `None`, no files are excluded. + branch : str, optional + The branch to clone and ingest. If `None`, the default branch is used. + include_gitignored : bool + If ``True``, include files ignored by ``.gitignore``. Defaults to ``False``. + token : str, optional + GitHub personal-access token (PAT). Needed when *source* refers to a + **private** repository. Can also be set via the ``GITHUB_TOKEN`` env var. + output : str, optional + File path where the summary and content should be written. If `None`, the results are not written to a file. 
+ + Returns + ------- + Tuple[str, str, str] + A tuple containing: + - A summary string of the analyzed repository or directory. + - A tree-like string representation of the file structure. + - The content of the files in the repository or directory. + + See Also + -------- + ingest_async : The asynchronous version of this function. + """ + return asyncio.run( + ingest_async( + source=source, + max_file_size=max_file_size, + include_patterns=include_patterns, + exclude_patterns=exclude_patterns, + branch=branch, + include_gitignored=include_gitignored, + token=token, + output=output, + ) + ) + + + +================================================ +FILE: src/gitingest/ingestion.py +================================================ +"""Functions to ingest and analyze a codebase directory or single file.""" + +import warnings +from pathlib import Path +from typing import Tuple + +from gitingest.config import MAX_DIRECTORY_DEPTH, MAX_FILES, MAX_TOTAL_SIZE_BYTES +from gitingest.output_formatters import format_node +from gitingest.query_parsing import IngestionQuery +from gitingest.schemas import FileSystemNode, FileSystemNodeType, FileSystemStats +from gitingest.utils.ingestion_utils import _should_exclude, _should_include + +try: + import tomllib # type: ignore[import] +except ImportError: + import tomli as tomllib + + +def ingest_query(query: IngestionQuery) -> Tuple[str, str, str]: + """ + Run the ingestion process for a parsed query. + + This is the main entry point for analyzing a codebase directory or single file. It processes the query + parameters, reads the file or directory content, and generates a summary, directory structure, and file content, + along with token estimations. + + Parameters + ---------- + query : IngestionQuery + The parsed query object containing information about the repository and query parameters. + + Returns + ------- + Tuple[str, str, str] + A tuple containing the summary, directory structure, and file contents. + + Raises + ------ + ValueError + If the path cannot be found, is not a file, or the file has no content. + """ + subpath = Path(query.subpath.strip("/")).as_posix() + path = query.local_path / subpath + + apply_gitingest_file(path, query) + + if not path.exists(): + raise ValueError(f"{query.slug} cannot be found") + + if (query.type and query.type == "blob") or query.local_path.is_file(): + # TODO: We do this wrong! We should still check the branch and commit! + if not path.is_file(): + raise ValueError(f"Path {path} is not a file") + + relative_path = path.relative_to(query.local_path) + + file_node = FileSystemNode( + name=path.name, + type=FileSystemNodeType.FILE, + size=path.stat().st_size, + file_count=1, + path_str=str(relative_path), + path=path, + ) + + if not file_node.content: + raise ValueError(f"File {file_node.name} has no content") + + return format_node(file_node, query) + + root_node = FileSystemNode( + name=path.name, + type=FileSystemNodeType.DIRECTORY, + path_str=str(path.relative_to(query.local_path)), + path=path, + ) + + stats = FileSystemStats() + + _process_node( + node=root_node, + query=query, + stats=stats, + ) + + return format_node(root_node, query) + + +def apply_gitingest_file(path: Path, query: IngestionQuery) -> None: + """ + Apply the .gitingest file to the query object. + + This function reads the .gitingest file in the specified path and updates the query object with the ignore + patterns found in the file. + + Parameters + ---------- + path : Path + The path of the directory to ingest. 
+ query : IngestionQuery + The parsed query object containing information about the repository and query parameters. + It should have an attribute `ignore_patterns` which is either None or a set of strings. + """ + path_gitingest = path / ".gitingest" + + if not path_gitingest.is_file(): + return + + try: + with path_gitingest.open("rb") as f: + data = tomllib.load(f) + except tomllib.TOMLDecodeError as exc: + warnings.warn(f"Invalid TOML in {path_gitingest}: {exc}", UserWarning) + return + + config_section = data.get("config", {}) + ignore_patterns = config_section.get("ignore_patterns") + + if not ignore_patterns: + return + + # If a single string is provided, make it a list of one element + if isinstance(ignore_patterns, str): + ignore_patterns = [ignore_patterns] + + if not isinstance(ignore_patterns, (list, set)): + warnings.warn( + f"Expected a list/set for 'ignore_patterns', got {type(ignore_patterns)} in {path_gitingest}. Skipping.", + UserWarning, + ) + return + + # Filter out duplicated patterns + ignore_patterns = set(ignore_patterns) + + # Filter out any non-string entries + valid_patterns = {pattern for pattern in ignore_patterns if isinstance(pattern, str)} + invalid_patterns = ignore_patterns - valid_patterns + + if invalid_patterns: + warnings.warn(f"Ignore patterns {invalid_patterns} are not strings. Skipping.", UserWarning) + + if not valid_patterns: + return + + if query.ignore_patterns is None: + query.ignore_patterns = valid_patterns + else: + query.ignore_patterns.update(valid_patterns) + + return + + +def _process_node( + node: FileSystemNode, + query: IngestionQuery, + stats: FileSystemStats, +) -> None: + """ + Process a file or directory item within a directory. + + This function handles each file or directory item, checking if it should be included or excluded based on the + provided patterns. It handles symlinks, directories, and files accordingly. + + Parameters + ---------- + node : FileSystemNode + The current directory or file node being processed. + query : IngestionQuery + The parsed query object containing information about the repository and query parameters. + stats : FileSystemStats + Statistics tracking object for the total file count and size. 
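+
+    Notes
+    -----
+    Traversal stops early once ``limit_exceeded`` reports that ``MAX_DIRECTORY_DEPTH``,
+    ``MAX_FILES``, or ``MAX_TOTAL_SIZE_BYTES`` (from ``gitingest.config``) has been reached.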
+    """
+
+    if limit_exceeded(stats, node.depth):
+        return
+
+    for sub_path in node.path.iterdir():
+
+        if query.ignore_patterns and _should_exclude(sub_path, query.local_path, query.ignore_patterns):
+            continue
+
+        if query.include_patterns and not _should_include(sub_path, query.local_path, query.include_patterns):
+            continue
+
+        if sub_path.is_symlink():
+            _process_symlink(path=sub_path, parent_node=node, stats=stats, local_path=query.local_path)
+        elif sub_path.is_file():
+            _process_file(path=sub_path, parent_node=node, stats=stats, local_path=query.local_path)
+        elif sub_path.is_dir():
+
+            child_directory_node = FileSystemNode(
+                name=sub_path.name,
+                type=FileSystemNodeType.DIRECTORY,
+                path_str=str(sub_path.relative_to(query.local_path)),
+                path=sub_path,
+                depth=node.depth + 1,
+            )
+
+            _process_node(
+                node=child_directory_node,
+                query=query,
+                stats=stats,
+            )
+
+            if not child_directory_node.children:
+                continue
+
+            node.children.append(child_directory_node)
+            node.size += child_directory_node.size
+            node.file_count += child_directory_node.file_count
+            node.dir_count += 1 + child_directory_node.dir_count
+        else:
+            print(f"Warning: {sub_path} is an unknown file type, skipping")
+
+    node.sort_children()
+
+
+def _process_symlink(path: Path, parent_node: FileSystemNode, stats: FileSystemStats, local_path: Path) -> None:
+    """
+    Process a symlink in the file system.
+
+    This function checks the symlink's target.
+
+    Parameters
+    ----------
+    path : Path
+        The full path of the symlink.
+    parent_node : FileSystemNode
+        The parent directory node.
+    stats : FileSystemStats
+        Statistics tracking object for the total file count and size.
+    local_path : Path
+        The base path of the repository or directory being processed.
+    """
+    child = FileSystemNode(
+        name=path.name,
+        type=FileSystemNodeType.SYMLINK,
+        path_str=str(path.relative_to(local_path)),
+        path=path,
+        depth=parent_node.depth + 1,
+    )
+    stats.total_files += 1
+    parent_node.children.append(child)
+    parent_node.file_count += 1
+
+
+def _process_file(path: Path, parent_node: FileSystemNode, stats: FileSystemStats, local_path: Path) -> None:
+    """
+    Process a file in the file system.
+
+    This function checks the file's size, increments the statistics, and reads its content.
+    If adding the file would exceed a traversal limit, the file is skipped.
+
+    Parameters
+    ----------
+    path : Path
+        The full path of the file.
+    parent_node : FileSystemNode
+        The parent directory node that accumulates the results.
+    stats : FileSystemStats
+        Statistics tracking object for the total file count and size.
+    local_path : Path
+        The base path of the repository or directory being processed.
+    """
+    file_size = path.stat().st_size
+    if stats.total_size + file_size > MAX_TOTAL_SIZE_BYTES:
+        print(f"Skipping file {path}: would exceed total size limit")
+        return
+
+    stats.total_files += 1
+    stats.total_size += file_size
+
+    if stats.total_files > MAX_FILES:
+        print(f"Maximum file limit ({MAX_FILES}) reached")
+        return
+
+    child = FileSystemNode(
+        name=path.name,
+        type=FileSystemNodeType.FILE,
+        size=file_size,
+        file_count=1,
+        path_str=str(path.relative_to(local_path)),
+        path=path,
+        depth=parent_node.depth + 1,
+    )
+
+    parent_node.children.append(child)
+    parent_node.size += file_size
+    parent_node.file_count += 1
+
+
+def limit_exceeded(stats: FileSystemStats, depth: int) -> bool:
+    """
+    Check if any of the traversal limits have been exceeded.
+
+    This function checks if the current traversal has exceeded any of the configured limits:
+    maximum directory depth, maximum number of files, or maximum total size in bytes.
+
+    Parameters
+    ----------
+    stats : FileSystemStats
+        Statistics tracking object for the total file count and size.
+    depth : int
+        The current depth of directory traversal.
+
+    Returns
+    -------
+    bool
+        True if any limit has been exceeded, False otherwise.
+    """
+    if depth > MAX_DIRECTORY_DEPTH:
+        print(f"Maximum depth limit ({MAX_DIRECTORY_DEPTH}) reached")
+        return True
+
+    if stats.total_files >= MAX_FILES:
+        print(f"Maximum file limit ({MAX_FILES}) reached")
+        return True  # TODO: end recursion
+
+    if stats.total_size >= MAX_TOTAL_SIZE_BYTES:
+        print(f"Maximum total size limit ({MAX_TOTAL_SIZE_BYTES/1024/1024:.1f}MB) reached")
+        return True  # TODO: end recursion
+
+    return False
+
+
+
+================================================
+FILE: src/gitingest/output_formatters.py
+================================================
+"""Functions to ingest and analyze a codebase directory or single file."""
+
+from typing import Optional, Tuple
+
+import tiktoken
+
+from gitingest.query_parsing import IngestionQuery
+from gitingest.schemas import FileSystemNode, FileSystemNodeType
+
+
+def format_node(node: FileSystemNode, query: IngestionQuery) -> Tuple[str, str, str]:
+    """
+    Generate a summary, directory structure, and file contents for a given file system node.
+
+    If the node represents a directory, the function will recursively process its contents.
+
+    Parameters
+    ----------
+    node : FileSystemNode
+        The file system node to be summarized.
+    query : IngestionQuery
+        The parsed query object containing information about the repository and query parameters.
+
+    Returns
+    -------
+    Tuple[str, str, str]
+        A tuple containing the summary, directory structure, and file contents.
+    """
+    is_single_file = node.type == FileSystemNodeType.FILE
+    summary = _create_summary_prefix(query, single_file=is_single_file)
+
+    if node.type == FileSystemNodeType.DIRECTORY:
+        summary += f"Files analyzed: {node.file_count}\n"
+    elif node.type == FileSystemNodeType.FILE:
+        summary += f"File: {node.name}\n"
+        summary += f"Lines: {len(node.content.splitlines()):,}\n"
+
+    tree = "Directory structure:\n" + _create_tree_structure(query, node)
+
+    content = _gather_file_contents(node)
+
+    token_estimate = _format_token_count(tree + content)
+    if token_estimate:
+        summary += f"\nEstimated tokens: {token_estimate}"
+
+    return summary, tree, content
+
+
+def _create_summary_prefix(query: IngestionQuery, single_file: bool = False) -> str:
+    """
+    Create a prefix string for summarizing a repository or local directory.
+
+    Includes repository name (if provided), commit/branch details, and subpath if relevant.
+
+    Parameters
+    ----------
+    query : IngestionQuery
+        The parsed query object containing information about the repository and query parameters.
+    single_file : bool
+        A flag indicating whether the summary is for a single file, by default False.
+
+    Returns
+    -------
+    str
+        A summary prefix string containing repository, commit, branch, and subpath details.
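+
+    Examples
+    --------
+    Illustrative only (the query field values shown are hypothetical):
+
+    >>> query.user_name, query.repo_name, query.branch = "user", "repo", "dev"  # doctest: +SKIP
+    >>> _create_summary_prefix(query)  # doctest: +SKIP
+    'Repository: user/repo\nBranch: dev\n'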
+ """ + parts = [] + + if query.user_name: + parts.append(f"Repository: {query.user_name}/{query.repo_name}") + else: + # Local scenario + parts.append(f"Directory: {query.slug}") + + if query.commit: + parts.append(f"Commit: {query.commit}") + elif query.branch and query.branch not in ("main", "master"): + parts.append(f"Branch: {query.branch}") + + if query.subpath != "/" and not single_file: + parts.append(f"Subpath: {query.subpath}") + + return "\n".join(parts) + "\n" + + +def _gather_file_contents(node: FileSystemNode) -> str: + """ + Recursively gather contents of all files under the given node. + + This function recursively processes a directory node and gathers the contents of all files + under that node. It returns the concatenated content of all files as a single string. + + Parameters + ---------- + node : FileSystemNode + The current directory or file node being processed. + + Returns + ------- + str + The concatenated content of all files under the given node. + """ + if node.type != FileSystemNodeType.DIRECTORY: + return node.content_string + + # Recursively gather contents of all files under the current directory + return "\n".join(_gather_file_contents(child) for child in node.children) + + +def _create_tree_structure(query: IngestionQuery, node: FileSystemNode, prefix: str = "", is_last: bool = True) -> str: + """ + Generate a tree-like string representation of the file structure. + + This function generates a string representation of the directory structure, formatted + as a tree with appropriate indentation for nested directories and files. + + Parameters + ---------- + query : IngestionQuery + The parsed query object containing information about the repository and query parameters. + node : FileSystemNode + The current directory or file node being processed. + prefix : str + A string used for indentation and formatting of the tree structure, by default "". + is_last : bool + A flag indicating whether the current node is the last in its directory, by default True. + + Returns + ------- + str + A string representing the directory structure formatted as a tree. + """ + if not node.name: + # If no name is present, use the slug as the top-level directory name + node.name = query.slug + + tree_str = "" + current_prefix = "└── " if is_last else "├── " + + # Indicate directories with a trailing slash + display_name = node.name + if node.type == FileSystemNodeType.DIRECTORY: + display_name += "/" + elif node.type == FileSystemNodeType.SYMLINK: + display_name += " -> " + node.path.readlink().name + + tree_str += f"{prefix}{current_prefix}{display_name}\n" + + if node.type == FileSystemNodeType.DIRECTORY and node.children: + prefix += " " if is_last else "│ " + for i, child in enumerate(node.children): + tree_str += _create_tree_structure(query, node=child, prefix=prefix, is_last=i == len(node.children) - 1) + return tree_str + + +def _format_token_count(text: str) -> Optional[str]: + """ + Return a human-readable string representing the token count of the given text. + + E.g., '120' -> '120', '1200' -> '1.2k', '1200000' -> '1.2M'. + + Parameters + ---------- + text : str + The text string for which the token count is to be estimated. + + Returns + ------- + str, optional + The formatted number of tokens as a string (e.g., '1.2k', '1.2M'), or `None` if an error occurs. 
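+
+    Examples
+    --------
+    Exact counts depend on the tokenizer, so the output below is illustrative:
+
+    >>> _format_token_count("hello world")  # doctest: +SKIP
+    '2'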
+    """
+    try:
+        encoding = tiktoken.get_encoding("o200k_base")  # gpt-4o, gpt-4o-mini
+        total_tokens = len(encoding.encode(text, disallowed_special=()))
+    except (ValueError, UnicodeEncodeError) as exc:
+        print(exc)
+        return None
+
+    if total_tokens >= 1_000_000:
+        return f"{total_tokens / 1_000_000:.1f}M"
+
+    if total_tokens >= 1_000:
+        return f"{total_tokens / 1_000:.1f}k"
+
+    return str(total_tokens)
+
+
+
+================================================
+FILE: src/gitingest/query_parsing.py
+================================================
+"""This module contains functions to parse and validate input sources and patterns."""
+
+import re
+import uuid
+import warnings
+from pathlib import Path
+from typing import List, Optional, Set, Union
+from urllib.parse import unquote, urlparse
+
+from gitingest.config import TMP_BASE_PATH
+from gitingest.schemas import IngestionQuery
+from gitingest.utils.exceptions import InvalidPatternError
+from gitingest.utils.git_utils import check_repo_exists, fetch_remote_branch_list
+from gitingest.utils.ignore_patterns import DEFAULT_IGNORE_PATTERNS
+from gitingest.utils.query_parser_utils import (
+    KNOWN_GIT_HOSTS,
+    _get_user_and_repo_from_path,
+    _is_valid_git_commit_hash,
+    _is_valid_pattern,
+    _normalize_pattern,
+    _validate_host,
+    _validate_url_scheme,
+)
+
+
+async def parse_query(
+    source: str,
+    max_file_size: int,
+    from_web: bool,
+    include_patterns: Optional[Union[str, Set[str]]] = None,
+    ignore_patterns: Optional[Union[str, Set[str]]] = None,
+    token: Optional[str] = None,
+) -> IngestionQuery:
+    """
+    Parse the input source (URL or path) to extract relevant details for the query.
+
+    This function parses the input source to extract details such as the username, repository name,
+    commit hash, branch name, and other relevant information. It also processes the include and ignore
+    patterns to filter the files and directories to include or exclude from the query.
+
+    Parameters
+    ----------
+    source : str
+        The source URL or file path to parse.
+    max_file_size : int
+        The maximum file size in bytes to include.
+    from_web : bool
+        Flag indicating whether the source is a web URL.
+    include_patterns : Union[str, Set[str]], optional
+        Patterns to include, by default None. Can be a set of strings or a single string.
+    ignore_patterns : Union[str, Set[str]], optional
+        Patterns to ignore, by default None. Can be a set of strings or a single string.
+    token : str, optional
+        GitHub personal-access token (PAT). Needed when *source* refers to a
+        **private** repository. Can also be set via the ``GITHUB_TOKEN`` env var.
+        Must start with 'github_pat_' or 'ghp_' for GitHub repositories.
+
+    Returns
+    -------
+    IngestionQuery
+        The parsed details of the repository or file path.
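+
+    Examples
+    --------
+    A sketch with a placeholder repository (requires network access):
+
+    >>> import asyncio
+    >>> query = asyncio.run(
+    ...     parse_query("https://github.com/user/repo", max_file_size=10 * 1024 * 1024, from_web=True)
+    ... )  # doctest: +SKIP
+    >>> query.slug  # doctest: +SKIP
+    'user-repo'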
+ """ + + # Determine the parsing method based on the source type + if from_web or urlparse(source).scheme in ("https", "http") or any(h in source for h in KNOWN_GIT_HOSTS): + # We either have a full URL or a domain-less slug + query = await _parse_remote_repo(source, token=token) + else: + # Local path scenario + query = _parse_local_dir_path(source) + + # Combine default ignore patterns + custom patterns + ignore_patterns_set = DEFAULT_IGNORE_PATTERNS.copy() + if ignore_patterns: + ignore_patterns_set.update(_parse_patterns(ignore_patterns)) + + # Process include patterns and override ignore patterns accordingly + if include_patterns: + parsed_include = _parse_patterns(include_patterns) + # Override ignore patterns with include patterns + ignore_patterns_set = set(ignore_patterns_set) - set(parsed_include) + else: + parsed_include = None + + return IngestionQuery( + user_name=query.user_name, + repo_name=query.repo_name, + url=query.url, + subpath=query.subpath, + local_path=query.local_path, + slug=query.slug, + id=query.id, + type=query.type, + branch=query.branch, + commit=query.commit, + max_file_size=max_file_size, + ignore_patterns=ignore_patterns_set, + include_patterns=parsed_include, + ) + + +async def _parse_remote_repo(source: str, token: Optional[str] = None) -> IngestionQuery: + """ + Parse a repository URL into a structured query dictionary. + + If source is: + - A fully qualified URL (https://gitlab.com/...), parse & verify that domain + - A URL missing 'https://' (gitlab.com/...), add 'https://' and parse + - A 'slug' (like 'pandas-dev/pandas'), attempt known domains until we find one that exists. + + Parameters + ---------- + source : str + The URL or domain-less slug to parse. + token : str, optional + GitHub personal-access token (PAT). Needed when *source* refers to a + **private** repository. Can also be set via the ``GITHUB_TOKEN`` env var. + + Returns + ------- + IngestionQuery + A dictionary containing the parsed details of the repository. + """ + source = unquote(source) + + # Attempt to parse + parsed_url = urlparse(source) + + if parsed_url.scheme: + _validate_url_scheme(parsed_url.scheme) + _validate_host(parsed_url.netloc.lower()) + + else: # Will be of the form 'host/user/repo' or 'user/repo' + tmp_host = source.split("/")[0].lower() + if "." in tmp_host: + _validate_host(tmp_host) + else: + # No scheme, no domain => user typed "user/repo", so we'll guess the domain. + host = await try_domains_for_user_and_repo(*_get_user_and_repo_from_path(source), token=token) + source = f"{host}/{source}" + + source = "https://" + source + parsed_url = urlparse(source) + + host = parsed_url.netloc.lower() + user_name, repo_name = _get_user_and_repo_from_path(parsed_url.path) + + _id = str(uuid.uuid4()) + slug = f"{user_name}-{repo_name}" + local_path = TMP_BASE_PATH / _id / slug + url = f"https://{host}/{user_name}/{repo_name}" + + parsed = IngestionQuery( + user_name=user_name, + repo_name=repo_name, + url=url, + local_path=local_path, + slug=slug, + id=_id, + ) + + remaining_parts = parsed_url.path.strip("/").split("/")[2:] + + if not remaining_parts: + return parsed + + possible_type = remaining_parts.pop(0) # e.g. 
'issues', 'pull', 'tree', 'blob' + + # If no extra path parts, just return + if not remaining_parts: + return parsed + + # If this is an issues page or pull requests, return early without processing subpath + if remaining_parts and possible_type in ("issues", "pull"): + return parsed + + parsed.type = possible_type + + # Commit or branch + commit_or_branch = remaining_parts[0] + if _is_valid_git_commit_hash(commit_or_branch): + parsed.commit = commit_or_branch + remaining_parts.pop(0) + else: + parsed.branch = await _configure_branch_and_subpath(remaining_parts, url) + + # Subpath if anything left + if remaining_parts: + parsed.subpath += "/".join(remaining_parts) + + return parsed + + +async def _configure_branch_and_subpath(remaining_parts: List[str], url: str) -> Optional[str]: + """ + Configure the branch and subpath based on the remaining parts of the URL. + Parameters + ---------- + remaining_parts : List[str] + The remaining parts of the URL path. + url : str + The URL of the repository. + Returns + ------- + str, optional + The branch name if found, otherwise None. + + """ + try: + # Fetch the list of branches from the remote repository + branches: List[str] = await fetch_remote_branch_list(url) + except RuntimeError as exc: + warnings.warn(f"Warning: Failed to fetch branch list: {exc}", RuntimeWarning) + return remaining_parts.pop(0) + + branch = [] + while remaining_parts: + branch.append(remaining_parts.pop(0)) + branch_name = "/".join(branch) + if branch_name in branches: + return branch_name + + return None + + +def _parse_patterns(pattern: Union[str, Set[str]]) -> Set[str]: + """ + Parse and validate file/directory patterns for inclusion or exclusion. + + Takes either a single pattern string or set of pattern strings and processes them into a normalized list. + Patterns are split on commas and spaces, validated for allowed characters, and normalized. + + Parameters + ---------- + pattern : Set[str] | str + Pattern(s) to parse - either a single string or set of strings + + Returns + ------- + Set[str] + A set of normalized patterns. + + Raises + ------ + InvalidPatternError + If any pattern contains invalid characters. Only alphanumeric characters, + dash (-), underscore (_), dot (.), forward slash (/), plus (+), and + asterisk (*) are allowed. + """ + patterns = pattern if isinstance(pattern, set) else {pattern} + + parsed_patterns: Set[str] = set() + for p in patterns: + parsed_patterns = parsed_patterns.union(set(re.split(",| ", p))) + + # Remove empty string if present + parsed_patterns = parsed_patterns - {""} + + # Normalize Windows paths to Unix-style paths + parsed_patterns = {p.replace("\\", "/") for p in parsed_patterns} + + # Validate and normalize each pattern + for p in parsed_patterns: + if not _is_valid_pattern(p): + raise InvalidPatternError(p) + + return {_normalize_pattern(p) for p in parsed_patterns} + + +def _parse_local_dir_path(path_str: str) -> IngestionQuery: + """ + Parse the given file path into a structured query dictionary. + + Parameters + ---------- + path_str : str + The file path to parse. + + Returns + ------- + IngestionQuery + A dictionary containing the parsed details of the file path. + """ + path_obj = Path(path_str).resolve() + slug = path_obj.name if path_str == "." 
else path_str.strip("/")
+    return IngestionQuery(
+        user_name=None,
+        repo_name=None,
+        url=None,
+        local_path=path_obj,
+        slug=slug,
+        id=str(uuid.uuid4()),
+    )
+
+
+async def try_domains_for_user_and_repo(user_name: str, repo_name: str, token: Optional[str] = None) -> str:
+    """
+    Attempt to find a valid repository host for the given user_name and repo_name.
+
+    Parameters
+    ----------
+    user_name : str
+        The username or owner of the repository.
+    repo_name : str
+        The name of the repository.
+    token : str, optional
+        GitHub personal-access token (PAT). Needed when *source* refers to a
+        **private** repository. Can also be set via the ``GITHUB_TOKEN`` env var.
+
+    Returns
+    -------
+    str
+        The domain of the valid repository host.
+
+    Raises
+    ------
+    ValueError
+        If no valid repository host is found for the given user_name and repo_name.
+    """
+    for domain in KNOWN_GIT_HOSTS:
+        candidate = f"https://{domain}/{user_name}/{repo_name}"
+        if await check_repo_exists(candidate, token=token if domain == "github.com" else None):
+            return domain
+    raise ValueError(f"Could not find a valid repository host for '{user_name}/{repo_name}'.")
+
+
+
+================================================
+FILE: src/gitingest/schemas/__init__.py
+================================================
+"""This module contains the schemas for the Gitingest package."""
+
+from gitingest.schemas.filesystem_schema import FileSystemNode, FileSystemNodeType, FileSystemStats
+from gitingest.schemas.ingestion_schema import CloneConfig, IngestionQuery
+
+__all__ = ["FileSystemNode", "FileSystemNodeType", "FileSystemStats", "CloneConfig", "IngestionQuery"]
+
+
+
+================================================
+FILE: src/gitingest/schemas/filesystem_schema.py
+================================================
+"""Define the schema for the filesystem representation."""
+
+from __future__ import annotations
+
+import os
+from dataclasses import dataclass, field
+from enum import Enum, auto
+from pathlib import Path
+
+from gitingest.utils.file_utils import get_preferred_encodings, is_text_file
+from gitingest.utils.notebook_utils import process_notebook
+
+SEPARATOR = "=" * 48  # Tiktoken, the tokenizer openai uses, counts 2 tokens if we have more than 48
+
+
+class FileSystemNodeType(Enum):
+    """Enum representing the type of a file system node (directory or file)."""
+
+    DIRECTORY = auto()
+    FILE = auto()
+    SYMLINK = auto()
+
+
+@dataclass
+class FileSystemStats:
+    """Class for tracking statistics during file system traversal."""
+
+    visited: set[Path] = field(default_factory=set)
+    total_files: int = 0
+    total_size: int = 0
+
+
+@dataclass
+class FileSystemNode:  # pylint: disable=too-many-instance-attributes
+    """
+    Class representing a node in the file system (either a file or directory).
+
+    Tracks properties of files/directories for comprehensive analysis.
+    """
+
+    name: str
+    type: FileSystemNodeType
+    path_str: str
+    path: Path
+    size: int = 0
+    file_count: int = 0
+    dir_count: int = 0
+    depth: int = 0
+    children: list[FileSystemNode] = field(default_factory=list)
+
+    def sort_children(self) -> None:
+        """
+        Sort the children nodes of a directory according to a specific order.
+
+        Order of sorting:
+        1. The README.md file
+        2. Regular files (not starting with dot)
+        3. Hidden files (starting with dot)
+        4. Regular directories (not starting with dot)
+        5. Hidden directories (starting with dot)
+
+        All groups are sorted alphanumerically within themselves.
+
+        Raises
+        ------
+        ValueError
+            If the node is not a directory.
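+
+        Examples
+        --------
+        Hypothetical ordering (names for illustration only): children named
+        ["src", ".env", "README.md", "main.py", ".github"] sort to
+        ["README.md", "main.py", ".env", "src", ".github"].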
+ """ + if self.type != FileSystemNodeType.DIRECTORY: + raise ValueError("Cannot sort children of a non-directory node") + + def _sort_key(child: FileSystemNode) -> tuple[int, str]: + # returns the priority order for the sort function, 0 is first + # Groups: 0=README, 1=regular file, 2=hidden file, 3=regular dir, 4=hidden dir + name = child.name.lower() + if child.type == FileSystemNodeType.FILE: + if name == "readme.md": + return (0, name) + return (1 if not name.startswith(".") else 2, name) + return (3 if not name.startswith(".") else 4, name) + + self.children.sort(key=_sort_key) + + @property + def content_string(self) -> str: + """ + Return the content of the node as a string, including path and content. + + Returns + ------- + str + A string representation of the node's content. + """ + parts = [ + SEPARATOR, + f"{self.type.name}: {str(self.path_str).replace(os.sep, '/')}" + + (f" -> {self.path.readlink().name}" if self.type == FileSystemNodeType.SYMLINK else ""), + SEPARATOR, + f"{self.content}", + ] + + return "\n".join(parts) + "\n\n" + + @property + def content(self) -> str: # pylint: disable=too-many-return-statements + """ + Read the content of a file if it's text (or a notebook). Return an error message otherwise. + + Returns + ------- + str + The content of the file, or an error message if the file could not be read. + + Raises + ------ + ValueError + If the node is a directory. + """ + if self.type == FileSystemNodeType.DIRECTORY: + raise ValueError("Cannot read content of a directory node") + + if self.type == FileSystemNodeType.SYMLINK: + return "" + + if not is_text_file(self.path): + return "[Non-text file]" + + if self.path.suffix == ".ipynb": + try: + return process_notebook(self.path) + except Exception as exc: + return f"Error processing notebook: {exc}" + + # Try multiple encodings + for encoding in get_preferred_encodings(): + try: + with self.path.open(encoding=encoding) as f: + return f.read() + except UnicodeDecodeError: + continue + except UnicodeError: + continue + except OSError as exc: + return f"Error reading file: {exc}" + + return "Error: Unable to decode file with available encodings" + + + +================================================ +FILE: src/gitingest/schemas/ingestion_schema.py +================================================ +"""This module contains the dataclasses for the ingestion process.""" + +from dataclasses import dataclass +from pathlib import Path +from typing import Optional, Set + +from pydantic import BaseModel, ConfigDict, Field + +from gitingest.config import MAX_FILE_SIZE + + +@dataclass +class CloneConfig: + """ + Configuration for cloning a Git repository. + + This class holds the necessary parameters for cloning a repository to a local path, including + the repository's URL, the target local path, and optional parameters for a specific commit or branch. + + Attributes + ---------- + url : str + The URL of the Git repository to clone. + local_path : str + The local directory where the repository will be cloned. + commit : str, optional + The specific commit hash to check out after cloning (default is None). + branch : str, optional + The branch to clone (default is None). + subpath : str + The subpath to clone from the repository (default is "/"). + blob: bool + Whether the repository is a blob (default is False). 
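+
+    Examples
+    --------
+    A minimal sketch with placeholder values:
+
+    >>> config = CloneConfig(
+    ...     url="https://github.com/user/repo",
+    ...     local_path="/tmp/gitingest/<id>/user-repo",
+    ...     branch="dev",
+    ... )
+    >>> config.subpath
+    '/'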
+ """ + + url: str + local_path: str + commit: Optional[str] = None + branch: Optional[str] = None + subpath: str = "/" + blob: bool = False + + +class IngestionQuery(BaseModel): # pylint: disable=too-many-instance-attributes + """ + Pydantic model to store the parsed details of the repository or file path. + """ + + user_name: Optional[str] = None + repo_name: Optional[str] = None + local_path: Path + url: Optional[str] = None + slug: str + id: str + subpath: str = "/" + type: Optional[str] = None + branch: Optional[str] = None + commit: Optional[str] = None + max_file_size: int = Field(default=MAX_FILE_SIZE) + ignore_patterns: Optional[Set[str]] = None + include_patterns: Optional[Set[str]] = None + + model_config = ConfigDict(arbitrary_types_allowed=True) + + def extract_clone_config(self) -> CloneConfig: + """ + Extract the relevant fields for the CloneConfig object. + + Returns + ------- + CloneConfig + A CloneConfig object containing the relevant fields. + + Raises + ------ + ValueError + If the 'url' parameter is not provided. + """ + if not self.url: + raise ValueError("The 'url' parameter is required.") + + return CloneConfig( + url=self.url, + local_path=str(self.local_path), + commit=self.commit, + branch=self.branch, + subpath=self.subpath, + blob=self.type == "blob", + ) + + + +================================================ +FILE: src/gitingest/utils/__init__.py +================================================ + + + +================================================ +FILE: src/gitingest/utils/exceptions.py +================================================ +"""Custom exceptions for the Gitingest package.""" + + +class InvalidPatternError(ValueError): + """ + Exception raised when a pattern contains invalid characters. + This exception is used to signal that a pattern provided for some operation + contains characters that are not allowed. The valid characters for the pattern + include alphanumeric characters, dash (-), underscore (_), dot (.), forward slash (/), + plus (+), and asterisk (*). + Parameters + ---------- + pattern : str + The invalid pattern that caused the error. + """ + + def __init__(self, pattern: str) -> None: + super().__init__( + f"Pattern '{pattern}' contains invalid characters. Only alphanumeric characters, dash (-), " + "underscore (_), dot (.), forward slash (/), plus (+), and asterisk (*) are allowed." + ) + + +class AsyncTimeoutError(Exception): + """ + Exception raised when an async operation exceeds its timeout limit. + + This exception is used by the `async_timeout` decorator to signal that the wrapped + asynchronous function has exceeded the specified time limit for execution. + """ + + +class InvalidNotebookError(Exception): + """Exception raised when a Jupyter notebook is invalid or cannot be processed.""" + + def __init__(self, message: str) -> None: + super().__init__(message) + + +class InvalidGitHubTokenError(ValueError): + """Exception raised when a GitHub Personal Access Token is malformed.""" + + def __init__(self) -> None: + super().__init__( + "Invalid GitHub token format. Token should start with 'github_pat_' or 'ghp_' " + "followed by at least 36 characters of letters, numbers, and underscores." 
+ ) + + + +================================================ +FILE: src/gitingest/utils/file_utils.py +================================================ +"""Utility functions for working with files and directories.""" + +import locale +import platform +from pathlib import Path +from typing import List + +try: + locale.setlocale(locale.LC_ALL, "") +except locale.Error: + locale.setlocale(locale.LC_ALL, "C") + + +def get_preferred_encodings() -> List[str]: + """ + Get list of encodings to try, prioritized for the current platform. + + Returns + ------- + List[str] + List of encoding names to try in priority order, starting with the + platform's default encoding followed by common fallback encodings. + """ + encodings = [locale.getpreferredencoding(), "utf-8", "utf-16", "utf-16le", "utf-8-sig", "latin"] + if platform.system() == "Windows": + encodings += ["cp1252", "iso-8859-1"] + return encodings + + +def is_text_file(path: Path) -> bool: + """ + Determine if the file is likely a text file by trying to decode a small chunk + with multiple encodings, and checking for common binary markers. + + Parameters + ---------- + path : Path + The path to the file to check. + + Returns + ------- + bool + True if the file is likely textual; False if it appears to be binary. + """ + + # Attempt to read a portion of the file in binary mode + try: + with path.open("rb") as f: + chunk = f.read(1024) + except OSError: + return False + + # If file is empty, treat as text + if not chunk: + return True + + # Check obvious binary bytes + if b"\x00" in chunk or b"\xff" in chunk: + return False + + # Attempt multiple encodings + for enc in get_preferred_encodings(): + try: + with path.open(encoding=enc) as f: + f.read() + return True + except UnicodeDecodeError: + continue + except UnicodeError: + continue + except OSError: + return False + + return False + + + +================================================ +FILE: src/gitingest/utils/git_utils.py +================================================ +"""Utility functions for interacting with Git repositories.""" + +import asyncio +import base64 +import re +from typing import List, Optional, Tuple + +from gitingest.utils.exceptions import InvalidGitHubTokenError + +GITHUB_PAT_PATTERN = r"^(?:github_pat_|ghp_)[A-Za-z0-9_]{36,}$" + + +async def run_command(*args: str) -> Tuple[bytes, bytes]: + """ + Execute a shell command asynchronously and return (stdout, stderr) bytes. + + Parameters + ---------- + *args : str + The command and its arguments to execute. + + Returns + ------- + Tuple[bytes, bytes] + A tuple containing the stdout and stderr of the command. + + Raises + ------ + RuntimeError + If command exits with a non-zero status. + """ + # Execute the requested command + proc = await asyncio.create_subprocess_exec( + *args, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + stdout, stderr = await proc.communicate() + if proc.returncode != 0: + error_message = stderr.decode().strip() + raise RuntimeError(f"Command failed: {' '.join(args)}\nError: {error_message}") + + return stdout, stderr + + +async def ensure_git_installed() -> None: + """ + Ensure Git is installed and accessible on the system. + + Raises + ------ + RuntimeError + If Git is not installed or not accessible. + """ + try: + await run_command("git", "--version") + except RuntimeError as exc: + raise RuntimeError("Git is not installed or not accessible. 
Please install Git first.") from exc + + +async def check_repo_exists(url: str, token: Optional[str] = None) -> bool: + """ + Check if a Git repository exists at the provided URL. + + Parameters + ---------- + url : str + The URL of the Git repository to check. + token : str, optional + GitHub personal-access token (PAT). Needed when *source* refers to a + **private** repository. Can also be set via the ``GITHUB_TOKEN`` env var. + + Returns + ------- + bool + True if the repository exists, False otherwise. + + Raises + ------ + RuntimeError + If the curl command returns an unexpected status code. + """ + if token and "github.com" in url: + return await _check_github_repo_exists(url, token) + + proc = await asyncio.create_subprocess_exec( + "curl", + "-I", + url, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + stdout, _ = await proc.communicate() + + if proc.returncode != 0: + return False # likely unreachable or private + + response = stdout.decode() + status_line = response.splitlines()[0].strip() + parts = status_line.split(" ") + if len(parts) >= 2: + status_code_str = parts[1] + if status_code_str in ("200", "301"): + return True + if status_code_str in ("302", "404"): + return False + raise RuntimeError(f"Unexpected status line: {status_line}") + + +async def _check_github_repo_exists(url: str, token: Optional[str] = None) -> bool: + """ + Return True iff the authenticated user can see `url`. + + Parameters + ---------- + url : str + The URL of the GitHub repository to check. + token : str, optional + GitHub personal-access token (PAT). Needed when *source* refers to a + **private** repository. Can also be set via the ``GITHUB_TOKEN`` env var. + + Returns + ------- + bool + True if the repository exists, False otherwise. + + Raises + ------ + ValueError + If the URL is not a valid GitHub repository URL. + RuntimeError + If the repository is not found, if the provided URL is invalid, or if the token format is invalid. + """ + m = re.match(r"https?://github\.com/([^/]+)/([^/]+?)(?:\.git)?/?$", url) + if not m: + raise ValueError(f"Un-recognised GitHub URL: {url!r}") + owner, repo = m.groups() + + api = f"https://api.github.com/repos/{owner}/{repo}" + cmd = [ + "curl", + "--silent", + "--location", + "--write-out", + "%{http_code}", + "-o", + "/dev/null", + "-H", + "Accept: application/vnd.github+json", + ] + if token: + cmd += ["-H", f"Authorization: Bearer {token}"] + cmd.append(api) + + proc = await asyncio.create_subprocess_exec( + *cmd, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + stdout, _ = await proc.communicate() + status = stdout.decode()[-3:] # just the %{http_code} + + if status == "200": + return True + if status == "404": + return False + if status in ("401", "403"): + raise RuntimeError("Token invalid or lacks permissions") + raise RuntimeError(f"GitHub API returned unexpected HTTP {status}") + + +async def fetch_remote_branch_list(url: str, token: Optional[str] = None) -> List[str]: + """ + Fetch the list of branches from a remote Git repository. + + Parameters + ---------- + url : str + The URL of the Git repository to fetch branches from. + token : str, optional + GitHub personal-access token (PAT). Needed when *source* refers to a + **private** repository. Can also be set via the ``GITHUB_TOKEN`` env var. + + Returns + ------- + List[str] + A list of branch names available in the remote repository. 
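+
+    Examples
+    --------
+    A sketch of typical usage (requires network access; the URL is illustrative):
+
+    >>> branches = await fetch_remote_branch_list("https://github.com/user/repo")  # doctest: +SKIP
+    >>> "main" in branches  # doctest: +SKIP
+    True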
+ """ + fetch_branches_command = ["git"] + + # Add authentication if needed + if token and "github.com" in url: + fetch_branches_command += ["-c", create_git_auth_header(token)] + + fetch_branches_command += ["ls-remote", "--heads", url] + + await ensure_git_installed() + stdout, _ = await run_command(*fetch_branches_command) + stdout_decoded = stdout.decode() + + return [ + line.split("refs/heads/", 1)[1] + for line in stdout_decoded.splitlines() + if line.strip() and "refs/heads/" in line + ] + + +def create_git_command(base_cmd: List[str], local_path: str, url: str, token: Optional[str] = None) -> List[str]: + """Create a git command with authentication if needed. + + Parameters + ---------- + base_cmd : List[str] + The base git command to start with + local_path : str + The local path where the git command should be executed + url : str + The repository URL to check if it's a GitHub repository + token : Optional[str] + GitHub personal access token for authentication + + Returns + ------- + List[str] + The git command with authentication if needed + """ + cmd = base_cmd + ["-C", local_path] + if token and url.startswith("https://github.com"): + validate_github_token(token) + cmd += ["-c", create_git_auth_header(token)] + return cmd + + +def create_git_auth_header(token: str) -> str: + """Create a Basic authentication header for GitHub git operations. + + Parameters + ---------- + token : str + GitHub personal access token + + Returns + ------- + str + The git config command for setting the authentication header + """ + basic = base64.b64encode(f"x-oauth-basic:{token}".encode()).decode() + return f"http.https://github.com/.extraheader=Authorization: Basic {basic}" + + +def validate_github_token(token: str) -> None: + """Validate the format of a GitHub Personal Access Token. 
+ + Parameters + ---------- + token : str + The GitHub token to validate + + Raises + ------ + InvalidGitHubTokenError + If the token format is invalid + """ + if not re.match(GITHUB_PAT_PATTERN, token): + raise InvalidGitHubTokenError() + + + +================================================ +FILE: src/gitingest/utils/ignore_patterns.py +================================================ +"""Default ignore patterns for Gitingest.""" + +import os +from pathlib import Path +from typing import Set + +DEFAULT_IGNORE_PATTERNS: Set[str] = { + # Python + "*.pyc", + "*.pyo", + "*.pyd", + "__pycache__", + ".pytest_cache", + ".coverage", + ".tox", + ".nox", + ".mypy_cache", + ".ruff_cache", + ".hypothesis", + "poetry.lock", + "Pipfile.lock", + # JavaScript/FileSystemNode + "node_modules", + "bower_components", + "package-lock.json", + "yarn.lock", + ".npm", + ".yarn", + ".pnpm-store", + "bun.lock", + "bun.lockb", + # Java + "*.class", + "*.jar", + "*.war", + "*.ear", + "*.nar", + ".gradle/", + "build/", + ".settings/", + ".classpath", + "gradle-app.setting", + "*.gradle", + # IDEs and editors / Java + ".project", + # C/C++ + "*.o", + "*.obj", + "*.dll", + "*.dylib", + "*.exe", + "*.lib", + "*.out", + "*.a", + "*.pdb", + # Swift/Xcode + ".build/", + "*.xcodeproj/", + "*.xcworkspace/", + "*.pbxuser", + "*.mode1v3", + "*.mode2v3", + "*.perspectivev3", + "*.xcuserstate", + "xcuserdata/", + ".swiftpm/", + # Ruby + "*.gem", + ".bundle/", + "vendor/bundle", + "Gemfile.lock", + ".ruby-version", + ".ruby-gemset", + ".rvmrc", + # Rust + "Cargo.lock", + "**/*.rs.bk", + # Java / Rust + "target/", + # Go + "pkg/", + # .NET/C# + "obj/", + "*.suo", + "*.user", + "*.userosscache", + "*.sln.docstates", + "packages/", + "*.nupkg", + # Go / .NET / C# + "bin/", + # Version control + ".git", + ".svn", + ".hg", + ".gitignore", + ".gitattributes", + ".gitmodules", + # Images and media + "*.svg", + "*.png", + "*.jpg", + "*.jpeg", + "*.gif", + "*.ico", + "*.pdf", + "*.mov", + "*.mp4", + "*.mp3", + "*.wav", + # Virtual environments + "venv", + ".venv", + "env", + ".env", + "virtualenv", + # IDEs and editors + ".idea", + ".vscode", + ".vs", + "*.swo", + "*.swn", + ".settings", + "*.sublime-*", + # Temporary and cache files + "*.log", + "*.bak", + "*.swp", + "*.tmp", + "*.temp", + ".cache", + ".sass-cache", + ".eslintcache", + ".DS_Store", + "Thumbs.db", + "desktop.ini", + # Build directories and artifacts + "build", + "dist", + "target", + "out", + "*.egg-info", + "*.egg", + "*.whl", + "*.so", + # Documentation + "site-packages", + ".docusaurus", + ".next", + ".nuxt", + # Other common patterns + ## Minified files + "*.min.js", + "*.min.css", + ## Source maps + "*.map", + ## Terraform + ".terraform", + "*.tfstate*", + ## Dependencies in various languages + "vendor/", + # Gitingest + "digest.txt", +} + + +def load_gitignore_patterns(root: Path) -> Set[str]: + """ + Recursively load ignore patterns from all .gitignore files under the given root directory. + + Parameters + ---------- + root : Path + The root directory to search for .gitignore files. + + Returns + ------- + Set[str] + A set of ignore patterns extracted from all .gitignore files found under the root directory. 
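+
+    Examples
+    --------
+    A short sketch (the repository path is hypothetical):
+
+    >>> patterns = load_gitignore_patterns(Path("/tmp/repo"))  # doctest: +SKIP
+    >>> "*.log" in patterns  # doctest: +SKIP
+    True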
+ """ + patterns: Set[str] = set() + for dirpath, _, filenames in os.walk(root): + if ".gitignore" not in filenames: + continue + + gitignore_path = Path(dirpath) / ".gitignore" + with gitignore_path.open("r", encoding="utf-8") as f: + for line in f: + stripped = line.strip() + + if not stripped or stripped.startswith("#"): + continue + + negated = stripped.startswith("!") + if negated: + stripped = stripped[1:] + + rel_dir = os.path.relpath(dirpath, root) + if stripped.startswith("/"): + pattern_body = os.path.join(rel_dir, stripped.lstrip("/")) + else: + pattern_body = os.path.join(rel_dir, stripped) if rel_dir != "." else stripped + + pattern_body = pattern_body.replace("\\", "/") + pattern = f"!{pattern_body}" if negated else pattern_body + patterns.add(pattern) + + return patterns + + + +================================================ +FILE: src/gitingest/utils/ingestion_utils.py +================================================ +"""Utility functions for the ingestion process.""" + +from pathlib import Path +from typing import Set + +from pathspec import PathSpec + + +def _should_include(path: Path, base_path: Path, include_patterns: Set[str]) -> bool: + """ + Determine if the given file or directory path matches any of the include patterns. + + This function checks whether the relative path of a file or directory matches any of the specified patterns. If a + match is found, it returns `True`, indicating that the file or directory should be included in further processing. + + Parameters + ---------- + path : Path + The absolute path of the file or directory to check. + base_path : Path + The base directory from which the relative path is calculated. + include_patterns : Set[str] + A set of patterns to check against the relative path. + + Returns + ------- + bool + `True` if the path matches any of the include patterns, `False` otherwise. + """ + try: + rel_path = path.relative_to(base_path) + except ValueError: + # If path is not under base_path at all + return False + + rel_str = str(rel_path) + + # if path is a directory, include it by default + if path.is_dir(): + return True + + spec = PathSpec.from_lines("gitwildmatch", include_patterns) + return spec.match_file(rel_str) + + +def _should_exclude(path: Path, base_path: Path, ignore_patterns: Set[str]) -> bool: + """ + Determine if the given file or directory path matches any of the ignore patterns. + + This function checks whether the relative path of a file or directory matches + any of the specified ignore patterns. If a match is found, it returns `True`, indicating + that the file or directory should be excluded from further processing. + + Parameters + ---------- + path : Path + The absolute path of the file or directory to check. + base_path : Path + The base directory from which the relative path is calculated. + ignore_patterns : Set[str] + A set of patterns to check against the relative path. + + Returns + ------- + bool + `True` if the path matches any of the ignore patterns, `False` otherwise. 
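+
+    Examples
+    --------
+    An illustrative call (the paths are hypothetical):
+
+    >>> _should_exclude(Path("/repo/node_modules/index.js"), Path("/repo"), {"node_modules"})
+    True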
+ """ + try: + rel_path = path.relative_to(base_path) + except ValueError: + # If path is not under base_path at all + return True + + rel_str = str(rel_path) + spec = PathSpec.from_lines("gitwildmatch", ignore_patterns) + return spec.match_file(rel_str) + + + +================================================ +FILE: src/gitingest/utils/notebook_utils.py +================================================ +"""Utilities for processing Jupyter notebooks.""" + +import json +import warnings +from itertools import chain +from pathlib import Path +from typing import Any, Dict, List, Optional + +from gitingest.utils.exceptions import InvalidNotebookError + + +def process_notebook(file: Path, include_output: bool = True) -> str: + """ + Process a Jupyter notebook file and return an executable Python script as a string. + + Parameters + ---------- + file : Path + The path to the Jupyter notebook file. + include_output : bool + Whether to include cell outputs in the generated script, by default True. + + Returns + ------- + str + The executable Python script as a string. + + Raises + ------ + InvalidNotebookError + If the notebook file is invalid or cannot be processed. + """ + try: + with file.open(encoding="utf-8") as f: + notebook: Dict[str, Any] = json.load(f) + except json.JSONDecodeError as exc: + raise InvalidNotebookError(f"Invalid JSON in notebook: {file}") from exc + + # Check if the notebook contains worksheets + worksheets = notebook.get("worksheets") + if worksheets: + warnings.warn( + "Worksheets are deprecated as of IPEP-17. Consider updating the notebook. " + "(See: https://github.com/jupyter/nbformat and " + "https://github.com/ipython/ipython/wiki/IPEP-17:-Notebook-Format-4#remove-multiple-worksheets " + "for more information.)", + DeprecationWarning, + ) + + if len(worksheets) > 1: + warnings.warn("Multiple worksheets detected. Combining all worksheets into a single script.", UserWarning) + + cells = list(chain.from_iterable(ws["cells"] for ws in worksheets)) + + else: + cells = notebook["cells"] + + result = ["# Jupyter notebook converted to Python script."] + + for cell in cells: + cell_str = _process_cell(cell, include_output=include_output) + if cell_str: + result.append(cell_str) + + return "\n\n".join(result) + "\n" + + +def _process_cell(cell: Dict[str, Any], include_output: bool) -> Optional[str]: + """ + Process a Jupyter notebook cell and return the cell content as a string. + + Parameters + ---------- + cell : Dict[str, Any] + The cell dictionary from a Jupyter notebook. + include_output : bool + Whether to include cell outputs in the generated script + + Returns + ------- + str, optional + The cell content as a string, or None if the cell is empty. + + Raises + ------ + ValueError + If an unexpected cell type is encountered. 
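+
+    Examples
+    --------
+    A minimal sketch with a hand-built code cell:
+
+    >>> _process_cell({"cell_type": "code", "source": ["x = 1"], "outputs": []}, include_output=True)
+    'x = 1'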
+    """
+    cell_type = cell["cell_type"]
+
+    # Validate cell type and handle unexpected types
+    if cell_type not in ("markdown", "code", "raw"):
+        raise ValueError(f"Unknown cell type: {cell_type}")
+
+    cell_str = "".join(cell["source"])
+
+    # Skip empty cells
+    if not cell_str:
+        return None
+
+    # Convert Markdown and raw cells to multi-line comments
+    if cell_type in ("markdown", "raw"):
+        return f'"""\n{cell_str}\n"""'
+
+    # Add cell output as comments
+    outputs = cell.get("outputs")
+    if include_output and outputs:
+
+        # Include cell outputs as comments
+        output_lines = []
+
+        for output in outputs:
+            output_lines += _extract_output(output)
+
+        # Strip trailing newlines so each output line maps onto a single "# " comment line
+        output_lines = [output_line.rstrip("\n") for output_line in output_lines]
+
+        cell_str += "\n# Output:\n# " + "\n# ".join(output_lines)
+
+    return cell_str
+
+
+def _extract_output(output: Dict[str, Any]) -> List[str]:
+    """
+    Extract the output from a Jupyter notebook cell.
+
+    Parameters
+    ----------
+    output : Dict[str, Any]
+        The output dictionary from a Jupyter notebook cell.
+
+    Returns
+    -------
+    List[str]
+        The output as a list of strings.
+
+    Raises
+    ------
+    ValueError
+        If an unknown output type is encountered.
+    """
+    output_type = output["output_type"]
+
+    if output_type == "stream":
+        return output["text"]
+
+    if output_type in ("execute_result", "display_data"):
+        return output["data"]["text/plain"]
+
+    if output_type == "error":
+        return [f"Error: {output['ename']}: {output['evalue']}"]
+
+    raise ValueError(f"Unknown output type: {output_type}")
+
+
+
+================================================
+FILE: src/gitingest/utils/os_utils.py
+================================================
+"""Utility functions for working with the operating system."""
+
+import os
+from pathlib import Path
+
+
+async def ensure_directory(path: Path) -> None:
+    """
+    Ensure the directory exists, creating it if necessary.
+
+    Parameters
+    ----------
+    path : Path
+        The path to ensure exists
+
+    Raises
+    ------
+    OSError
+        If the directory cannot be created
+    """
+    try:
+        os.makedirs(path, exist_ok=True)
+    except OSError as exc:
+        raise OSError(f"Failed to create directory {path}: {exc}") from exc
+
+
+
+================================================
+FILE: src/gitingest/utils/path_utils.py
+================================================
+"""Utility functions for working with file paths."""
+
+import os
+import platform
+from pathlib import Path
+
+
+def _is_safe_symlink(symlink_path: Path, base_path: Path) -> bool:
+    """
+    Check if a symlink points to a location within the base directory.
+
+    This function resolves the target of a symlink and ensures it is within the specified
+    base directory, returning `True` if it is safe, or `False` if the symlink points outside
+    the base directory.
+
+    Parameters
+    ----------
+    symlink_path : Path
+        The path of the symlink to check.
+    base_path : Path
+        The base directory to ensure the symlink points within.
+
+    Returns
+    -------
+    bool
+        `True` if the symlink points within the base directory, `False` otherwise.
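+
+    Examples
+    --------
+    An illustrative check (the paths are hypothetical and must exist on disk):
+
+    >>> _is_safe_symlink(Path("/repo/docs/link"), Path("/repo"))  # doctest: +SKIP
+    True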
+ """ + try: + if platform.system() == "Windows": + if not os.path.islink(str(symlink_path)): + return False + + target_path = symlink_path.resolve() + base_resolved = base_path.resolve() + + return base_resolved in target_path.parents or target_path == base_resolved + except (OSError, ValueError): + # If there's any error resolving the paths, consider it unsafe + return False + + + +================================================ +FILE: src/gitingest/utils/query_parser_utils.py +================================================ +"""Utility functions for parsing and validating query parameters.""" + +import os +import string +from typing import List, Set, Tuple + +HEX_DIGITS: Set[str] = set(string.hexdigits) + + +KNOWN_GIT_HOSTS: List[str] = [ + "github.com", + "gitlab.com", + "bitbucket.org", + "gitea.com", + "codeberg.org", + "gist.github.com", +] + + +def _is_valid_git_commit_hash(commit: str) -> bool: + """ + Validate if the provided string is a valid Git commit hash. + + This function checks if the commit hash is a 40-character string consisting only + of hexadecimal digits, which is the standard format for Git commit hashes. + + Parameters + ---------- + commit : str + The string to validate as a Git commit hash. + + Returns + ------- + bool + True if the string is a valid 40-character Git commit hash, otherwise False. + """ + return len(commit) == 40 and all(c in HEX_DIGITS for c in commit) + + +def _is_valid_pattern(pattern: str) -> bool: + """ + Validate if the given pattern contains only valid characters. + + This function checks if the pattern contains only alphanumeric characters or one + of the following allowed characters: dash (`-`), underscore (`_`), dot (`.`), + forward slash (`/`), plus (`+`), asterisk (`*`), or the at sign (`@`). + + Parameters + ---------- + pattern : str + The pattern to validate. + + Returns + ------- + bool + True if the pattern is valid, otherwise False. + """ + return all(c.isalnum() or c in "-_./+*@" for c in pattern) + + +def _validate_host(host: str) -> None: + """ + Validate a hostname. + + The host is accepted if it is either present in the hard-coded `KNOWN_GIT_HOSTS` list or if it satisfies the + simple heuristics in `_looks_like_git_host`, which try to recognise common self-hosted Git services (e.g. GitLab + instances on sub-domains such as `gitlab.example.com` or `git.example.com`). + + Parameters + ---------- + host : str + Hostname (case-insensitive). + + Raises + ------ + ValueError + If the host cannot be recognised as a probable Git hosting domain. + """ + host = host.lower() + if host not in KNOWN_GIT_HOSTS and not _looks_like_git_host(host): + raise ValueError(f"Unknown domain '{host}' in URL") + + +def _looks_like_git_host(host: str) -> bool: + """ + Check if the given host looks like a Git host. + + The current heuristic returns `True` when the host starts with `git.` (e.g. `git.example.com`) or starts with + `gitlab.` (e.g. `gitlab.company.com`). + + Parameters + ---------- + host : str + Hostname (case-insensitive). + + Returns + ------- + bool + True if the host looks like a Git host, otherwise False. + """ + host = host.lower() + return host.startswith(("git.", "gitlab.")) + + +def _validate_url_scheme(scheme: str) -> None: + """ + Validate the given scheme against the known schemes. + + Parameters + ---------- + scheme : str + The scheme to validate. + + Raises + ------ + ValueError + If the scheme is not 'http' or 'https'. 
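+
+    Examples
+    --------
+    Behaviour as implemented below:
+
+    >>> _validate_url_scheme("https")  # accepted; returns None
+    >>> _validate_url_scheme("ftp")
+    Traceback (most recent call last):
+        ...
+    ValueError: Invalid URL scheme 'ftp' in URL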
+ """ + scheme = scheme.lower() + if scheme not in ("https", "http"): + raise ValueError(f"Invalid URL scheme '{scheme}' in URL") + + +def _get_user_and_repo_from_path(path: str) -> Tuple[str, str]: + """ + Extract the user and repository names from a given path. + + Parameters + ---------- + path : str + The path to extract the user and repository names from. + + Returns + ------- + Tuple[str, str] + A tuple containing the user and repository names. + + Raises + ------ + ValueError + If the path does not contain at least two parts. + """ + path_parts = path.lower().strip("/").split("/") + if len(path_parts) < 2: + raise ValueError(f"Invalid repository URL '{path}'") + return path_parts[0], path_parts[1] + + +def _normalize_pattern(pattern: str) -> str: + """ + Normalize the given pattern by removing leading separators and appending a wildcard. + + This function processes the pattern string by stripping leading directory separators + and appending a wildcard (`*`) if the pattern ends with a separator. + + Parameters + ---------- + pattern : str + The pattern to normalize. + + Returns + ------- + str + The normalized pattern. + """ + pattern = pattern.lstrip(os.sep) + if pattern.endswith(os.sep): + pattern += "*" + return pattern + + + +================================================ +FILE: src/gitingest/utils/timeout_wrapper.py +================================================ +"""Utility functions for the Gitingest package.""" + +import asyncio +import functools +from typing import Any, Awaitable, Callable, TypeVar + +from gitingest.utils.exceptions import AsyncTimeoutError + +T = TypeVar("T") + + +def async_timeout(seconds) -> Callable[[Callable[..., Awaitable[T]]], Callable[..., Awaitable[T]]]: + """ + Async Timeout decorator. + + This decorator wraps an asynchronous function and ensures it does not run for + longer than the specified number of seconds. If the function execution exceeds + this limit, it raises an `AsyncTimeoutError`. + + Parameters + ---------- + seconds : int + The maximum allowed time (in seconds) for the asynchronous function to complete. + + Returns + ------- + Callable[[Callable[..., Awaitable[T]]], Callable[..., Awaitable[T]]] + A decorator that, when applied to an async function, ensures the function + completes within the specified time limit. If the function takes too long, + an `AsyncTimeoutError` is raised. 
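+
+    Examples
+    --------
+    A minimal sketch of the intended usage:
+
+    >>> @async_timeout(5)
+    ... async def long_task() -> None:
+    ...     await asyncio.sleep(10)  # exceeds the limit, so AsyncTimeoutError is raised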
+ """ + + def decorator(func: Callable[..., Awaitable[T]]) -> Callable[..., Awaitable[T]]: + @functools.wraps(func) + async def wrapper(*args: Any, **kwargs: Any) -> T: + try: + return await asyncio.wait_for(func(*args, **kwargs), timeout=seconds) + except asyncio.TimeoutError as exc: + raise AsyncTimeoutError(f"Operation timed out after {seconds} seconds") from exc + + return wrapper + + return decorator + + + +================================================ +FILE: src/server/__init__.py +================================================ + + + +================================================ +FILE: src/server/main.py +================================================ +"""Main module for the FastAPI application.""" + +import os +from pathlib import Path +from typing import Dict + +from dotenv import load_dotenv +from fastapi import FastAPI, Request +from fastapi.responses import FileResponse, HTMLResponse +from fastapi.staticfiles import StaticFiles +from slowapi.errors import RateLimitExceeded +from starlette.middleware.trustedhost import TrustedHostMiddleware + +from server.routers import download, dynamic, index +from server.server_config import templates +from server.server_utils import lifespan, limiter, rate_limit_exception_handler + +# Load environment variables from .env file +load_dotenv() + +# Initialize the FastAPI application with lifespan +app = FastAPI(lifespan=lifespan) +app.state.limiter = limiter + +# Register the custom exception handler for rate limits +app.add_exception_handler(RateLimitExceeded, rate_limit_exception_handler) + + +# Mount static files dynamically to serve CSS, JS, and other static assets +static_dir = Path(__file__).parent.parent / "static" +app.mount("/static", StaticFiles(directory=static_dir), name="static") + + +# Fetch allowed hosts from the environment or use the default values +allowed_hosts = os.getenv("ALLOWED_HOSTS") +if allowed_hosts: + allowed_hosts = allowed_hosts.split(",") +else: + # Define the default allowed hosts for the application + default_allowed_hosts = ["gitingest.com", "*.gitingest.com", "localhost", "127.0.0.1"] + allowed_hosts = default_allowed_hosts + +# Add middleware to enforce allowed hosts +app.add_middleware(TrustedHostMiddleware, allowed_hosts=allowed_hosts) + + +@app.get("/health") +async def health_check() -> Dict[str, str]: + """ + Health check endpoint to verify that the server is running. + + Returns + ------- + Dict[str, str] + A JSON object with a "status" key indicating the server's health status. + """ + return {"status": "healthy"} + + +@app.head("/") +async def head_root() -> HTMLResponse: + """ + Respond to HTTP HEAD requests for the root URL. + + Mirrors the headers and status code of the index page. + + Returns + ------- + HTMLResponse + An empty HTML response with appropriate headers. + """ + return HTMLResponse(content=None, headers={"content-type": "text/html; charset=utf-8"}) + + +@app.get("/api/", response_class=HTMLResponse) +@app.get("/api", response_class=HTMLResponse) +async def api_docs(request: Request) -> HTMLResponse: + """ + Render the API documentation page. + + Parameters + ---------- + request : Request + The incoming HTTP request. + + Returns + ------- + HTMLResponse + A rendered HTML page displaying API documentation. + """ + return templates.TemplateResponse("api.jinja", {"request": request}) + + +@app.get("/robots.txt") +async def robots() -> FileResponse: + """ + Serve the `robots.txt` file to guide search engine crawlers. 
+ + Returns + ------- + FileResponse + The `robots.txt` file located in the static directory. + """ + return FileResponse("static/robots.txt") + + +@app.get("/llm.txt") +async def llm_txt() -> FileResponse: + """ + Serve the `llm.txt` file to provide information about the site to LLMs. + + Returns + ------- + FileResponse + The `llm.txt` file located in the static directory. + """ + return FileResponse("static/llm.txt") + + +# Include routers for modular endpoints +app.include_router(index) +app.include_router(download) +app.include_router(dynamic) + + + +================================================ +FILE: src/server/query_processor.py +================================================ +"""Process a query by parsing input, cloning a repository, and generating a summary.""" + +from functools import partial +from typing import Optional + +from fastapi import Request +from starlette.templating import _TemplateResponse + +from gitingest.cloning import clone_repo +from gitingest.ingestion import ingest_query +from gitingest.query_parsing import IngestionQuery, parse_query +from server.server_config import EXAMPLE_REPOS, MAX_DISPLAY_SIZE, templates +from server.server_utils import Colors, log_slider_to_size + + +async def process_query( + request: Request, + input_text: str, + slider_position: int, + pattern_type: str = "exclude", + pattern: str = "", + is_index: bool = False, + token: Optional[str] = None, +) -> _TemplateResponse: + """ + Process a query by parsing input, cloning a repository, and generating a summary. + + Handle user input, process Git repository data, and prepare + a response for rendering a template with the processed results or an error message. + + Parameters + ---------- + request : Request + The HTTP request object. + input_text : str + Input text provided by the user, typically a Git repository URL or slug. + slider_position : int + Position of the slider, representing the maximum file size in the query. + pattern_type : str + Type of pattern to use, either "include" or "exclude" (default is "exclude"). + pattern : str + Pattern to include or exclude in the query, depending on the pattern type. + is_index : bool + Flag indicating whether the request is for the index page (default is False). + token : str, optional + GitHub personal-access token (PAT). Needed when *input_text* refers to a + **private** repository. + + Returns + ------- + _TemplateResponse + Rendered template response containing the processed results or an error message. + + Raises + ------ + ValueError + If an invalid pattern type is provided. 
+    """
+    if pattern_type == "include":
+        include_patterns = pattern
+        exclude_patterns = None
+    elif pattern_type == "exclude":
+        exclude_patterns = pattern
+        include_patterns = None
+    else:
+        raise ValueError(f"Invalid pattern type: {pattern_type}")
+
+    template = "index.jinja" if is_index else "git.jinja"
+    template_response = partial(templates.TemplateResponse, name=template)
+    max_file_size = log_slider_to_size(slider_position)
+
+    context = {
+        "request": request,
+        "repo_url": input_text,
+        "examples": EXAMPLE_REPOS if is_index else [],
+        "default_file_size": slider_position,
+        "pattern_type": pattern_type,
+        "pattern": pattern,
+        "token": token,
+    }
+
+    try:
+        query: IngestionQuery = await parse_query(
+            source=input_text,
+            max_file_size=max_file_size,
+            from_web=True,
+            include_patterns=include_patterns,
+            ignore_patterns=exclude_patterns,
+            token=token,
+        )
+        if not query.url:
+            raise ValueError("The 'url' parameter is required.")
+
+        # Sets the "user/repo" slug used for the page title
+        context["short_repo_url"] = f"{query.user_name}/{query.repo_name}"
+
+        clone_config = query.extract_clone_config()
+        await clone_repo(clone_config, token=token)
+        summary, tree, content = ingest_query(query)
+        with open(f"{clone_config.local_path}.txt", "w", encoding="utf-8") as f:
+            f.write(tree + "\n" + content)
+    except Exception as exc:
+        # Log the query URL when parsing got far enough to produce one; otherwise emit a bare warning
+        if "query" in locals() and isinstance(query, IngestionQuery) and query.url:
+            _print_error(query.url, exc, max_file_size, pattern_type, pattern)
+        else:
+            print(f"{Colors.BROWN}WARN{Colors.END}: {Colors.RED}<- {Colors.END}", end="")
+            print(f"{Colors.RED}{exc}{Colors.END}")
+
+        context["error_message"] = f"Error: {exc}"
+        if "405" in str(exc):
+            context["error_message"] = (
+                "Repository not found. Please make sure it is public (private repositories will be supported soon)"
+            )
+        return template_response(context=context)
+
+    if len(content) > MAX_DISPLAY_SIZE:
+        content = (
+            f"(Files content cropped to {int(MAX_DISPLAY_SIZE / 1_000)}k characters, "
+            "download full ingest to see more)\n" + content[:MAX_DISPLAY_SIZE]
+        )
+
+    _print_success(
+        url=query.url,
+        max_file_size=max_file_size,
+        pattern_type=pattern_type,
+        pattern=pattern,
+        summary=summary,
+    )
+
+    context.update(
+        {
+            "result": True,
+            "summary": summary,
+            "tree": tree,
+            "content": content,
+            "ingest_id": query.id,
+        }
+    )
+
+    return template_response(context=context)
+
+
+def _print_query(url: str, max_file_size: int, pattern_type: str, pattern: str) -> None:
+    """
+    Print a formatted summary of the query details, including the URL, file size,
+    and pattern information, for easier debugging or logging.
+
+    Parameters
+    ----------
+    url : str
+        The URL associated with the query.
+    max_file_size : int
+        The maximum file size allowed for the query, in bytes.
+    pattern_type : str
+        Specifies the type of pattern to use, either "include" or "exclude".
+    pattern : str
+        The actual pattern string to include or exclude in the query.
+ """ + print(f"{Colors.WHITE}{url:<20}{Colors.END}", end="") + if int(max_file_size / 1024) != 50: + print(f" | {Colors.YELLOW}Size: {int(max_file_size/1024)}kb{Colors.END}", end="") + if pattern_type == "include" and pattern != "": + print(f" | {Colors.YELLOW}Include {pattern}{Colors.END}", end="") + elif pattern_type == "exclude" and pattern != "": + print(f" | {Colors.YELLOW}Exclude {pattern}{Colors.END}", end="") + + +def _print_error(url: str, e: Exception, max_file_size: int, pattern_type: str, pattern: str) -> None: + """ + Print a formatted error message including the URL, file size, pattern details, and the exception encountered, + for debugging or logging purposes. + + Parameters + ---------- + url : str + The URL associated with the query that caused the error. + e : Exception + The exception raised during the query or process. + max_file_size : int + The maximum file size allowed for the query, in bytes. + pattern_type : str + Specifies the type of pattern to use, either "include" or "exclude". + pattern : str + The actual pattern string to include or exclude in the query. + """ + print(f"{Colors.BROWN}WARN{Colors.END}: {Colors.RED}<- {Colors.END}", end="") + _print_query(url, max_file_size, pattern_type, pattern) + print(f" | {Colors.RED}{e}{Colors.END}") + + +def _print_success(url: str, max_file_size: int, pattern_type: str, pattern: str, summary: str) -> None: + """ + Print a formatted success message, including the URL, file size, pattern details, and a summary with estimated + tokens, for debugging or logging purposes. + + Parameters + ---------- + url : str + The URL associated with the successful query. + max_file_size : int + The maximum file size allowed for the query, in bytes. + pattern_type : str + Specifies the type of pattern to use, either "include" or "exclude". + pattern : str + The actual pattern string to include or exclude in the query. + summary : str + A summary of the query result, including details like estimated tokens. 
+ """ + estimated_tokens = summary[summary.index("Estimated tokens:") + len("Estimated ") :] + print(f"{Colors.GREEN}INFO{Colors.END}: {Colors.GREEN}<- {Colors.END}", end="") + _print_query(url, max_file_size, pattern_type, pattern) + print(f" | {Colors.PURPLE}{estimated_tokens}{Colors.END}") + + + +================================================ +FILE: src/server/server_config.py +================================================ +"""Configuration for the server.""" + +from typing import Dict, List + +from fastapi.templating import Jinja2Templates + +MAX_DISPLAY_SIZE: int = 300_000 +DELETE_REPO_AFTER: int = 60 * 60 # In seconds + + +EXAMPLE_REPOS: List[Dict[str, str]] = [ + {"name": "Gitingest", "url": "https://github.com/cyclotruc/gitingest"}, + {"name": "FastAPI", "url": "https://github.com/tiangolo/fastapi"}, + {"name": "Flask", "url": "https://github.com/pallets/flask"}, + {"name": "Excalidraw", "url": "https://github.com/excalidraw/excalidraw"}, + {"name": "ApiAnalytics", "url": "https://github.com/tom-draper/api-analytics"}, +] + +templates = Jinja2Templates(directory="server/templates") + + + +================================================ +FILE: src/server/server_utils.py +================================================ +"""Utility functions for the server.""" + +import asyncio +import math +import shutil +import time +from contextlib import asynccontextmanager +from pathlib import Path + +from fastapi import FastAPI, Request +from fastapi.responses import Response +from slowapi import Limiter, _rate_limit_exceeded_handler +from slowapi.errors import RateLimitExceeded +from slowapi.util import get_remote_address + +from gitingest.config import TMP_BASE_PATH +from server.server_config import DELETE_REPO_AFTER + +# Initialize a rate limiter +limiter = Limiter(key_func=get_remote_address) + + +async def rate_limit_exception_handler(request: Request, exc: Exception) -> Response: + """ + Custom exception handler for rate-limiting errors. + + Parameters + ---------- + request : Request + The incoming HTTP request. + exc : Exception + The exception raised, expected to be RateLimitExceeded. + + Returns + ------- + Response + A response indicating that the rate limit has been exceeded. + + Raises + ------ + exc + If the exception is not a RateLimitExceeded error, it is re-raised. + """ + if isinstance(exc, RateLimitExceeded): + # Delegate to the default rate limit handler + return _rate_limit_exceeded_handler(request, exc) + # Re-raise other exceptions + raise exc + + +@asynccontextmanager +async def lifespan(_: FastAPI): + """ + Lifecycle manager for handling startup and shutdown events for the FastAPI application. + + Parameters + ---------- + _ : FastAPI + The FastAPI application instance (unused). + + Yields + ------- + None + Yields control back to the FastAPI application while the background task runs. + """ + task = asyncio.create_task(_remove_old_repositories()) + + yield + # Cancel the background task on shutdown + task.cancel() + try: + await task + except asyncio.CancelledError: + pass + + +async def _remove_old_repositories(): + """ + Periodically remove old repository folders. + + Background task that runs periodically to clean up old repository directories. 
+
+    This task:
+    - Scans the TMP_BASE_PATH directory every 60 seconds
+    - Removes directories older than DELETE_REPO_AFTER seconds
+    - Before deletion, logs repository URLs to history.txt if a matching .txt file exists
+    - Handles errors gracefully if deletion fails
+
+    The repository URL is extracted from the first .txt file in each directory,
+    assuming the filename format: "owner-repository.txt"
+    """
+    while True:
+        try:
+            if not TMP_BASE_PATH.exists():
+                await asyncio.sleep(60)
+                continue
+
+            current_time = time.time()
+
+            for folder in TMP_BASE_PATH.iterdir():
+                # Skip if folder is not old enough
+                if current_time - folder.stat().st_ctime <= DELETE_REPO_AFTER:
+                    continue
+
+                await _process_folder(folder)
+
+        except Exception as exc:
+            print(f"Error in _remove_old_repositories: {exc}")
+
+        await asyncio.sleep(60)
+
+
+async def _process_folder(folder: Path) -> None:
+    """
+    Process a single folder for deletion and logging.
+
+    Parameters
+    ----------
+    folder : Path
+        The path to the folder to be processed.
+    """
+    # Try to log repository URL before deletion
+    try:
+        txt_files = [f for f in folder.iterdir() if f.suffix == ".txt"]
+
+        # Extract owner and repository name from the first digest filename, if one exists
+        if txt_files and "-" in txt_files[0].stem:
+            owner, repo = txt_files[0].stem.split("-", 1)
+            repo_url = f"{owner}/{repo}"
+
+            with open("history.txt", mode="a", encoding="utf-8") as history:
+                history.write(f"{repo_url}\n")
+
+    except Exception as exc:
+        print(f"Error logging repository URL for {folder}: {exc}")
+
+    # Delete the folder
+    try:
+        shutil.rmtree(folder)
+    except Exception as exc:
+        print(f"Error deleting {folder}: {exc}")
+
+
+def log_slider_to_size(position: int) -> int:
+    """
+    Convert a slider position to a file size in bytes using a logarithmic scale.
+
+    Parameters
+    ----------
+    position : int
+        Slider position ranging from 0 to 500.
+
+    Returns
+    -------
+    int
+        File size in bytes corresponding to the slider position.
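+
+    Examples
+    --------
+    The scale runs from 1 KiB at position 0 up to 100 MiB at position 500:
+
+    >>> log_slider_to_size(0)
+    1024
+    >>> log_slider_to_size(500)
+    104857600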
+ """ + maxp = 500 + minv = math.log(1) + maxv = math.log(102_400) + return round(math.exp(minv + (maxv - minv) * pow(position / maxp, 1.5))) * 1024 + + +## Color printing utility +class Colors: + """ANSI color codes""" + + BLACK = "\033[0;30m" + RED = "\033[0;31m" + GREEN = "\033[0;32m" + BROWN = "\033[0;33m" + BLUE = "\033[0;34m" + PURPLE = "\033[0;35m" + CYAN = "\033[0;36m" + LIGHT_GRAY = "\033[0;37m" + DARK_GRAY = "\033[1;30m" + LIGHT_RED = "\033[1;31m" + LIGHT_GREEN = "\033[1;32m" + YELLOW = "\033[1;33m" + LIGHT_BLUE = "\033[1;34m" + LIGHT_PURPLE = "\033[1;35m" + LIGHT_CYAN = "\033[1;36m" + WHITE = "\033[1;37m" + BOLD = "\033[1m" + FAINT = "\033[2m" + ITALIC = "\033[3m" + UNDERLINE = "\033[4m" + BLINK = "\033[5m" + NEGATIVE = "\033[7m" + CROSSED = "\033[9m" + END = "\033[0m" + + + +================================================ +FILE: src/server/routers/__init__.py +================================================ +"""This module contains the routers for the FastAPI application.""" + +from server.routers.download import router as download +from server.routers.dynamic import router as dynamic +from server.routers.index import router as index + +__all__ = ["download", "dynamic", "index"] + + + +================================================ +FILE: src/server/routers/download.py +================================================ +"""This module contains the FastAPI router for downloading a digest file.""" + +from fastapi import APIRouter, HTTPException +from fastapi.responses import Response + +from gitingest.config import TMP_BASE_PATH + +router = APIRouter() + + +@router.get("/download/{digest_id}") +async def download_ingest(digest_id: str) -> Response: + """ + Download a .txt file associated with a given digest ID. + + This function searches for a `.txt` file in a directory corresponding to the provided + digest ID. If a file is found, it is read and returned as a downloadable attachment. + If no `.txt` file is found, an error is raised. + + Parameters + ---------- + digest_id : str + The unique identifier for the digest. It is used to find the corresponding directory + and locate the .txt file within that directory. + + Returns + ------- + Response + A FastAPI Response object containing the content of the found `.txt` file. The file is + sent with the appropriate media type (`text/plain`) and the correct `Content-Disposition` + header to prompt a file download. + + Raises + ------ + HTTPException + If the digest directory is not found or if no `.txt` file exists in the directory. 
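+
+    Examples
+    --------
+    An illustrative exchange, assuming a FastAPI test client and a hypothetical digest ID:
+
+    >>> response = client.get("/download/abc123")  # doctest: +SKIP
+    >>> response.headers["Content-Disposition"]  # doctest: +SKIP
+    'attachment; filename=user-repo.txt'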
+ """ + directory = TMP_BASE_PATH / digest_id + + try: + if not directory.exists(): + raise FileNotFoundError("Directory not found") + + txt_files = [f for f in directory.iterdir() if f.suffix == ".txt"] + if not txt_files: + raise FileNotFoundError("No .txt file found") + + except FileNotFoundError as exc: + raise HTTPException(status_code=404, detail="Digest not found") from exc + + # Find the first .txt file in the directory + first_file = txt_files[0] + + with first_file.open(encoding="utf-8") as f: + content = f.read() + + return Response( + content=content, + media_type="text/plain", + headers={"Content-Disposition": f"attachment; filename={first_file.name}"}, + ) + + + +================================================ +FILE: src/server/routers/dynamic.py +================================================ +"""This module defines the dynamic router for handling dynamic path requests.""" + +from fastapi import APIRouter, Form, Request +from fastapi.responses import HTMLResponse + +from server.query_processor import process_query +from server.server_config import templates +from server.server_utils import limiter + +router = APIRouter() + + +@router.get("/{full_path:path}") +async def catch_all(request: Request, full_path: str) -> HTMLResponse: + """ + Render a page with a Git URL based on the provided path. + + This endpoint catches all GET requests with a dynamic path, constructs a Git URL + using the `full_path` parameter, and renders the `git.jinja` template with that URL. + + Parameters + ---------- + request : Request + The incoming request object, which provides context for rendering the response. + full_path : str + The full path extracted from the URL, which is used to build the Git URL. + + Returns + ------- + HTMLResponse + An HTML response containing the rendered template, with the Git URL + and other default parameters such as loading state and file size. + """ + return templates.TemplateResponse( + "git.jinja", + { + "request": request, + "repo_url": full_path, + "loading": True, + "default_file_size": 243, + }, + ) + + +@router.post("/{full_path:path}", response_class=HTMLResponse) +@limiter.limit("10/minute") +async def process_catch_all( + request: Request, + input_text: str = Form(...), + max_file_size: int = Form(...), + pattern_type: str = Form(...), + pattern: str = Form(...), + token: str = Form(...), +) -> HTMLResponse: + """ + Process the form submission with user input for query parameters. + + This endpoint handles POST requests, processes the input parameters (e.g., text, file size, pattern), + and calls the `process_query` function to handle the query logic, returning the result as an HTML response. + + Parameters + ---------- + request : Request + The incoming request object, which provides context for rendering the response. + input_text : str + The input text provided by the user for processing, by default taken from the form. + max_file_size : int + The maximum allowed file size for the input, specified by the user. + pattern_type : str + The type of pattern used for the query, specified by the user. + pattern : str + The pattern string used in the query, specified by the user. + token : str + GitHub personal-access token (PAT). Needed when *input_text* refers to a + **private** repository. + Returns + ------- + HTMLResponse + An HTML response generated after processing the form input and query logic, + which will be rendered and returned to the user. 
+ """ + resolved_token = None if token == "" else token + return await process_query( + request, + input_text, + max_file_size, + pattern_type, + pattern, + is_index=False, + token=resolved_token, + ) + + + +================================================ +FILE: src/server/routers/index.py +================================================ +"""This module defines the FastAPI router for the home page of the application.""" + +from fastapi import APIRouter, Form, Request +from fastapi.responses import HTMLResponse + +from server.query_processor import process_query +from server.server_config import EXAMPLE_REPOS, templates +from server.server_utils import limiter + +router = APIRouter() + + +@router.get("/", response_class=HTMLResponse) +async def home(request: Request) -> HTMLResponse: + """ + Render the home page with example repositories and default parameters. + + This endpoint serves the home page of the application, rendering the `index.jinja` template + and providing it with a list of example repositories and default file size values. + + Parameters + ---------- + request : Request + The incoming request object, which provides context for rendering the response. + + Returns + ------- + HTMLResponse + An HTML response containing the rendered home page template, with example repositories + and other default parameters such as file size. + """ + return templates.TemplateResponse( + "index.jinja", + { + "request": request, + "examples": EXAMPLE_REPOS, + "default_file_size": 243, + }, + ) + + +@router.post("/", response_class=HTMLResponse) +@limiter.limit("10/minute") +async def index_post( + request: Request, + input_text: str = Form(...), + max_file_size: int = Form(...), + pattern_type: str = Form(...), + pattern: str = Form(...), + token: str = Form(...), +) -> HTMLResponse: + """ + Process the form submission with user input for query parameters. + + This endpoint handles POST requests from the home page form. It processes the user-submitted + input (e.g., text, file size, pattern type) and invokes the `process_query` function to handle + the query logic, returning the result as an HTML response. + + Parameters + ---------- + request : Request + The incoming request object, which provides context for rendering the response. + input_text : str + The input text provided by the user for processing, by default taken from the form. + max_file_size : int + The maximum allowed file size for the input, specified by the user. + pattern_type : str + The type of pattern used for the query, specified by the user. + pattern : str + The pattern string used in the query, specified by the user. + token : str + GitHub personal-access token (PAT). Needed when *input_text* refers to a + **private** repository. + Returns + ------- + HTMLResponse + An HTML response containing the results of processing the form input and query logic, + which will be rendered and returned to the user. + """ + resolved_token = None if token == "" else token + return await process_query( + request, + input_text, + max_file_size, + pattern_type, + pattern, + is_index=True, + token=resolved_token, + ) + + + +================================================ +FILE: tests/__init__.py +================================================ + + + +================================================ +FILE: tests/conftest.py +================================================ +""" +Fixtures for tests. 
+ +This file provides shared fixtures for creating sample queries, a temporary directory structure, and a helper function +to write `.ipynb` notebooks for testing notebook utilities. +""" + +import json +from pathlib import Path +from typing import Any, Callable, Dict, List +from unittest.mock import AsyncMock + +import pytest +from pytest_mock import MockerFixture + +from gitingest.query_parsing import IngestionQuery + +WriteNotebookFunc = Callable[[str, Dict[str, Any]], Path] + +DEMO_URL = "https://github.com/user/repo" +LOCAL_REPO_PATH = "/tmp/repo" + + +@pytest.fixture +def sample_query() -> IngestionQuery: + """ + Provide a default `IngestionQuery` object for use in tests. + + This fixture returns a `IngestionQuery` pre-populated with typical fields and some default ignore patterns. + + Returns + ------- + IngestionQuery + The sample `IngestionQuery` object. + """ + return IngestionQuery( + user_name="test_user", + repo_name="test_repo", + url=None, + subpath="/", + local_path=Path("/tmp/test_repo").resolve(), + slug="test_user/test_repo", + id="id", + branch="main", + max_file_size=1_000_000, + ignore_patterns={"*.pyc", "__pycache__", ".git"}, + include_patterns=None, + ) + + +@pytest.fixture +def temp_directory(tmp_path: Path) -> Path: + """ + Create a temporary directory structure for testing repository scanning. + + The structure includes: + test_repo/ + ├── file1.txt + ├── file2.py + ├── src/ + │ ├── subfile1.txt + │ ├── subfile2.py + │ └── subdir/ + │ ├── file_subdir.txt + │ └── file_subdir.py + ├── dir1/ + │ └── file_dir1.txt + └── dir2/ + └── file_dir2.txt + + Parameters + ---------- + tmp_path : Path + The temporary directory path provided by the `tmp_path` fixture. + + Returns + ------- + Path + The path to the created `test_repo` directory. + """ + test_dir = tmp_path / "test_repo" + test_dir.mkdir() + + # Root files + (test_dir / "file1.txt").write_text("Hello World") + (test_dir / "file2.py").write_text("print('Hello')") + + # src directory and its files + src_dir = test_dir / "src" + src_dir.mkdir() + (src_dir / "subfile1.txt").write_text("Hello from src") + (src_dir / "subfile2.py").write_text("print('Hello from src')") + + # src/subdir and its files + subdir = src_dir / "subdir" + subdir.mkdir() + (subdir / "file_subdir.txt").write_text("Hello from subdir") + (subdir / "file_subdir.py").write_text("print('Hello from subdir')") + + # dir1 and its file + dir1 = test_dir / "dir1" + dir1.mkdir() + (dir1 / "file_dir1.txt").write_text("Hello from dir1") + + # dir2 and its file + dir2 = test_dir / "dir2" + dir2.mkdir() + (dir2 / "file_dir2.txt").write_text("Hello from dir2") + + return test_dir + + +@pytest.fixture +def write_notebook(tmp_path: Path) -> WriteNotebookFunc: + """ + Provide a helper function to write a `.ipynb` notebook file with the given content. + + Parameters + ---------- + tmp_path : Path + The temporary directory path provided by the `tmp_path` fixture. + + Returns + ------- + WriteNotebookFunc + A callable that accepts a filename and a dictionary (representing JSON notebook data), writes it to a `.ipynb` + file, and returns the path to the file. 
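+
+    Examples
+    --------
+    A sketch of use inside a test function (the notebook content is deliberately minimal):
+
+    >>> def test_minimal_notebook(write_notebook):
+    ...     nb_path = write_notebook("minimal.ipynb", {"cells": []})
+    ...     assert nb_path.name == "minimal.ipynb"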
+    """
+
+    def _write_notebook(name: str, content: Dict[str, Any]) -> Path:
+        notebook_path = tmp_path / name
+        with notebook_path.open(mode="w", encoding="utf-8") as f:
+            json.dump(content, f)
+        return notebook_path
+
+    return _write_notebook
+
+
+@pytest.fixture
+def stub_branches(mocker: MockerFixture) -> Callable[[List[str]], None]:
+    """Return a function that stubs git branch discovery to *branches*."""
+
+    def _factory(branches: List[str]) -> None:
+        mocker.patch(
+            "gitingest.utils.git_utils.run_command",
+            new_callable=AsyncMock,
+            return_value=("\n".join(f"refs/heads/{b}" for b in branches).encode() + b"\n", b""),
+        )
+        mocker.patch(
+            "gitingest.utils.git_utils.fetch_remote_branch_list",
+            new_callable=AsyncMock,
+            return_value=branches,
+        )
+
+    return _factory
+
+
+@pytest.fixture
+def repo_exists_true(mocker: MockerFixture) -> AsyncMock:
+    """Patch `gitingest.cloning.check_repo_exists` to always return ``True``.
+
+    Many cloning-related tests assume that the remote repository exists. This fixture centralises
+    that behaviour so individual tests no longer need to repeat the same ``mocker.patch`` call.
+    The mock object is returned so that tests can make assertions on how it was used or override
+    its behaviour when needed.
+    """
+    return mocker.patch("gitingest.cloning.check_repo_exists", return_value=True)
+
+
+@pytest.fixture
+def run_command_mock(mocker: MockerFixture) -> AsyncMock:
+    """Patch `gitingest.cloning.run_command` with an ``AsyncMock``.
+
+    The mocked function returns a dummy process whose ``communicate`` method yields generic
+    *stdout* / *stderr* bytes. Tests can still access / tweak the mock via the fixture argument.
+    """
+    mock_exec = mocker.patch("gitingest.cloning.run_command", new_callable=AsyncMock)
+
+    # Provide a default dummy process so most tests don't have to create one.
+    dummy_process = AsyncMock()
+    dummy_process.communicate.return_value = (b"output", b"error")
+    mock_exec.return_value = dummy_process
+
+    return mock_exec
+
+
+
+================================================
+FILE: tests/test_cli.py
+================================================
+"""Tests for the Gitingest CLI."""
+
+import os
+from inspect import signature
+from pathlib import Path
+from typing import List
+
+import pytest
+from _pytest.monkeypatch import MonkeyPatch
+from click.testing import CliRunner, Result
+
+from gitingest.cli import main
+from gitingest.config import MAX_FILE_SIZE, OUTPUT_FILE_NAME
+
+
+@pytest.mark.parametrize(
+    "cli_args, expect_file",
+    [
+        pytest.param(["./"], True, id="default-options"),
+        pytest.param(
+            [
+                "./",
+                "--output",
+                str(OUTPUT_FILE_NAME),
+                "--max-size",
+                str(MAX_FILE_SIZE),
+                "--exclude-pattern",
+                "tests/",
+                "--include-pattern",
+                "src/",
+            ],
+            True,
+            id="custom-options",
+        ),
+    ],
+)
+def test_cli_writes_file(tmp_path: Path, monkeypatch: MonkeyPatch, cli_args: List[str], expect_file: bool) -> None:
+    """Run the CLI and verify that the digest file is created (or not)."""
+    # Work inside an isolated temp directory
+    monkeypatch.chdir(tmp_path)
+
+    result = _invoke_isolated_cli_runner(cli_args)
+
+    assert result.exit_code == 0, result.stderr
+
+    # Summary line should be on STDOUT
+    stdout_lines = result.stdout.splitlines()
+    assert f"Analysis complete! Output written to: {OUTPUT_FILE_NAME}" in stdout_lines
+
+    # File side-effect
+    digest_file = tmp_path / OUTPUT_FILE_NAME
+    assert digest_file.exists() is expect_file, f"{OUTPUT_FILE_NAME} existence did not match expectation"
+
+
+def test_cli_with_stdout_output() -> None:
+    """Test CLI invocation with output directed to STDOUT."""
+    # Clean up any existing digest.txt file before test
+    if os.path.exists(OUTPUT_FILE_NAME):
+        os.remove(OUTPUT_FILE_NAME)
+
+    try:
+        result = _invoke_isolated_cli_runner(["./", "--output", "-", "--exclude-pattern", "tests/"])
+
+        # ─── core expectations (stdout) ────────────────────────────────────-
+        assert result.exit_code == 0, f"CLI exited with code {result.exit_code}, stderr: {result.stderr}"
+        assert "---" in result.stdout, "Expected file separator '---' not found in STDOUT"
+        assert (
+            "src/gitingest/cli.py" in result.stdout
+        ), "Expected content (e.g., src/gitingest/cli.py) not found in STDOUT"
+        assert not os.path.exists(OUTPUT_FILE_NAME), f"Output file {OUTPUT_FILE_NAME} was unexpectedly created."
+
+        # ─── the summary must *not* pollute STDOUT, must appear on STDERR ───
+        summary = "Analysis complete! Output sent to stdout."
+        stdout_lines = result.stdout.splitlines()
+        stderr_lines = result.stderr.splitlines()
+        assert summary not in stdout_lines, "Unexpected summary message found in STDOUT"
+        assert summary in stderr_lines, "Expected summary message not found in STDERR"
+        assert f"Output written to: {OUTPUT_FILE_NAME}" not in stderr_lines
+    finally:
+        # Clean up any digest.txt file that might have been created during test
+        if os.path.exists(OUTPUT_FILE_NAME):
+            os.remove(OUTPUT_FILE_NAME)
+
+
+def _invoke_isolated_cli_runner(args: List[str]) -> Result:
+    """Return a CliRunner that keeps stderr apart on Click 8.0-8.1."""
+    kwargs = {}
+    if "mix_stderr" in signature(CliRunner.__init__).parameters:
+        kwargs["mix_stderr"] = False  # Click 8.0–8.1
+    runner = CliRunner(**kwargs)
+    return runner.invoke(main, args)
+
+
+
+================================================
+FILE: tests/test_flow_integration.py
+================================================
+"""Integration tests covering core functionalities, edge cases, and concurrency handling."""
+
+import shutil
+from concurrent.futures import ThreadPoolExecutor
+from pathlib import Path
+from typing import Generator
+
+import pytest
+from fastapi.testclient import TestClient
+from pytest import FixtureRequest
+from pytest_mock import MockerFixture
+
+from src.server.main import app
+
+BASE_DIR = Path(__file__).resolve().parent.parent
+TEMPLATE_DIR = BASE_DIR / "src" / "templates"
+
+
+@pytest.fixture(scope="module")
+def test_client() -> Generator[TestClient, None, None]:
+    """Create a test client fixture."""
+    with TestClient(app) as client_instance:
+        client_instance.headers.update({"Host": "localhost"})
+        yield client_instance
+
+
+@pytest.fixture(autouse=True)
+def mock_static_files(mocker: MockerFixture) -> Generator[None, None, None]:
+    """Mock the static file mount to avoid directory errors."""
+    mock_static = mocker.patch("src.server.main.StaticFiles", autospec=True)
+    mock_static.return_value = None
+    yield mock_static
+
+
+@pytest.fixture(autouse=True)
+def mock_templates(mocker: MockerFixture) -> Generator[None, None, None]:
+    """Mock Jinja2 template rendering to bypass actual file loading."""
+    mock_template = mocker.patch("starlette.templating.Jinja2Templates.TemplateResponse", autospec=True)
+    mock_template.return_value = "Mocked Template Response"
+    yield mock_template
+
+
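+# Both fixtures above are autouse, so every test in this module runs with the
+# template and static layers stubbed out; requests exercise routing and form
+# handling without reading anything from src/templates or src/static.
+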
+@pytest.fixture(scope="module", autouse=True) +def cleanup_tmp_dir() -> Generator[None, None, None]: + """Remove /tmp/gitingest after this test-module is done.""" + yield # run tests + temp_dir = Path("/tmp/gitingest") + if temp_dir.exists(): + try: + shutil.rmtree(temp_dir) + except PermissionError as exc: + print(f"Error cleaning up {temp_dir}: {exc}") + + +@pytest.mark.asyncio +async def test_remote_repository_analysis(request: FixtureRequest) -> None: + """Test the complete flow of analyzing a remote repository.""" + client = request.getfixturevalue("test_client") + form_data = { + "input_text": "https://github.com/octocat/Hello-World", + "max_file_size": "243", + "pattern_type": "exclude", + "pattern": "", + "token": "", + } + + response = client.post("/", data=form_data) + assert response.status_code == 200, f"Form submission failed: {response.text}" + assert "Mocked Template Response" in response.text + + +@pytest.mark.asyncio +async def test_invalid_repository_url(request: FixtureRequest) -> None: + """Test handling of an invalid repository URL.""" + client = request.getfixturevalue("test_client") + form_data = { + "input_text": "https://github.com/nonexistent/repo", + "max_file_size": "243", + "pattern_type": "exclude", + "pattern": "", + "token": "", + } + + response = client.post("/", data=form_data) + assert response.status_code == 200, f"Request failed: {response.text}" + assert "Mocked Template Response" in response.text + + +@pytest.mark.asyncio +async def test_large_repository(request: FixtureRequest) -> None: + """Simulate analysis of a large repository with nested folders.""" + client = request.getfixturevalue("test_client") + form_data = { + "input_text": "https://github.com/large/repo-with-many-files", + "max_file_size": "243", + "pattern_type": "exclude", + "pattern": "", + "token": "", + } + + response = client.post("/", data=form_data) + assert response.status_code == 200, f"Request failed: {response.text}" + assert "Mocked Template Response" in response.text + + +@pytest.mark.asyncio +async def test_concurrent_requests(request: FixtureRequest) -> None: + """Test handling of multiple concurrent requests.""" + client = request.getfixturevalue("test_client") + + def make_request(): + form_data = { + "input_text": "https://github.com/octocat/Hello-World", + "max_file_size": "243", + "pattern_type": "exclude", + "pattern": "", + "token": "", + } + response = client.post("/", data=form_data) + assert response.status_code == 200, f"Request failed: {response.text}" + assert "Mocked Template Response" in response.text + + with ThreadPoolExecutor(max_workers=5) as executor: + futures = [executor.submit(make_request) for _ in range(5)] + for future in futures: + future.result() + + +@pytest.mark.asyncio +async def test_large_file_handling(request: FixtureRequest) -> None: + """Test handling of repositories with large files.""" + client = request.getfixturevalue("test_client") + form_data = { + "input_text": "https://github.com/octocat/Hello-World", + "max_file_size": "1", + "pattern_type": "exclude", + "pattern": "", + "token": "", + } + + response = client.post("/", data=form_data) + assert response.status_code == 200, f"Request failed: {response.text}" + assert "Mocked Template Response" in response.text + + +@pytest.mark.asyncio +async def test_repository_with_patterns(request: FixtureRequest) -> None: + """Test repository analysis with include/exclude patterns.""" + client = request.getfixturevalue("test_client") + form_data = { + "input_text": 
"https://github.com/octocat/Hello-World", + "max_file_size": "243", + "pattern_type": "include", + "pattern": "*.md", + "token": "", + } + + response = client.post("/", data=form_data) + assert response.status_code == 200, f"Request failed: {response.text}" + assert "Mocked Template Response" in response.text + + + +================================================ +FILE: tests/test_git_utils.py +================================================ +""" +Tests for the `git_utils` module. + +These tests validate the `validate_github_token` function, which ensures that +GitHub personal access tokens (PATs) are properly formatted. +""" + +import base64 + +import pytest + +from gitingest.utils.exceptions import InvalidGitHubTokenError +from gitingest.utils.git_utils import ( + create_git_auth_header, + create_git_command, + validate_github_token, +) + + +@pytest.mark.parametrize( + "token", + [ + # Valid tokens: correct prefixes and at least 36 allowed characters afterwards + "github_pat_" + "a" * 36, + "ghp_" + "A" * 36, + "github_pat_1234567890abcdef1234567890abcdef1234", + ], +) +def test_validate_github_token_valid(token): + """validate_github_token should accept properly-formatted tokens.""" + # Should not raise any exception + validate_github_token(token) + + +@pytest.mark.parametrize( + "token", + [ + "github_pat_short", # Too short after prefix + "ghp_" + "b" * 35, # one character short + "invalidprefix_" + "c" * 36, # Wrong prefix + "github_pat_" + "!" * 36, # Disallowed characters + "", # Empty string + ], +) +def test_validate_github_token_invalid(token): + """validate_github_token should raise ValueError on malformed tokens.""" + with pytest.raises(InvalidGitHubTokenError): + validate_github_token(token) + + +@pytest.mark.parametrize( + "base_cmd, local_path, url, token, expected_suffix", + [ + ( + ["git", "clone"], + "/some/path", + "https://github.com/owner/repo.git", + None, + [], # No auth header expected when token is None + ), + ( + ["git", "clone"], + "/some/path", + "https://github.com/owner/repo.git", + "ghp_" + "d" * 36, + [ + "-c", + create_git_auth_header("ghp_" + "d" * 36), + ], # Auth header expected for GitHub URL + token + ), + ( + ["git", "clone"], + "/some/path", + "https://gitlab.com/owner/repo.git", + "ghp_" + "e" * 36, + [], # No auth header for non-GitHub URL even if token provided + ), + ], +) +def test_create_git_command(base_cmd, local_path, url, token, expected_suffix): + """create_git_command should build the correct command list based on inputs.""" + cmd = create_git_command(base_cmd, local_path, url, token) + + # The command should start with base_cmd and the -C option + expected_prefix = base_cmd + ["-C", local_path] + assert cmd[: len(expected_prefix)] == expected_prefix + + # The suffix (anything after prefix) should match expected + assert cmd[len(expected_prefix) :] == expected_suffix + + +def test_create_git_command_invalid_token(): + """Supplying an invalid token for a GitHub URL should raise ValueError.""" + with pytest.raises(InvalidGitHubTokenError): + create_git_command( + ["git", "clone"], + "/some/path", + "https://github.com/owner/repo.git", + "invalid_token", + ) + + +@pytest.mark.parametrize( + "token", + [ + "ghp_abcdefghijklmnopqrstuvwxyz012345", # typical ghp_ token + "github_pat_1234567890abcdef1234567890abcdef1234", + ], +) +def test_create_git_auth_header(token): + """create_git_auth_header should produce correct base64-encoded header.""" + header = create_git_auth_header(token) + expected_basic = 
base64.b64encode(f"x-oauth-basic:{token}".encode()).decode() + expected = f"http.https://github.com/.extraheader=Authorization: Basic {expected_basic}" + assert header == expected + + +@pytest.mark.parametrize( + "url, token, should_call", + [ + ("https://github.com/foo/bar.git", "ghp_" + "f" * 36, True), + ("https://github.com/foo/bar.git", None, False), + ("https://gitlab.com/foo/bar.git", "ghp_" + "g" * 36, False), + ], +) +def test_create_git_command_helper_calls(mocker, url, token, should_call): + """Verify validate_github_token & create_git_auth_header are invoked only when appropriate.""" + + validate_mock = mocker.patch("gitingest.utils.git_utils.validate_github_token") + header_mock = mocker.patch("gitingest.utils.git_utils.create_git_auth_header", return_value="HEADER") + + cmd = create_git_command(["git", "clone"], "/tmp", url, token) + + if should_call: + validate_mock.assert_called_once_with(token) + header_mock.assert_called_once_with(token) + assert "HEADER" in cmd + else: + validate_mock.assert_not_called() + header_mock.assert_not_called() + # HEADER should not be included in command list + assert "HEADER" not in cmd + + + +================================================ +FILE: tests/test_gitignore_feature.py +================================================ +""" +Tests for the gitignore functionality in Gitingest. +""" + +from pathlib import Path + +import pytest + +from gitingest.entrypoint import ingest_async +from gitingest.utils.ignore_patterns import load_gitignore_patterns + + +@pytest.fixture(name="repo_path") +def repo_fixture(tmp_path: Path) -> Path: + """ + Create a temporary repository structure with: + - A .gitignore that excludes 'exclude.txt' + - 'include.txt' (should be processed) + - 'exclude.txt' (should be skipped when gitignore rules are respected) + """ + # Create a .gitignore file that excludes 'exclude.txt' + gitignore_file = tmp_path / ".gitignore" + gitignore_file.write_text("exclude.txt\n") + + # Create a file that should be included + include_file = tmp_path / "include.txt" + include_file.write_text("This file should be included.") + + # Create a file that should be excluded + exclude_file = tmp_path / "exclude.txt" + exclude_file.write_text("This file should be excluded.") + + return tmp_path + + +def test_load_gitignore_patterns(tmp_path: Path): + """ + Test that load_gitignore_patterns() correctly loads patterns from a .gitignore file. + """ + gitignore = tmp_path / ".gitignore" + # Write some sample patterns with a comment line included + gitignore.write_text("exclude.txt\n*.log\n# a comment\n") + + patterns = load_gitignore_patterns(tmp_path) + + # Check that the expected patterns are loaded + assert "exclude.txt" in patterns + assert "*.log" in patterns + # Ensure that comment lines are not added + for pattern in patterns: + assert not pattern.startswith("#") + + +@pytest.mark.asyncio +async def test_ingest_with_gitignore(repo_path: Path): + """ + Integration test for ingest_async() respecting .gitignore rules. + + When ``include_gitignored`` is ``False`` (default), the content of 'exclude.txt' should be omitted. + When ``include_gitignored`` is ``True``, both files should be present. + """ + # Run ingestion with the gitignore functionality enabled. + _, _, content_with_ignore = await ingest_async(source=str(repo_path)) + # 'exclude.txt' should be skipped. + assert "This file should be excluded." not in content_with_ignore + # 'include.txt' should be processed. + assert "This file should be included." 
in content_with_ignore + + # Run ingestion with the gitignore functionality disabled. + _, _, content_without_ignore = await ingest_async(source=str(repo_path), include_gitignored=True) + # Now both files should be present. + assert "This file should be excluded." in content_without_ignore + assert "This file should be included." in content_without_ignore + + + +================================================ +FILE: tests/test_ingestion.py +================================================ +""" +Tests for the `query_ingestion` module. + +These tests validate directory scanning, file content extraction, notebook handling, and the overall ingestion logic, +including filtering patterns and subpaths. +""" + +import re +from pathlib import Path +from typing import Set, TypedDict + +import pytest + +from gitingest.ingestion import ingest_query +from gitingest.query_parsing import IngestionQuery + + +def test_run_ingest_query(temp_directory: Path, sample_query: IngestionQuery) -> None: + """ + Test `ingest_query` to ensure it processes the directory and returns expected results. + + Given a directory with .txt and .py files: + When `ingest_query` is invoked, + Then it should produce a summary string listing the files analyzed and a combined content string. + """ + sample_query.local_path = temp_directory + sample_query.subpath = "/" + sample_query.type = None + + summary, _, content = ingest_query(sample_query) + + assert "Repository: test_user/test_repo" in summary + assert "Files analyzed: 8" in summary + + # Check presence of key files in the content + assert "src/subfile1.txt" in content + assert "src/subfile2.py" in content + assert "src/subdir/file_subdir.txt" in content + assert "src/subdir/file_subdir.py" in content + assert "file1.txt" in content + assert "file2.py" in content + assert "dir1/file_dir1.txt" in content + assert "dir2/file_dir2.txt" in content + + +# TODO: Additional tests: +# - Multiple include patterns, e.g. ["*.txt", "*.py"] or ["/src/*", "*.txt"]. +# - Edge cases with weird file names or deep subdirectory structures. 
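+#
+# A rough sketch of the first item above (illustrative only: the test name and
+# assertion are hypothetical, while `temp_directory`, `sample_query` and
+# `ingest_query` come from this suite):
+#
+#   def test_include_multiple_patterns(temp_directory, sample_query):
+#       sample_query.local_path = temp_directory
+#       sample_query.include_patterns = {"*.txt", "*.py"}
+#       summary, _, _ = ingest_query(sample_query)
+#       # All eight fixture files end in .txt or .py, so all are analyzed.
+#       assert "Files analyzed: 8" in summary
+#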
+# TODO : def test_include_nonexistent_extension + + +class PatternScenario(TypedDict): + include_patterns: Set[str] + ignore_patterns: Set[str] + expected_num_files: int + expected_content: Set[str] + expected_structure: Set[str] + expected_not_structure: Set[str] + + +@pytest.mark.parametrize( + "pattern_scenario", + [ + pytest.param( + PatternScenario( + { + "include_patterns": {"file2.py", "dir2/file_dir2.txt"}, + "ignore_patterns": {*()}, + "expected_num_files": 2, + "expected_content": {"file2.py", "dir2/file_dir2.txt"}, + "expected_structure": {"test_repo/", "dir2/"}, + "expected_not_structure": {"src/", "subdir/", "dir1/"}, + } + ), + id="include-explicit-files", + ), + pytest.param( + PatternScenario( + { + "include_patterns": { + "file1.txt", + "file2.py", + "file_dir1.txt", + "*/file_dir2.txt", + }, + "ignore_patterns": {*()}, + "expected_num_files": 4, + "expected_content": {"file1.txt", "file2.py", "dir1/file_dir1.txt", "dir2/file_dir2.txt"}, + "expected_structure": {"test_repo/", "dir1/", "dir2/"}, + "expected_not_structure": {"src/", "subdir/"}, + } + ), + id="include-wildcard-directory", + ), + pytest.param( + PatternScenario( + { + "include_patterns": {"*.py"}, + "ignore_patterns": {*()}, + "expected_num_files": 3, + "expected_content": { + "file2.py", + "src/subfile2.py", + "src/subdir/file_subdir.py", + }, + "expected_structure": {"test_repo/", "src/", "subdir/"}, + "expected_not_structure": {"dir1/", "dir2/"}, + } + ), + id="include-wildcard-files", + ), + pytest.param( + PatternScenario( + { + "include_patterns": {"**/file_dir2.txt", "src/**/*.py"}, + "ignore_patterns": {*()}, + "expected_num_files": 3, + "expected_content": { + "dir2/file_dir2.txt", + "src/subfile2.py", + "src/subdir/file_subdir.py", + }, + "expected_structure": {"test_repo/", "dir2/", "src/", "subdir/"}, + "expected_not_structure": {"dir1/"}, + } + ), + id="include-recursive-wildcard", + ), + pytest.param( + PatternScenario( + { + "include_patterns": {*()}, + "ignore_patterns": {"file2.py", "dir2/file_dir2.txt"}, + "expected_num_files": 6, + "expected_content": { + "file1.txt", + "src/subfile1.txt", + "src/subfile2.py", + "src/subdir/file_subdir.txt", + "src/subdir/file_subdir.py", + "dir1/file_dir1.txt", + }, + "expected_structure": {"test_repo/", "src/", "subdir/", "dir1/"}, + "expected_not_structure": {"dir2/"}, + } + ), + id="exclude-explicit-files", + ), + pytest.param( + PatternScenario( + { + "include_patterns": {*()}, + "ignore_patterns": {"file1.txt", "file2.py", "*/file_dir1.txt"}, + "expected_num_files": 5, + "expected_content": { + "src/subfile1.txt", + "src/subfile2.py", + "src/subdir/file_subdir.txt", + "src/subdir/file_subdir.py", + "dir2/file_dir2.txt", + }, + "expected_structure": {"test_repo/", "src/", "subdir/", "dir2/"}, + "expected_not_structure": {"dir1/"}, + } + ), + id="exclude-wildcard-directory", + ), + pytest.param( + PatternScenario( + { + "include_patterns": {*()}, + "ignore_patterns": {"src/**/*.py"}, + "expected_num_files": 6, + "expected_content": { + "file1.txt", + "file2.py", + "src/subfile1.txt", + "src/subdir/file_subdir.txt", + "dir1/file_dir1.txt", + "dir2/file_dir2.txt", + }, + "expected_structure": { + "test_repo/", + "dir1/", + "dir2/", + "src/", + "subdir/", + }, + "expected_not_structure": {*()}, + } + ), + id="exclude-recursive-wildcard", + ), + ], +) +def test_include_ignore_patterns( + temp_directory: Path, + sample_query: IngestionQuery, + pattern_scenario: PatternScenario, +) -> None: + """ + Test `ingest_query` to ensure included and ignored paths are 
respected.
+
+    Given a directory with .txt and .py files, and a set of include patterns or a set of ignore patterns:
+    When `ingest_query` is invoked,
+    Then it should produce a summary string listing the files analyzed and a combined content string.
+    """
+
+    sample_query.local_path = temp_directory
+    sample_query.subpath = "/"
+    sample_query.type = None
+    sample_query.include_patterns = pattern_scenario["include_patterns"] or None
+    sample_query.ignore_patterns = pattern_scenario["ignore_patterns"] or None
+
+    summary, structure, content = ingest_query(sample_query)
+
+    assert "Repository: test_user/test_repo" in summary
+    num_files_regex = re.compile(r"^Files analyzed: (\d+)$", re.MULTILINE)
+    assert (num_files_match := num_files_regex.search(summary)) is not None
+    assert int(num_files_match.group(1)) == pattern_scenario["expected_num_files"]
+
+    # Check presence of key files in the content
+    for expected_content_item in pattern_scenario["expected_content"]:
+        assert expected_content_item in content
+
+    # check presence of included directories in structure
+    for expected_structure_item in pattern_scenario["expected_structure"]:
+        assert expected_structure_item in structure
+
+    # check non-presence of non-included directories in structure
+    for expected_not_structure_item in pattern_scenario["expected_not_structure"]:
+        assert expected_not_structure_item not in structure
+
+
+
+================================================
+FILE: tests/test_notebook_utils.py
+================================================
+"""
+Tests for the `notebook_utils` module.
+
+These tests validate how notebooks are processed into Python-like output, ensuring that markdown/raw cells are
+converted to triple-quoted blocks, code cells remain executable code, and various edge cases (multiple worksheets,
+empty cells, outputs, etc.) are handled appropriately.
+"""
+
+import pytest
+
+from gitingest.utils.notebook_utils import process_notebook
+from tests.conftest import WriteNotebookFunc
+
+
+def test_process_notebook_all_cells(write_notebook: WriteNotebookFunc) -> None:
+    """
+    Test processing a notebook containing markdown, code, and raw cells.
+
+    Given a notebook with:
+      - One markdown cell
+      - One code cell
+      - One raw cell
+    When `process_notebook` is invoked,
+    Then markdown and raw cells should appear in triple-quoted blocks, and code cells remain as normal code.
+    """
+    notebook_content = {
+        "cells": [
+            {"cell_type": "markdown", "source": ["# Markdown cell"]},
+            {"cell_type": "code", "source": ['print("Hello Code")']},
+            {"cell_type": "raw", "source": ["<raw content>"]},
+        ]
+    }
+    nb_path = write_notebook("all_cells.ipynb", notebook_content)
+    result = process_notebook(nb_path)
+
+    assert result.count('"""') == 4, "Two non-code cells => 2 triple-quoted blocks => 4 total triple quotes."
+
+    # Ensure markdown and raw cells are in triple quotes
+    assert "# Markdown cell" in result
+    assert "<raw content>" in result
+
+    # Ensure code cell is not in triple quotes
+    assert 'print("Hello Code")' in result
+    assert '"""\nprint("Hello Code")\n"""' not in result
+
+
+def test_process_notebook_with_worksheets(write_notebook: WriteNotebookFunc) -> None:
+    """
+    Test a notebook containing the (as of IPEP-17 deprecated) 'worksheets' key.
+
+    Given a notebook that uses the 'worksheets' key with a single worksheet,
+    When `process_notebook` is called,
+    Then a `DeprecationWarning` should be raised, and the content should match an equivalent notebook
+    that has top-level 'cells'.
+    """
+    with_worksheets = {
+        "worksheets": [
+            {
+                "cells": [
+                    {"cell_type": "markdown", "source": ["# Markdown cell"]},
+                    {"cell_type": "code", "source": ['print("Hello Code")']},
+                    {"cell_type": "raw", "source": ["<raw content>"]},
+                ]
+            }
+        ]
+    }
+    without_worksheets = with_worksheets["worksheets"][0]  # same, but no 'worksheets' key
+
+    nb_with = write_notebook("with_worksheets.ipynb", with_worksheets)
+    nb_without = write_notebook("without_worksheets.ipynb", without_worksheets)
+
+    with pytest.warns(DeprecationWarning, match="Worksheets are deprecated as of IPEP-17."):
+        result_with = process_notebook(nb_with)
+
+    # Should not raise a warning
+    result_without = process_notebook(nb_without)
+
+    assert result_with == result_without, "Content from the single worksheet should match the top-level equivalent."
+
+
+def test_process_notebook_multiple_worksheets(write_notebook: WriteNotebookFunc) -> None:
+    """
+    Test a notebook containing multiple 'worksheets'.
+
+    Given a notebook with two worksheets:
+      - First with a markdown cell
+      - Second with a code cell
+    When `process_notebook` is called,
+    Then a warning about multiple worksheets should be raised, and the second worksheet's content should appear
+    in the final output.
+    """
+    multi_worksheets = {
+        "worksheets": [
+            {"cells": [{"cell_type": "markdown", "source": ["# First Worksheet"]}]},
+            {"cells": [{"cell_type": "code", "source": ["# Second Worksheet"]}]},
+        ]
+    }
+
+    single_worksheet = {
+        "worksheets": [
+            {"cells": [{"cell_type": "markdown", "source": ["# First Worksheet"]}]},
+        ]
+    }
+
+    nb_multi = write_notebook("multiple_worksheets.ipynb", multi_worksheets)
+    nb_single = write_notebook("single_worksheet.ipynb", single_worksheet)
+
+    # Expect DeprecationWarning + UserWarning
+    with pytest.warns(
+        DeprecationWarning, match="Worksheets are deprecated as of IPEP-17. Consider updating the notebook."
+    ):
+        with pytest.warns(
+            UserWarning, match="Multiple worksheets detected. Combining all worksheets into a single script."
+        ):
+            result_multi = process_notebook(nb_multi)
+
+    # Expect DeprecationWarning only
+    with pytest.warns(
+        DeprecationWarning, match="Worksheets are deprecated as of IPEP-17. Consider updating the notebook."
+    ):
+        result_single = process_notebook(nb_single)
+
+    assert result_multi != result_single, "Two worksheets should produce more content than one."
+    assert len(result_multi) > len(result_single), "The multi-worksheet notebook should have extra code content."
+    assert "# First Worksheet" in result_single
+    assert "# Second Worksheet" not in result_single
+    assert "# First Worksheet" in result_multi
+    assert "# Second Worksheet" in result_multi
+
+
+def test_process_notebook_code_only(write_notebook: WriteNotebookFunc) -> None:
+    """
+    Test a notebook containing only code cells.
+
+    Given a notebook with code cells only:
+    When `process_notebook` is called,
+    Then no triple quotes should appear in the output.
+    """
+    notebook_content = {
+        "cells": [
+            {"cell_type": "code", "source": ["print('Code Cell 1')"]},
+            {"cell_type": "code", "source": ["x = 42"]},
+        ]
+    }
+    nb_path = write_notebook("code_only.ipynb", notebook_content)
+    result = process_notebook(nb_path)
+
+    assert '"""' not in result, "No triple quotes expected when there are only code cells."
+    assert "print('Code Cell 1')" in result
+    assert "x = 42" in result
+
+
+def test_process_notebook_markdown_only(write_notebook: WriteNotebookFunc) -> None:
+    """
+    Test a notebook with only markdown cells.
+ + Given a notebook with two markdown cells: + When `process_notebook` is called, + Then each markdown cell should become a triple-quoted block (2 blocks => 4 triple quotes total). + """ + notebook_content = { + "cells": [ + {"cell_type": "markdown", "source": ["# Markdown Header"]}, + {"cell_type": "markdown", "source": ["Some more markdown."]}, + ] + } + nb_path = write_notebook("markdown_only.ipynb", notebook_content) + result = process_notebook(nb_path) + + assert result.count('"""') == 4, "Two markdown cells => 2 blocks => 4 triple quotes total." + assert "# Markdown Header" in result + assert "Some more markdown." in result + + +def test_process_notebook_raw_only(write_notebook: WriteNotebookFunc) -> None: + """ + Test a notebook with only raw cells. + + Given two raw cells: + When `process_notebook` is called, + Then each raw cell should become a triple-quoted block (2 blocks => 4 triple quotes total). + """ + notebook_content = { + "cells": [ + {"cell_type": "raw", "source": ["Raw content line 1"]}, + {"cell_type": "raw", "source": ["Raw content line 2"]}, + ] + } + nb_path = write_notebook("raw_only.ipynb", notebook_content) + result = process_notebook(nb_path) + + assert result.count('"""') == 4, "Two raw cells => 2 blocks => 4 triple quotes." + assert "Raw content line 1" in result + assert "Raw content line 2" in result + + +def test_process_notebook_empty_cells(write_notebook: WriteNotebookFunc) -> None: + """ + Test that cells with an empty 'source' are skipped. + + Given a notebook with 4 cells, 3 of which have empty `source`: + When `process_notebook` is called, + Then only the non-empty cell should appear in the output (1 block => 2 triple quotes). + """ + notebook_content = { + "cells": [ + {"cell_type": "markdown", "source": []}, + {"cell_type": "code", "source": []}, + {"cell_type": "raw", "source": []}, + {"cell_type": "markdown", "source": ["# Non-empty markdown"]}, + ] + } + nb_path = write_notebook("empty_cells.ipynb", notebook_content) + result = process_notebook(nb_path) + + assert result.count('"""') == 2, "Only one non-empty cell => 1 block => 2 triple quotes" + assert "# Non-empty markdown" in result + + +def test_process_notebook_invalid_cell_type(write_notebook: WriteNotebookFunc) -> None: + """ + Test a notebook with an unknown cell type. + + Given a notebook cell whose `cell_type` is unrecognized: + When `process_notebook` is called, + Then a ValueError should be raised. + """ + notebook_content = { + "cells": [ + {"cell_type": "markdown", "source": ["# Valid markdown"]}, + {"cell_type": "unknown", "source": ["Unrecognized cell type"]}, + ] + } + nb_path = write_notebook("invalid_cell_type.ipynb", notebook_content) + + with pytest.raises(ValueError, match="Unknown cell type: unknown"): + process_notebook(nb_path) + + +def test_process_notebook_with_output(write_notebook: WriteNotebookFunc) -> None: + """ + Test a notebook that has code cells with outputs. + + Given a code cell and multiple output objects: + When `process_notebook` is called with `include_output=True`, + Then the outputs should be appended as commented lines under the code. 
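+    With `include_output=False`, the same source should be returned without the output block.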
+    """
+    notebook_content = {
+        "cells": [
+            {
+                "cell_type": "code",
+                "source": [
+                    "import matplotlib.pyplot as plt\n",
+                    "print('my_data')\n",
+                    "my_data = [1, 2, 3, 4, 5]\n",
+                    "plt.plot(my_data)\n",
+                    "my_data",
+                ],
+                "outputs": [
+                    {"output_type": "stream", "text": ["my_data"]},
+                    {"output_type": "execute_result", "data": {"text/plain": ["[1, 2, 3, 4, 5]"]}},
+                    {"output_type": "display_data", "data": {"text/plain": ["<Figure size 640x480 with 1 Axes>"]}},
+                ],
+            }
+        ]
+    }
+
+    nb_path = write_notebook("with_output.ipynb", notebook_content)
+    with_output = process_notebook(nb_path, include_output=True)
+    without_output = process_notebook(nb_path, include_output=False)
+
+    expected_source = "\n".join(
+        [
+            "# Jupyter notebook converted to Python script.\n",
+            "import matplotlib.pyplot as plt",
+            "print('my_data')",
+            "my_data = [1, 2, 3, 4, 5]",
+            "plt.plot(my_data)",
+            "my_data\n",
+        ]
+    )
+    expected_output = "\n".join(
+        [
+            "# Output:",
+            "# my_data",
+            "# [1, 2, 3, 4, 5]",
+            "# <Figure size 640x480 with 1 Axes>
\n", + ] + ) + + expected_combined = expected_source + expected_output + + assert with_output == expected_combined, "Should include source code and comment-ified output." + assert without_output == expected_source, "Should include only the source code without output." + + + +================================================ +FILE: tests/test_repository_clone.py +================================================ +""" +Tests for the `cloning` module. + +These tests cover various scenarios for cloning repositories, verifying that the appropriate Git commands are invoked +and handling edge cases such as nonexistent URLs, timeouts, redirects, and specific commits or branches. +""" + +import asyncio +import os +from pathlib import Path +from unittest.mock import AsyncMock + +import pytest +from pytest_mock import MockerFixture + +from gitingest.cloning import clone_repo +from gitingest.schemas import CloneConfig +from gitingest.utils.exceptions import AsyncTimeoutError +from gitingest.utils.git_utils import check_repo_exists +from tests.conftest import DEMO_URL, LOCAL_REPO_PATH + +# All cloning-related tests assume (unless explicitly overridden) that the repository exists. +# Apply the check-repo patch automatically so individual tests don't need to repeat it. +pytestmark = pytest.mark.usefixtures("repo_exists_true") + + +@pytest.mark.asyncio +async def test_clone_with_commit(repo_exists_true: AsyncMock, run_command_mock: AsyncMock) -> None: + """ + Test cloning a repository with a specific commit hash. + + Given a valid URL and a commit hash: + When `clone_repo` is called, + Then the repository should be cloned and checked out at that commit. + """ + clone_config = CloneConfig( + url=DEMO_URL, + local_path=LOCAL_REPO_PATH, + commit="a" * 40, # Simulating a valid commit hash + branch="main", + ) + + await clone_repo(clone_config) + + repo_exists_true.assert_called_once_with(clone_config.url, token=None) + assert run_command_mock.call_count == 2 # Clone and checkout calls + + +@pytest.mark.asyncio +async def test_clone_without_commit(repo_exists_true: AsyncMock, run_command_mock: AsyncMock) -> None: + """ + Test cloning a repository when no commit hash is provided. + + Given a valid URL and no commit hash: + When `clone_repo` is called, + Then only the clone_repo operation should be performed (no checkout). + """ + clone_config = CloneConfig(url=DEMO_URL, local_path=LOCAL_REPO_PATH, commit=None, branch="main") + + await clone_repo(clone_config) + + repo_exists_true.assert_called_once_with(clone_config.url, token=None) + assert run_command_mock.call_count == 1 # Only clone call + + +@pytest.mark.asyncio +async def test_clone_nonexistent_repository(repo_exists_true: AsyncMock) -> None: + """ + Test cloning a nonexistent repository URL. + + Given an invalid or nonexistent URL: + When `clone_repo` is called, + Then a ValueError should be raised with an appropriate error message. 
+ """ + clone_config = CloneConfig( + url="https://github.com/user/nonexistent-repo", + local_path=LOCAL_REPO_PATH, + commit=None, + branch="main", + ) + # Override the default fixture behaviour for this test + repo_exists_true.return_value = False + + with pytest.raises(ValueError, match="Repository not found"): + await clone_repo(clone_config) + + repo_exists_true.assert_called_once_with(clone_config.url, token=None) + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "mock_stdout, return_code, expected", + [ + (b"HTTP/1.1 200 OK\n", 0, True), # Existing repo + (b"HTTP/1.1 404 Not Found\n", 0, False), # Non-existing repo + (b"HTTP/1.1 200 OK\n", 1, False), # Failed request + ], +) +async def test_check_repo_exists(mock_stdout: bytes, return_code: int, expected: bool, mocker: MockerFixture) -> None: + """ + Test the `check_repo_exists` function with different Git HTTP responses. + + Given various stdout lines and return codes: + When `check_repo_exists` is called, + Then it should correctly indicate whether the repository exists. + """ + mock_exec = mocker.patch("asyncio.create_subprocess_exec", new_callable=AsyncMock) + mock_process = AsyncMock() + mock_process.communicate.return_value = (mock_stdout, b"") + mock_process.returncode = return_code + mock_exec.return_value = mock_process + + repo_exists = await check_repo_exists(DEMO_URL) + + assert repo_exists is expected + + +@pytest.mark.asyncio +async def test_clone_with_custom_branch(run_command_mock: AsyncMock) -> None: + """ + Test cloning a repository with a specified custom branch. + + Given a valid URL and a branch: + When `clone_repo` is called, + Then the repository should be cloned shallowly to that branch. + """ + clone_config = CloneConfig(url=DEMO_URL, local_path=LOCAL_REPO_PATH, branch="feature-branch") + + await clone_repo(clone_config) + + run_command_mock.assert_called_once_with( + "git", + "clone", + "--single-branch", + "--depth=1", + "--branch", + "feature-branch", + clone_config.url, + clone_config.local_path, + ) + + +@pytest.mark.asyncio +async def test_git_command_failure(run_command_mock: AsyncMock) -> None: + """ + Test cloning when the Git command fails during execution. + + Given a valid URL, but `run_command` raises a RuntimeError: + When `clone_repo` is called, + Then a RuntimeError should be raised with the correct message. + """ + clone_config = CloneConfig(url=DEMO_URL, local_path=LOCAL_REPO_PATH) + + run_command_mock.side_effect = RuntimeError("Git command failed") + + with pytest.raises(RuntimeError, match="Git command failed"): + await clone_repo(clone_config) + + +@pytest.mark.asyncio +async def test_clone_default_shallow_clone(run_command_mock: AsyncMock) -> None: + """ + Test cloning a repository with the default shallow clone options. + + Given a valid URL and no branch or commit: + When `clone_repo` is called, + Then the repository should be cloned with `--depth=1` and `--single-branch`. + """ + clone_config = CloneConfig(url=DEMO_URL, local_path=LOCAL_REPO_PATH) + + await clone_repo(clone_config) + + run_command_mock.assert_called_once_with( + "git", + "clone", + "--single-branch", + "--depth=1", + clone_config.url, + clone_config.local_path, + ) + + +@pytest.mark.asyncio +async def test_clone_commit_without_branch(run_command_mock: AsyncMock) -> None: + """ + Test cloning when a commit hash is provided but no branch is specified. + + Given a valid URL and a commit hash (but no branch): + When `clone_repo` is called, + Then the repository should be cloned and checked out at that commit. 
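+    Note that no shallow-clone flag is expected here: a depth-limited clone might not contain the
+    requested commit, so the clone is made without ``--depth=1`` before checking out.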
+ """ + # Simulating a valid commit hash + clone_config = CloneConfig(url=DEMO_URL, local_path=LOCAL_REPO_PATH, commit="a" * 40) + + await clone_repo(clone_config) + + assert run_command_mock.call_count == 2 # Clone and checkout calls + run_command_mock.assert_any_call("git", "clone", "--single-branch", clone_config.url, clone_config.local_path) + run_command_mock.assert_any_call("git", "-C", clone_config.local_path, "checkout", clone_config.commit) + + +@pytest.mark.asyncio +async def test_check_repo_exists_with_redirect(mocker: MockerFixture) -> None: + """ + Test `check_repo_exists` when a redirect (302) is returned. + + Given a URL that responds with "302 Found": + When `check_repo_exists` is called, + Then it should return `False`, indicating the repo is inaccessible. + """ + mock_exec = mocker.patch("asyncio.create_subprocess_exec", new_callable=AsyncMock) + mock_process = AsyncMock() + mock_process.communicate.return_value = (b"HTTP/1.1 302 Found\n", b"") + mock_process.returncode = 0 # Simulate successful request + mock_exec.return_value = mock_process + + repo_exists = await check_repo_exists(DEMO_URL) + + assert repo_exists is False + + +@pytest.mark.asyncio +async def test_check_repo_exists_with_permanent_redirect(mocker: MockerFixture) -> None: + """ + Test `check_repo_exists` when a permanent redirect (301) is returned. + + Given a URL that responds with "301 Found": + When `check_repo_exists` is called, + Then it should return `True`, indicating the repo may exist at the new location. + """ + mock_exec = mocker.patch("asyncio.create_subprocess_exec", new_callable=AsyncMock) + mock_process = AsyncMock() + mock_process.communicate.return_value = (b"HTTP/1.1 301 Found\n", b"") + mock_process.returncode = 0 # Simulate successful request + mock_exec.return_value = mock_process + + repo_exists = await check_repo_exists(DEMO_URL) + + assert repo_exists + + +@pytest.mark.asyncio +async def test_clone_with_timeout(run_command_mock: AsyncMock) -> None: + """ + Test cloning a repository when a timeout occurs. + + Given a valid URL, but `run_command` times out: + When `clone_repo` is called, + Then an `AsyncTimeoutError` should be raised to indicate the operation exceeded time limits. + """ + clone_config = CloneConfig(url=DEMO_URL, local_path=LOCAL_REPO_PATH) + + run_command_mock.side_effect = asyncio.TimeoutError + + with pytest.raises(AsyncTimeoutError, match="Operation timed out after"): + await clone_repo(clone_config) + + +@pytest.mark.asyncio +async def test_clone_specific_branch(tmp_path: Path) -> None: + """ + Test cloning a specific branch of a repository. + + Given a valid repository URL and a branch name: + When `clone_repo` is called, + Then the repository should be cloned and checked out at that branch. + """ + repo_url = "https://github.com/cyclotruc/gitingest.git" + branch_name = "main" + local_path = tmp_path / "gitingest" + clone_config = CloneConfig(url=repo_url, local_path=str(local_path), branch=branch_name) + + await clone_repo(clone_config) + + assert local_path.exists(), "The repository was not cloned successfully." + assert local_path.is_dir(), "The cloned repository path is not a directory." + current_branch = os.popen(f"git -C {local_path} branch --show-current").read().strip() + assert current_branch == branch_name, f"Expected branch '{branch_name}', got '{current_branch}'." + + +@pytest.mark.asyncio +async def test_clone_branch_with_slashes(tmp_path: Path, run_command_mock: AsyncMock) -> None: + """ + Test cloning a branch with slashes in the name. 
+ + Given a valid repository URL and a branch name with slashes: + When `clone_repo` is called, + Then the repository should be cloned and checked out at that branch. + """ + branch_name = "fix/in-operator" + local_path = tmp_path / "gitingest" + clone_config = CloneConfig(url=DEMO_URL, local_path=str(local_path), branch=branch_name) + + await clone_repo(clone_config) + + run_command_mock.assert_called_once_with( + "git", + "clone", + "--single-branch", + "--depth=1", + "--branch", + "fix/in-operator", + clone_config.url, + clone_config.local_path, + ) + + +@pytest.mark.asyncio +async def test_clone_creates_parent_directory(tmp_path: Path, run_command_mock: AsyncMock) -> None: + """ + Test that clone_repo creates parent directories if they don't exist. + + Given a local path with non-existent parent directories: + When `clone_repo` is called, + Then it should create the parent directories before attempting to clone. + """ + nested_path = tmp_path / "deep" / "nested" / "path" / "repo" + clone_config = CloneConfig(url=DEMO_URL, local_path=str(nested_path)) + + await clone_repo(clone_config) + + assert nested_path.parent.exists() + run_command_mock.assert_called_once_with( + "git", + "clone", + "--single-branch", + "--depth=1", + clone_config.url, + str(nested_path), + ) + + +@pytest.mark.asyncio +async def test_clone_with_specific_subpath(run_command_mock: AsyncMock) -> None: + """ + Test cloning a repository with a specific subpath. + + Given a valid repository URL and a specific subpath: + When `clone_repo` is called, + Then the repository should be cloned with sparse checkout enabled and the specified subpath. + """ + clone_config = CloneConfig(url=DEMO_URL, local_path=LOCAL_REPO_PATH, subpath="src/docs") + + await clone_repo(clone_config) + + # Verify the clone command includes sparse checkout flags + run_command_mock.assert_any_call( + "git", + "clone", + "--single-branch", + "--filter=blob:none", + "--sparse", + "--depth=1", + clone_config.url, + clone_config.local_path, + ) + + # Verify the sparse-checkout command sets the correct path + run_command_mock.assert_any_call("git", "-C", clone_config.local_path, "sparse-checkout", "set", "src/docs") + + assert run_command_mock.call_count == 2 + + +@pytest.mark.asyncio +async def test_clone_with_commit_and_subpath(run_command_mock: AsyncMock) -> None: + """ + Test cloning a repository with both a specific commit and subpath. + + Given a valid repository URL, commit hash, and subpath: + When `clone_repo` is called, + Then the repository should be cloned with sparse checkout enabled, + checked out at the specific commit, and only include the specified subpath. 
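+    Three git invocations are expected in total: the sparse clone, ``sparse-checkout set``, and the
+    commit checkout.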
+ """ + # Simulating a valid commit hash + clone_config = CloneConfig(url=DEMO_URL, local_path=LOCAL_REPO_PATH, commit="a" * 40, subpath="src/docs") + + await clone_repo(clone_config) + + # Verify the clone command includes sparse checkout flags + run_command_mock.assert_any_call( + "git", + "clone", + "--single-branch", + "--filter=blob:none", + "--sparse", + clone_config.url, + clone_config.local_path, + ) + + # Verify sparse-checkout set + run_command_mock.assert_any_call( + "git", + "-C", + clone_config.local_path, + "sparse-checkout", + "set", + "src/docs", + ) + + # Verify checkout commit + run_command_mock.assert_any_call( + "git", + "-C", + clone_config.local_path, + "checkout", + clone_config.commit, + ) + + assert run_command_mock.call_count == 3 + + + +================================================ +FILE: tests/query_parser/__init__.py +================================================ + + + +================================================ +FILE: tests/query_parser/test_git_host_agnostic.py +================================================ +""" +Tests to verify that the query parser is Git host agnostic. + +These tests confirm that `parse_query` correctly identifies user/repo pairs and canonical URLs for GitHub, GitLab, +Bitbucket, Gitea, and Codeberg, even if the host is omitted. +""" + +from typing import List, Tuple + +import pytest + +from gitingest.query_parsing import parse_query +from gitingest.utils.query_parser_utils import KNOWN_GIT_HOSTS + +# Repository matrix: (host, user, repo) +_REPOS: List[Tuple[str, str, str]] = [ + ("github.com", "tiangolo", "fastapi"), + ("gitlab.com", "gitlab-org", "gitlab-runner"), + ("bitbucket.org", "na-dna", "llm-knowledge-share"), + ("gitea.com", "xorm", "xorm"), + ("codeberg.org", "forgejo", "forgejo"), + ("git.rwth-aachen.de", "medialab", "19squared"), + ("gitlab.alpinelinux.org", "alpine", "apk-tools"), +] + + +# Generate cartesian product of repository tuples with URL variants. +@pytest.mark.parametrize("host, user, repo", _REPOS, ids=[f"{h}:{u}/{r}" for h, u, r in _REPOS]) +@pytest.mark.parametrize("variant", ["full", "noscheme", "slug"]) +@pytest.mark.asyncio +async def test_parse_query_without_host( + host: str, + user: str, + repo: str, + variant: str, +) -> None: + """Verify that `parse_query` handles URLs, host-omitted URLs and raw slugs.""" + + # Build the input URL based on the selected variant + if variant == "full": + url = f"https://{host}/{user}/{repo}" + elif variant == "noscheme": + url = f"{host}/{user}/{repo}" + else: # "slug" + url = f"{user}/{repo}" + + expected_url = f"https://{host}/{user}/{repo}" + + # For slug form with a custom host (not in KNOWN_GIT_HOSTS) we expect a failure, + # because the parser cannot guess which domain to use. + if variant == "slug" and host not in KNOWN_GIT_HOSTS: + with pytest.raises(ValueError): + await parse_query(url, max_file_size=50, from_web=True) + return + + query = await parse_query(url, max_file_size=50, from_web=True) + + # Compare against the canonical dict while ignoring unpredictable fields. 
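+    # (`id` is a fresh UUID on every parse and `local_path` is a per-run temporary
+    # directory, so both would make a strict equality check flaky; the ignore-pattern
+    # defaults are large and asserted by other tests.)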
+ actual = query.model_dump(exclude={"id", "local_path", "ignore_patterns"}) + + expected = { + "user_name": user, + "repo_name": repo, + "url": expected_url, + "slug": f"{user}-{repo}", + "subpath": "/", + "type": None, + "branch": None, + "commit": None, + "max_file_size": 50, + "include_patterns": None, + } + + assert actual == expected + + + +================================================ +FILE: tests/query_parser/test_query_parser.py +================================================ +""" +Tests for the `query_parsing` module. + +These tests cover URL parsing, pattern parsing, and handling of branches/subpaths for HTTP(S) repositories and local +paths. +""" + +from pathlib import Path +from typing import Callable, List, Optional +from unittest.mock import AsyncMock + +import pytest +from pytest_mock import MockerFixture + +from gitingest.query_parsing import _parse_patterns, _parse_remote_repo, parse_query +from gitingest.schemas.ingestion_schema import IngestionQuery +from gitingest.utils.ignore_patterns import DEFAULT_IGNORE_PATTERNS +from tests.conftest import DEMO_URL + +URLS_HTTPS: List[str] = [ + DEMO_URL, + "https://gitlab.com/user/repo", + "https://bitbucket.org/user/repo", + "https://gitea.com/user/repo", + "https://codeberg.org/user/repo", + "https://gist.github.com/user/repo", + "https://git.example.com/user/repo", + "https://gitlab.example.com/user/repo", + "https://gitlab.example.se/user/repo", +] + +URLS_HTTP: List[str] = [url.replace("https://", "http://") for url in URLS_HTTPS] + + +@pytest.mark.parametrize("url", URLS_HTTPS, ids=lambda u: u) +@pytest.mark.asyncio +async def test_parse_url_valid_https(url: str) -> None: + """Valid HTTPS URLs parse correctly and `query.url` equals the input.""" + query = await _assert_basic_repo_fields(url) + + assert query.url == url # HTTPS: canonical URL should equal input + + +@pytest.mark.parametrize("url", URLS_HTTP, ids=lambda u: u) +@pytest.mark.asyncio +async def test_parse_url_valid_http(url: str) -> None: + """Valid HTTP URLs parse correctly (slug check only).""" + await _assert_basic_repo_fields(url) + + +@pytest.mark.asyncio +async def test_parse_url_invalid() -> None: + """ + Test `_parse_remote_repo` with an invalid URL. + + Given an HTTPS URL lacking a repository structure (e.g., "https://github.com"), + When `_parse_remote_repo` is called, + Then a ValueError should be raised indicating an invalid repository URL. + """ + url = "https://github.com" + + with pytest.raises(ValueError, match="Invalid repository URL"): + await _parse_remote_repo(url) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("url", [DEMO_URL, "https://gitlab.com/user/repo"]) +async def test_parse_query_basic(url: str) -> None: + """ + Test `parse_query` with a basic valid repository URL. + + Given an HTTPS URL and ignore_patterns="*.txt": + When `parse_query` is called, + Then user/repo, URL, and ignore patterns should be parsed correctly. + """ + query = await parse_query(source=url, max_file_size=50, from_web=True, ignore_patterns="*.txt") + + assert query.user_name == "user" + assert query.repo_name == "repo" + assert query.url == url + assert query.ignore_patterns + assert "*.txt" in query.ignore_patterns + + +@pytest.mark.asyncio +async def test_parse_query_mixed_case() -> None: + """ + Test `parse_query` with mixed-case URLs. + + Given a URL with mixed-case parts (e.g. "Https://GitHub.COM/UsEr/rEpO"): + When `parse_query` is called, + Then the user and repo names should be normalized to lowercase. 
+ """ + url = "Https://GitHub.COM/UsEr/rEpO" + query = await parse_query(url, max_file_size=50, from_web=True) + + assert query.user_name == "user" + assert query.repo_name == "repo" + + +@pytest.mark.asyncio +async def test_parse_query_include_pattern() -> None: + """ + Test `parse_query` with a specified include pattern. + + Given a URL and include_patterns="*.py": + When `parse_query` is called, + Then the include pattern should be set, and default ignore patterns remain applied. + """ + query = await parse_query(DEMO_URL, max_file_size=50, from_web=True, include_patterns="*.py") + + assert query.include_patterns == {"*.py"} + assert query.ignore_patterns == DEFAULT_IGNORE_PATTERNS + + +@pytest.mark.asyncio +async def test_parse_query_invalid_pattern() -> None: + """ + Test `parse_query` with an invalid pattern. + + Given an include pattern containing special characters (e.g., "*.py;rm -rf"): + When `parse_query` is called, + Then a ValueError should be raised indicating invalid characters. + """ + with pytest.raises(ValueError, match="Pattern.*contains invalid characters"): + await parse_query(DEMO_URL, max_file_size=50, from_web=True, include_patterns="*.py;rm -rf") + + +@pytest.mark.asyncio +async def test_parse_url_with_subpaths(stub_branches: Callable[[List[str]], None]) -> None: + """ + Test `_parse_remote_repo` with a URL containing branch and subpath. + + Given a URL referencing a branch ("main") and a subdir ("subdir/file"): + When `_parse_remote_repo` is called with remote branch fetching, + Then user, repo, branch, and subpath should be identified correctly. + """ + url = DEMO_URL + "/tree/main/subdir/file" + + stub_branches(["main", "dev", "feature-branch"]) + + query = await _assert_basic_repo_fields(url) + + assert query.user_name == "user" + assert query.repo_name == "repo" + assert query.branch == "main" + assert query.subpath == "/subdir/file" + + +@pytest.mark.asyncio +async def test_parse_url_invalid_repo_structure() -> None: + """ + Test `_parse_remote_repo` with a URL missing a repository name. + + Given a URL like "https://github.com/user": + When `_parse_remote_repo` is called, + Then a ValueError should be raised indicating an invalid repository URL. + """ + url = "https://github.com/user" + + with pytest.raises(ValueError, match="Invalid repository URL"): + await _parse_remote_repo(url) + + +def test_parse_patterns_valid() -> None: + """ + Test `_parse_patterns` with valid comma-separated patterns. + + Given patterns like "*.py, *.md, docs/*": + When `_parse_patterns` is called, + Then it should return a set of parsed strings. + """ + patterns = "*.py, *.md, docs/*" + parsed_patterns = _parse_patterns(patterns) + + assert parsed_patterns == {"*.py", "*.md", "docs/*"} + + +def test_parse_patterns_invalid_characters() -> None: + """ + Test `_parse_patterns` with invalid characters. + + Given a pattern string containing special characters (e.g. "*.py;rm -rf"): + When `_parse_patterns` is called, + Then a ValueError should be raised indicating invalid pattern syntax. + """ + patterns = "*.py;rm -rf" + + with pytest.raises(ValueError, match="Pattern.*contains invalid characters"): + _parse_patterns(patterns) + + +@pytest.mark.asyncio +async def test_parse_query_with_large_file_size() -> None: + """ + Test `parse_query` with a very large file size limit. + + Given a URL and max_file_size=10**9: + When `parse_query` is called, + Then `max_file_size` should be set correctly and default ignore patterns remain unchanged. 
+ """ + query = await parse_query(DEMO_URL, max_file_size=10**9, from_web=True) + + assert query.max_file_size == 10**9 + assert query.ignore_patterns == DEFAULT_IGNORE_PATTERNS + + +@pytest.mark.asyncio +async def test_parse_query_empty_patterns() -> None: + """ + Test `parse_query` with empty patterns. + + Given empty include_patterns and ignore_patterns: + When `parse_query` is called, + Then include_patterns becomes None and default ignore patterns apply. + """ + query = await parse_query(DEMO_URL, max_file_size=50, from_web=True, include_patterns="", ignore_patterns="") + + assert query.include_patterns is None + assert query.ignore_patterns == DEFAULT_IGNORE_PATTERNS + + +@pytest.mark.asyncio +async def test_parse_query_include_and_ignore_overlap() -> None: + """ + Test `parse_query` with overlapping patterns. + + Given include="*.py" and ignore={"*.py", "*.txt"}: + When `parse_query` is called, + Then "*.py" should be removed from ignore patterns. + """ + query = await parse_query( + DEMO_URL, + max_file_size=50, + from_web=True, + include_patterns="*.py", + ignore_patterns={"*.py", "*.txt"}, + ) + + assert query.include_patterns == {"*.py"} + assert query.ignore_patterns is not None + assert "*.py" not in query.ignore_patterns + assert "*.txt" in query.ignore_patterns + + +@pytest.mark.asyncio +async def test_parse_query_local_path() -> None: + """ + Test `parse_query` with a local file path. + + Given "/home/user/project" and from_web=False: + When `parse_query` is called, + Then the local path should be set, id generated, and slug formed accordingly. + """ + path = "/home/user/project" + query = await parse_query(path, max_file_size=100, from_web=False) + tail = Path("home/user/project") + + assert query.local_path.parts[-len(tail.parts) :] == tail.parts + assert query.id is not None + assert query.slug == "home/user/project" + + +@pytest.mark.asyncio +async def test_parse_query_relative_path() -> None: + """ + Test `parse_query` with a relative path. + + Given "./project" and from_web=False: + When `parse_query` is called, + Then local_path resolves relatively, and slug ends with "project". + """ + path = "./project" + query = await parse_query(path, max_file_size=100, from_web=False) + tail = Path("project") + + assert query.local_path.parts[-len(tail.parts) :] == tail.parts + assert query.slug.endswith("project") + + +@pytest.mark.asyncio +async def test_parse_query_empty_source() -> None: + """ + Test `parse_query` with an empty string. + + Given an empty source string: + When `parse_query` is called, + Then a ValueError should be raised indicating an invalid repository URL. + """ + url = "" + + with pytest.raises(ValueError, match="Invalid repository URL"): + await parse_query(url, max_file_size=100, from_web=True) + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "path, expected_branch, expected_commit", + [ + ("/tree/main", "main", None), + ("/tree/abcd1234abcd1234abcd1234abcd1234abcd1234", None, "abcd1234abcd1234abcd1234abcd1234abcd1234"), + ], +) +async def test_parse_url_branch_and_commit_distinction( + path: str, + expected_branch: str, + expected_commit: str, + stub_branches: Callable[[List[str]], None], +) -> None: + """ + Test `_parse_remote_repo` distinguishing branch vs. commit hash. + + Given either a branch URL (e.g., ".../tree/main") or a 40-character commit URL: + When `_parse_remote_repo` is called with branch fetching, + Then the function should correctly set `branch` or `commit` based on the URL content. 
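+    A path segment of exactly 40 hexadecimal characters is treated as a commit hash, while anything
+    else is matched against the stubbed remote branch list.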
+ """ + stub_branches(["main", "dev", "feature-branch"]) + + url = DEMO_URL + path + query = await _assert_basic_repo_fields(url) + + assert query.branch == expected_branch + assert query.commit == expected_commit + + +@pytest.mark.asyncio +async def test_parse_query_uuid_uniqueness() -> None: + """ + Test `parse_query` for unique UUID generation. + + Given the same path twice: + When `parse_query` is called repeatedly, + Then each call should produce a different query id. + """ + path = "/home/user/project" + query_1 = await parse_query(path, max_file_size=100, from_web=False) + query_2 = await parse_query(path, max_file_size=100, from_web=False) + + assert query_1.id != query_2.id + + +@pytest.mark.asyncio +async def test_parse_url_with_query_and_fragment() -> None: + """ + Test `_parse_remote_repo` with query parameters and a fragment. + + Given a URL like "https://github.com/user/repo?arg=value#fragment": + When `_parse_remote_repo` is called, + Then those parts should be stripped, leaving a clean user/repo URL. + """ + url = DEMO_URL + "?arg=value#fragment" + query = await _parse_remote_repo(url) + + assert query.user_name == "user" + assert query.repo_name == "repo" + assert query.url == DEMO_URL # URL should be cleaned + + +@pytest.mark.asyncio +async def test_parse_url_unsupported_host() -> None: + """ + Test `_parse_remote_repo` with an unsupported host. + + Given "https://only-domain.com": + When `_parse_remote_repo` is called, + Then a ValueError should be raised for the unknown domain. + """ + url = "https://only-domain.com" + + with pytest.raises(ValueError, match="Unknown domain 'only-domain.com' in URL"): + await _parse_remote_repo(url) + + +@pytest.mark.asyncio +async def test_parse_query_with_branch() -> None: + """ + Test `parse_query` when a branch is specified in a blob path. + + Given "https://github.com/pandas-dev/pandas/blob/2.2.x/...": + When `parse_query` is called, + Then the branch should be identified, subpath set, and commit remain None. + """ + url = "https://github.com/pandas-dev/pandas/blob/2.2.x/.github/ISSUE_TEMPLATE/documentation_improvement.yaml" + query = await parse_query(url, max_file_size=10**9, from_web=True) + + assert query.user_name == "pandas-dev" + assert query.repo_name == "pandas" + assert query.url == "https://github.com/pandas-dev/pandas" + assert query.slug == "pandas-dev-pandas" + assert query.id is not None + assert query.subpath == "/.github/ISSUE_TEMPLATE/documentation_improvement.yaml" + assert query.branch == "2.2.x" + assert query.commit is None + assert query.type == "blob" + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "path, expected_branch, expected_subpath", + [ + ("/tree/main/src", "main", "/src"), + ("/tree/fix1", "fix1", "/"), + ("/tree/nonexistent-branch/src", "nonexistent-branch", "/src"), + ], +) +async def test_parse_repo_source_with_failed_git_command( + path: str, + expected_branch: str, + expected_subpath: str, + mocker: MockerFixture, +) -> None: + """ + Test `_parse_remote_repo` when git fetch fails. + + Given a URL referencing a branch, but Git fetching fails: + When `_parse_remote_repo` is called, + Then it should fall back to path components for branch identification. 
+ """ + url = DEMO_URL + path + + mock_fetch_branches = mocker.patch("gitingest.utils.git_utils.fetch_remote_branch_list", new_callable=AsyncMock) + mock_fetch_branches.side_effect = Exception("Failed to fetch branch list") + + with pytest.warns( + RuntimeWarning, + match="Warning: Failed to fetch branch list: Command failed: " + "git ls-remote --heads https://github.com/user/repo", + ): + query = await _parse_remote_repo(url) + + assert query.branch == expected_branch + assert query.subpath == expected_subpath + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + ("path", "expected_branch", "expected_subpath"), + [ + ("/tree/feature/fix1/src", "feature/fix1", "/src"), + ("/tree/main/src", "main", "/src"), + ("", None, "/"), + ("/tree/nonexistent-branch/src", None, "/"), + ("/tree/fix", "fix", "/"), + ("/blob/fix/page.html", "fix", "/page.html"), + ], +) +async def test_parse_repo_source_with_various_url_patterns( + path: str, + expected_branch: Optional[str], + expected_subpath: str, + stub_branches: Callable[[List[str]], None], +) -> None: + """ + `_parse_remote_repo` should detect (or reject) a branch and resolve the + sub-path for various GitHub-style URL permutations. + + Branch discovery is stubbed so that only names passed to `stub_branches` are considered "remote". + """ + stub_branches(["feature/fix1", "main", "feature-branch", "fix"]) + + url = DEMO_URL + path + query = await _assert_basic_repo_fields(url) + + assert query.branch == expected_branch + assert query.subpath == expected_subpath + + +async def _assert_basic_repo_fields(url: str) -> IngestionQuery: + """Run _parse_remote_repo and assert user, repo and slug are parsed.""" + + query = await _parse_remote_repo(url) + + assert query.user_name == "user" + assert query.repo_name == "repo" + assert query.slug == "user-repo" + + return query + + From 2ae2f14b0194bd0d9e1b689952462d4c8c9f5122 Mon Sep 17 00:00:00 2001 From: Sendi John Date: Sat, 28 Jun 2025 09:12:30 +0100 Subject: [PATCH 2/9] Delete current_help.txt --- current_help.txt | 36 ------------------------------------ 1 file changed, 36 deletions(-) delete mode 100644 current_help.txt diff --git a/current_help.txt b/current_help.txt deleted file mode 100644 index 0477c796..00000000 --- a/current_help.txt +++ /dev/null @@ -1,36 +0,0 @@ -Usage: gitingest [OPTIONS] [SOURCE] - - Main entry point for the CLI. This function is called when the CLI is run as - a script. - - It calls the async main function to run the command. - - Parameters ---------- source : str A directory path or a Git repository - URL. output : str, optional The path where the output file will be - written. If not specified, the output will be written to a file named - `.txt` in the current directory. Use '-' to output to stdout. - max_size : int Maximum file size (in bytes) to consider. exclude_pattern - : Tuple[str, ...] Glob patterns for pruning the file set. - include_pattern : Tuple[str, ...] Glob patterns for including files in - the output. branch : str, optional Specific branch to ingest (defaults - to the repository's default). include_gitignored : bool If provided, - include files normally ignored by .gitignore. token: str, optional - GitHub personal-access token (PAT). Needed when *source* refers to a - **private** repository. Can also be set via the ``GITHUB_TOKEN`` env var. - -Options: - -o, --output TEXT Output file path (default: .txt in - current directory) - -s, --max-size INTEGER Maximum file size to process in bytes - -e, --exclude-pattern TEXT Patterns to exclude. 
Handles Python's arbitrary - subset of Unix shell-style wildcards. See: - https://docs.python.org/3/library/fnmatch.html - -i, --include-pattern TEXT Patterns to include. Handles Python's arbitrary - subset of Unix shell-style wildcards. See: - https://docs.python.org/3/library/fnmatch.html - -b, --branch TEXT Branch to clone and ingest - --include-gitignored Include files matched by .gitignore - -t, --token TEXT GitHub personal access token for accessing - private repositories. If omitted, the CLI will - look for the GITHUB_TOKEN environment variable. - --help Show this message and exit. From eb424e11bf06e2c0c388dd8c0f0aede35ec79ca0 Mon Sep 17 00:00:00 2001 From: Sendi John Date: Sat, 28 Jun 2025 09:13:18 +0100 Subject: [PATCH 3/9] Delete test.txt --- test.txt | 5928 ------------------------------------------------------ 1 file changed, 5928 deletions(-) delete mode 100644 test.txt diff --git a/test.txt b/test.txt deleted file mode 100644 index 0e2d7c2c..00000000 --- a/test.txt +++ /dev/null @@ -1,5928 +0,0 @@ -Directory structure: -└── gitingest/ - ├── src/ - │ ├── gitingest/ - │ │ ├── __init__.py - │ │ ├── cli.py - │ │ ├── cloning.py - │ │ ├── config.py - │ │ ├── entrypoint.py - │ │ ├── ingestion.py - │ │ ├── output_formatters.py - │ │ ├── query_parsing.py - │ │ ├── schemas/ - │ │ │ ├── __init__.py - │ │ │ ├── filesystem_schema.py - │ │ │ └── ingestion_schema.py - │ │ └── utils/ - │ │ ├── __init__.py - │ │ ├── exceptions.py - │ │ ├── file_utils.py - │ │ ├── git_utils.py - │ │ ├── ignore_patterns.py - │ │ ├── ingestion_utils.py - │ │ ├── notebook_utils.py - │ │ ├── os_utils.py - │ │ ├── path_utils.py - │ │ ├── query_parser_utils.py - │ │ └── timeout_wrapper.py - │ └── server/ - │ ├── __init__.py - │ ├── main.py - │ ├── query_processor.py - │ ├── server_config.py - │ ├── server_utils.py - │ └── routers/ - │ ├── __init__.py - │ ├── download.py - │ ├── dynamic.py - │ └── index.py - └── tests/ - ├── __init__.py - ├── conftest.py - ├── test_cli.py - ├── test_flow_integration.py - ├── test_git_utils.py - ├── test_gitignore_feature.py - ├── test_ingestion.py - ├── test_notebook_utils.py - ├── test_repository_clone.py - └── query_parser/ - ├── __init__.py - ├── test_git_host_agnostic.py - └── test_query_parser.py - -================================================ -FILE: src/gitingest/__init__.py -================================================ -"""Gitingest: A package for ingesting data from Git repositories.""" - -from gitingest.cloning import clone_repo -from gitingest.entrypoint import ingest, ingest_async -from gitingest.ingestion import ingest_query -from gitingest.query_parsing import parse_query - -__all__ = ["ingest_query", "clone_repo", "parse_query", "ingest", "ingest_async"] - - - -================================================ -FILE: src/gitingest/cli.py -================================================ -"""Command-line interface for the Gitingest package.""" - -# pylint: disable=no-value-for-parameter - -import asyncio -from typing import Optional, Tuple - -import click - -from gitingest.config import MAX_FILE_SIZE, OUTPUT_FILE_NAME -from gitingest.entrypoint import ingest_async - - -@click.command() -@click.argument("source", type=str, default=".") -@click.option( - "--output", - "-o", - default=None, - help="Output file path (default: digest.txt in current directory). 
Use '-' for stdout.", -) -@click.option( - "--max-size", - "-s", - default=MAX_FILE_SIZE, - help="Maximum file size to process in bytes", -) -@click.option( - "--exclude-pattern", - "-e", - multiple=True, - help=( - "Patterns to exclude. Handles Python's arbitrary subset of Unix shell-style " - "wildcards. See: https://docs.python.org/3/library/fnmatch.html" - ), -) -@click.option( - "--include-pattern", - "-i", - multiple=True, - help=( - "Patterns to include. Handles Python's arbitrary subset of Unix shell-style " - "wildcards. See: https://docs.python.org/3/library/fnmatch.html" - ), -) -@click.option("--branch", "-b", default=None, help="Branch to clone and ingest") -@click.option( - "--include-gitignored", - is_flag=True, - default=False, - help="Include files matched by .gitignore", -) -@click.option( - "--token", - "-t", - envvar="GITHUB_TOKEN", - default=None, - help=( - "GitHub personal access token for accessing private repositories. " - "If omitted, the CLI will look for the GITHUB_TOKEN environment variable." - ), -) -def main( - source: str, - output: Optional[str], - max_size: int, - exclude_pattern: Tuple[str, ...], - include_pattern: Tuple[str, ...], - branch: Optional[str], - include_gitignored: bool, - token: Optional[str], -): - """ - Main entry point for the CLI. This function is called when the CLI is run as a script. - - It calls the async main function to run the command. - - Parameters - ---------- - source : str - A directory path or a Git repository URL. - output : str, optional - The path where the output file will be written. If not specified, the output will be written - to a file named `digest.txt` in the current directory. Use '-' to output to stdout. - max_size : int - Maximum file size (in bytes) to consider. - exclude_pattern : Tuple[str, ...] - Glob patterns for pruning the file set. - include_pattern : Tuple[str, ...] - Glob patterns for including files in the output. - branch : str, optional - Specific branch to ingest (defaults to the repository's default). - include_gitignored : bool - If provided, include files normally ignored by .gitignore. - token: str, optional - GitHub personal-access token (PAT). Needed when *source* refers to a - **private** repository. Can also be set via the ``GITHUB_TOKEN`` env var. - - Examples - -------- - Basic usage: - $ gitingest . - $ gitingest /path/to/repo - $ gitingest https://github.com/user/repo - - Output to stdout: - $ gitingest . -o - - $ gitingest https://github.com/user/repo --output - - - With filtering: - $ gitingest . -i "*.py" -e "*.log" - $ gitingest . --include-pattern "*.js" --exclude-pattern "node_modules/*" - - Private repositories: - $ gitingest https://github.com/user/private-repo -t ghp_token - $ GITHUB_TOKEN=ghp_token gitingest https://github.com/user/private-repo - """ - asyncio.run( - _async_main( - source=source, - output=output, - max_size=max_size, - exclude_pattern=exclude_pattern, - include_pattern=include_pattern, - branch=branch, - include_gitignored=include_gitignored, - token=token, - ) - ) - - -async def _async_main( - source: str, - output: Optional[str], - max_size: int, - exclude_pattern: Tuple[str, ...], - include_pattern: Tuple[str, ...], - branch: Optional[str], - include_gitignored: bool, - token: Optional[str], -) -> None: - """ - Analyze a directory or repository and create a text dump of its contents. 
- - This command analyzes the contents of a specified source directory or repository, applies custom include and - exclude patterns, and generates a text summary of the analysis which is then written to an output file - or printed to stdout. - - Parameters - ---------- - source : str - A directory path or a Git repository URL. - output : str, optional - The path where the output file will be written. If not specified, the output will be written - to a file named `digest.txt` in the current directory. Use '-' to output to stdout. - max_size : int - Maximum file size (in bytes) to consider. - exclude_pattern : Tuple[str, ...] - Glob patterns for pruning the file set. - include_pattern : Tuple[str, ...] - Glob patterns for including files in the output. - branch : str, optional - Specific branch to ingest (defaults to the repository's default). - include_gitignored : bool - If provided, include files normally ignored by .gitignore. - token: str, optional - GitHub personal-access token (PAT). Needed when *source* refers to a - **private** repository. Can also be set via the ``GITHUB_TOKEN`` env var. - - Raises - ------ - Abort - If there is an error during the execution of the command, this exception is raised to abort the process. - """ - try: - # Normalise pattern containers (the ingest layer expects sets) - exclude_patterns = set(exclude_pattern) - include_patterns = set(include_pattern) - - output_target = output if output is not None else OUTPUT_FILE_NAME - - if output_target == "-": - click.echo("Analyzing source, preparing output for stdout...", err=True) - else: - click.echo(f"Analyzing source, output will be written to '{output_target}'...", err=True) - - summary, _, _ = await ingest_async( - source=source, - max_file_size=max_size, - include_patterns=include_patterns, - exclude_patterns=exclude_patterns, - branch=branch, - output=output_target, - include_gitignored=include_gitignored, - token=token, - ) - - if output_target == "-": # stdout - click.echo("\n--- Summary ---", err=True) - click.echo(summary, err=True) - click.echo("--- End Summary ---", err=True) - click.echo("Analysis complete! Output sent to stdout.", err=True) - else: # file - click.echo(f"Analysis complete! Output written to: {output_target}") - click.echo("\nSummary:") - click.echo(summary) - - except Exception as exc: - # Convert any exception into Click.Abort so that exit status is non-zero - click.echo(f"Error: {exc}", err=True) - raise click.Abort() from exc - - -if __name__ == "__main__": - main() - - -================================================ -FILE: src/gitingest/cloning.py -================================================ -"""This module contains functions for cloning a Git repository to a local path.""" - -from pathlib import Path -from typing import Optional - -from gitingest.config import DEFAULT_TIMEOUT -from gitingest.schemas import CloneConfig -from gitingest.utils.git_utils import ( - check_repo_exists, - create_git_auth_header, - create_git_command, - ensure_git_installed, - run_command, - validate_github_token, -) -from gitingest.utils.os_utils import ensure_directory -from gitingest.utils.timeout_wrapper import async_timeout - - -@async_timeout(DEFAULT_TIMEOUT) -async def clone_repo(config: CloneConfig, token: Optional[str] = None) -> None: - """ - Clone a repository to a local path based on the provided configuration. - - This function handles the process of cloning a Git repository to the local file system. 
-    It can clone a specific branch or commit if provided, and it raises exceptions if
-    any errors occur during the cloning process.
-
-    Parameters
-    ----------
-    config : CloneConfig
-        The configuration for cloning the repository.
-    token : str, optional
-        GitHub personal-access token (PAT). Needed when *source* refers to a
-        **private** repository. Can also be set via the ``GITHUB_TOKEN`` env var.
-        Must start with 'github_pat_' or 'ghp_' for GitHub repositories.
-
-    Raises
-    ------
-    ValueError
-        If the repository is not found, if the provided URL is invalid, or if the token format is invalid.
-    """
-    # Extract and validate query parameters
-    url: str = config.url
-    local_path: str = config.local_path
-    commit: Optional[str] = config.commit
-    branch: Optional[str] = config.branch
-    partial_clone: bool = config.subpath != "/"
-
-    # Validate token if provided
-    if token and url.startswith("https://github.com"):
-        validate_github_token(token)
-
-    # Create parent directory if it doesn't exist
-    await ensure_directory(Path(local_path).parent)
-
-    # Check if the repository exists
-    if not await check_repo_exists(url, token=token):
-        raise ValueError("Repository not found. Make sure it is public or that you have provided a valid token.")
-
-    clone_cmd = ["git"]
-    if token and url.startswith("https://github.com"):
-        clone_cmd += ["-c", create_git_auth_header(token)]
-
-    clone_cmd += ["clone", "--single-branch"]
-    # TODO: Re-enable --recurse-submodules when submodule support is needed
-
-    if partial_clone:
-        clone_cmd += ["--filter=blob:none", "--sparse"]
-
-    if not commit:
-        clone_cmd += ["--depth=1"]
-        if branch and branch.lower() not in ("main", "master"):
-            clone_cmd += ["--branch", branch]
-
-    clone_cmd += [url, local_path]
-
-    # Clone the repository
-    await ensure_git_installed()
-    await run_command(*clone_cmd)
-
-    # Checkout the subpath if it is a partial clone
-    if partial_clone:
-        subpath = config.subpath.lstrip("/")
-        if config.blob:
-            # When ingesting from a file url (blob/branch/path/file.txt), we need to remove the file name.
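-            # Illustrative example (hypothetical path): a blob subpath such as
-            # "docs/guide.md" is reduced to "docs" before the sparse-checkout below.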
- subpath = str(Path(subpath).parent.as_posix()) - - checkout_cmd = create_git_command(["git"], local_path, url, token) - await run_command(*checkout_cmd, "sparse-checkout", "set", subpath) - - # Checkout the commit if it is provided - if commit: - checkout_cmd = create_git_command(["git"], local_path, url, token) - await run_command(*checkout_cmd, "checkout", commit) - - - -================================================ -FILE: src/gitingest/config.py -================================================ -"""Configuration file for the project.""" - -import tempfile -from pathlib import Path - -MAX_FILE_SIZE = 10 * 1024 * 1024 # 10 MB -MAX_DIRECTORY_DEPTH = 20 # Maximum depth of directory traversal -MAX_FILES = 10_000 # Maximum number of files to process -MAX_TOTAL_SIZE_BYTES = 500 * 1024 * 1024 # 500 MB -DEFAULT_TIMEOUT = 60 # seconds - -OUTPUT_FILE_NAME = "digest.txt" - -TMP_BASE_PATH = Path(tempfile.gettempdir()) / "gitingest" - - - -================================================ -FILE: src/gitingest/entrypoint.py -================================================ -"""Main entry point for ingesting a source and processing its contents.""" - -import asyncio -import inspect -import os -import shutil -import sys -from typing import Optional, Set, Tuple, Union - -from gitingest.cloning import clone_repo -from gitingest.config import TMP_BASE_PATH -from gitingest.ingestion import ingest_query -from gitingest.query_parsing import IngestionQuery, parse_query -from gitingest.utils.ignore_patterns import load_gitignore_patterns - - -async def ingest_async( - source: str, - max_file_size: int = 10 * 1024 * 1024, # 10 MB - include_patterns: Optional[Union[str, Set[str]]] = None, - exclude_patterns: Optional[Union[str, Set[str]]] = None, - branch: Optional[str] = None, - include_gitignored: bool = False, - token: Optional[str] = None, - output: Optional[str] = None, -) -> Tuple[str, str, str]: - """ - Main entry point for ingesting a source and processing its contents. - - This function analyzes a source (URL or local path), clones the corresponding repository (if applicable), - and processes its files according to the specified query parameters. It returns a summary, a tree-like - structure of the files, and the content of the files. The results can optionally be written to an output file. - - Parameters - ---------- - source : str - The source to analyze, which can be a URL (for a Git repository) or a local directory path. - max_file_size : int - Maximum allowed file size for file ingestion. Files larger than this size are ignored, by default - 10*1024*1024 (10 MB). - include_patterns : Union[str, Set[str]], optional - Pattern or set of patterns specifying which files to include. If `None`, all files are included. - exclude_patterns : Union[str, Set[str]], optional - Pattern or set of patterns specifying which files to exclude. If `None`, no files are excluded. - branch : str, optional - The branch to clone and ingest. If `None`, the default branch is used. - include_gitignored : bool - If ``True``, include files ignored by ``.gitignore``. Defaults to ``False``. - token : str, optional - GitHub personal-access token (PAT). Needed when *source* refers to a - **private** repository. Can also be set via the ``GITHUB_TOKEN`` env var. - output : str, optional - File path where the summary and content should be written. If `None`, the results are not written to a file. - - Returns - ------- - Tuple[str, str, str] - A tuple containing: - - A summary string of the analyzed repository or directory. 
- - A tree-like string representation of the file structure. - - The content of the files in the repository or directory. - - Raises - ------ - TypeError - If `clone_repo` does not return a coroutine, or if the `source` is of an unsupported type. - """ - repo_cloned = False - - if not token: - token = os.getenv("GITHUB_TOKEN") - - try: - query: IngestionQuery = await parse_query( - source=source, - max_file_size=max_file_size, - from_web=False, - include_patterns=include_patterns, - ignore_patterns=exclude_patterns, - token=token, - ) - - if not include_gitignored: - gitignore_patterns = load_gitignore_patterns(query.local_path) - query.ignore_patterns.update(gitignore_patterns) - - if query.url: - selected_branch = branch if branch else query.branch # prioritize branch argument - query.branch = selected_branch - - clone_config = query.extract_clone_config() - clone_coroutine = clone_repo(clone_config, token=token) - - if inspect.iscoroutine(clone_coroutine): - if asyncio.get_event_loop().is_running(): - await clone_coroutine - else: - asyncio.run(clone_coroutine) - else: - raise TypeError("clone_repo did not return a coroutine as expected.") - - repo_cloned = True - - summary, tree, content = ingest_query(query) - - if output == "-": - loop = asyncio.get_running_loop() - output_data = tree + "\n" + content - await loop.run_in_executor(None, sys.stdout.write, output_data) - await loop.run_in_executor(None, sys.stdout.flush) - elif output is not None: - with open(output, "w", encoding="utf-8") as f: - f.write(tree + "\n" + content) - - return summary, tree, content - finally: - # Clean up the temporary directory if it was created - if repo_cloned: - shutil.rmtree(TMP_BASE_PATH, ignore_errors=True) - - -def ingest( - source: str, - max_file_size: int = 10 * 1024 * 1024, # 10 MB - include_patterns: Optional[Union[str, Set[str]]] = None, - exclude_patterns: Optional[Union[str, Set[str]]] = None, - branch: Optional[str] = None, - include_gitignored: bool = False, - token: Optional[str] = None, - output: Optional[str] = None, -) -> Tuple[str, str, str]: - """ - Synchronous version of ingest_async. - - This function analyzes a source (URL or local path), clones the corresponding repository (if applicable), - and processes its files according to the specified query parameters. It returns a summary, a tree-like - structure of the files, and the content of the files. The results can optionally be written to an output file. - - Parameters - ---------- - source : str - The source to analyze, which can be a URL (for a Git repository) or a local directory path. - max_file_size : int - Maximum allowed file size for file ingestion. Files larger than this size are ignored, by default - 10*1024*1024 (10 MB). - include_patterns : Union[str, Set[str]], optional - Pattern or set of patterns specifying which files to include. If `None`, all files are included. - exclude_patterns : Union[str, Set[str]], optional - Pattern or set of patterns specifying which files to exclude. If `None`, no files are excluded. - branch : str, optional - The branch to clone and ingest. If `None`, the default branch is used. - include_gitignored : bool - If ``True``, include files ignored by ``.gitignore``. Defaults to ``False``. - token : str, optional - GitHub personal-access token (PAT). Needed when *source* refers to a - **private** repository. Can also be set via the ``GITHUB_TOKEN`` env var. - output : str, optional - File path where the summary and content should be written. If `None`, the results are not written to a file. 
- - Returns - ------- - Tuple[str, str, str] - A tuple containing: - - A summary string of the analyzed repository or directory. - - A tree-like string representation of the file structure. - - The content of the files in the repository or directory. - - See Also - -------- - ingest_async : The asynchronous version of this function. - """ - return asyncio.run( - ingest_async( - source=source, - max_file_size=max_file_size, - include_patterns=include_patterns, - exclude_patterns=exclude_patterns, - branch=branch, - include_gitignored=include_gitignored, - token=token, - output=output, - ) - ) - - - -================================================ -FILE: src/gitingest/ingestion.py -================================================ -"""Functions to ingest and analyze a codebase directory or single file.""" - -import warnings -from pathlib import Path -from typing import Tuple - -from gitingest.config import MAX_DIRECTORY_DEPTH, MAX_FILES, MAX_TOTAL_SIZE_BYTES -from gitingest.output_formatters import format_node -from gitingest.query_parsing import IngestionQuery -from gitingest.schemas import FileSystemNode, FileSystemNodeType, FileSystemStats -from gitingest.utils.ingestion_utils import _should_exclude, _should_include - -try: - import tomllib # type: ignore[import] -except ImportError: - import tomli as tomllib - - -def ingest_query(query: IngestionQuery) -> Tuple[str, str, str]: - """ - Run the ingestion process for a parsed query. - - This is the main entry point for analyzing a codebase directory or single file. It processes the query - parameters, reads the file or directory content, and generates a summary, directory structure, and file content, - along with token estimations. - - Parameters - ---------- - query : IngestionQuery - The parsed query object containing information about the repository and query parameters. - - Returns - ------- - Tuple[str, str, str] - A tuple containing the summary, directory structure, and file contents. - - Raises - ------ - ValueError - If the path cannot be found, is not a file, or the file has no content. - """ - subpath = Path(query.subpath.strip("/")).as_posix() - path = query.local_path / subpath - - apply_gitingest_file(path, query) - - if not path.exists(): - raise ValueError(f"{query.slug} cannot be found") - - if (query.type and query.type == "blob") or query.local_path.is_file(): - # TODO: We do this wrong! We should still check the branch and commit! - if not path.is_file(): - raise ValueError(f"Path {path} is not a file") - - relative_path = path.relative_to(query.local_path) - - file_node = FileSystemNode( - name=path.name, - type=FileSystemNodeType.FILE, - size=path.stat().st_size, - file_count=1, - path_str=str(relative_path), - path=path, - ) - - if not file_node.content: - raise ValueError(f"File {file_node.name} has no content") - - return format_node(file_node, query) - - root_node = FileSystemNode( - name=path.name, - type=FileSystemNodeType.DIRECTORY, - path_str=str(path.relative_to(query.local_path)), - path=path, - ) - - stats = FileSystemStats() - - _process_node( - node=root_node, - query=query, - stats=stats, - ) - - return format_node(root_node, query) - - -def apply_gitingest_file(path: Path, query: IngestionQuery) -> None: - """ - Apply the .gitingest file to the query object. - - This function reads the .gitingest file in the specified path and updates the query object with the ignore - patterns found in the file. - - Parameters - ---------- - path : Path - The path of the directory to ingest. 
- query : IngestionQuery - The parsed query object containing information about the repository and query parameters. - It should have an attribute `ignore_patterns` which is either None or a set of strings. - """ - path_gitingest = path / ".gitingest" - - if not path_gitingest.is_file(): - return - - try: - with path_gitingest.open("rb") as f: - data = tomllib.load(f) - except tomllib.TOMLDecodeError as exc: - warnings.warn(f"Invalid TOML in {path_gitingest}: {exc}", UserWarning) - return - - config_section = data.get("config", {}) - ignore_patterns = config_section.get("ignore_patterns") - - if not ignore_patterns: - return - - # If a single string is provided, make it a list of one element - if isinstance(ignore_patterns, str): - ignore_patterns = [ignore_patterns] - - if not isinstance(ignore_patterns, (list, set)): - warnings.warn( - f"Expected a list/set for 'ignore_patterns', got {type(ignore_patterns)} in {path_gitingest}. Skipping.", - UserWarning, - ) - return - - # Filter out duplicated patterns - ignore_patterns = set(ignore_patterns) - - # Filter out any non-string entries - valid_patterns = {pattern for pattern in ignore_patterns if isinstance(pattern, str)} - invalid_patterns = ignore_patterns - valid_patterns - - if invalid_patterns: - warnings.warn(f"Ignore patterns {invalid_patterns} are not strings. Skipping.", UserWarning) - - if not valid_patterns: - return - - if query.ignore_patterns is None: - query.ignore_patterns = valid_patterns - else: - query.ignore_patterns.update(valid_patterns) - - return - - -def _process_node( - node: FileSystemNode, - query: IngestionQuery, - stats: FileSystemStats, -) -> None: - """ - Process a file or directory item within a directory. - - This function handles each file or directory item, checking if it should be included or excluded based on the - provided patterns. It handles symlinks, directories, and files accordingly. - - Parameters - ---------- - node : FileSystemNode - The current directory or file node being processed. - query : IngestionQuery - The parsed query object containing information about the repository and query parameters. - stats : FileSystemStats - Statistics tracking object for the total file count and size. 
- """ - - if limit_exceeded(stats, node.depth): - return - - for sub_path in node.path.iterdir(): - - if query.ignore_patterns and _should_exclude(sub_path, query.local_path, query.ignore_patterns): - continue - - if query.include_patterns and not _should_include(sub_path, query.local_path, query.include_patterns): - continue - - if sub_path.is_symlink(): - _process_symlink(path=sub_path, parent_node=node, stats=stats, local_path=query.local_path) - elif sub_path.is_file(): - _process_file(path=sub_path, parent_node=node, stats=stats, local_path=query.local_path) - elif sub_path.is_dir(): - - child_directory_node = FileSystemNode( - name=sub_path.name, - type=FileSystemNodeType.DIRECTORY, - path_str=str(sub_path.relative_to(query.local_path)), - path=sub_path, - depth=node.depth + 1, - ) - - _process_node( - node=child_directory_node, - query=query, - stats=stats, - ) - - if not child_directory_node.children: - continue - - node.children.append(child_directory_node) - node.size += child_directory_node.size - node.file_count += child_directory_node.file_count - node.dir_count += 1 + child_directory_node.dir_count - else: - print(f"Warning: {sub_path} is an unknown file type, skipping") - - node.sort_children() - - -def _process_symlink(path: Path, parent_node: FileSystemNode, stats: FileSystemStats, local_path: Path) -> None: - """ - Process a symlink in the file system. - - This function checks the symlink's target. - - Parameters - ---------- - path : Path - The full path of the symlink. - parent_node : FileSystemNode - The parent directory node. - stats : FileSystemStats - Statistics tracking object for the total file count and size. - local_path : Path - The base path of the repository or directory being processed. - """ - child = FileSystemNode( - name=path.name, - type=FileSystemNodeType.SYMLINK, - path_str=str(path.relative_to(local_path)), - path=path, - depth=parent_node.depth + 1, - ) - stats.total_files += 1 - parent_node.children.append(child) - parent_node.file_count += 1 - - -def _process_file(path: Path, parent_node: FileSystemNode, stats: FileSystemStats, local_path: Path) -> None: - """ - Process a file in the file system. - - This function checks the file's size, increments the statistics, and reads its content. - If the file size exceeds the maximum allowed, it raises an error. - - Parameters - ---------- - path : Path - The full path of the file. - parent_node : FileSystemNode - The dictionary to accumulate the results. - stats : FileSystemStats - Statistics tracking object for the total file count and size. - local_path : Path - The base path of the repository or directory being processed. - """ - file_size = path.stat().st_size - if stats.total_size + file_size > MAX_TOTAL_SIZE_BYTES: - print(f"Skipping file {path}: would exceed total size limit") - return - - stats.total_files += 1 - stats.total_size += file_size - - if stats.total_files > MAX_FILES: - print(f"Maximum file limit ({MAX_FILES}) reached") - return - - child = FileSystemNode( - name=path.name, - type=FileSystemNodeType.FILE, - size=file_size, - file_count=1, - path_str=str(path.relative_to(local_path)), - path=path, - depth=parent_node.depth + 1, - ) - - parent_node.children.append(child) - parent_node.size += file_size - parent_node.file_count += 1 - - -def limit_exceeded(stats: FileSystemStats, depth: int) -> bool: - """ - Check if any of the traversal limits have been exceeded. 
-
-    This function checks if the current traversal has exceeded any of the configured limits:
-    maximum directory depth, maximum number of files, or maximum total size in bytes.
-
-    Parameters
-    ----------
-    stats : FileSystemStats
-        Statistics tracking object for the total file count and size.
-    depth : int
-        The current depth of directory traversal.
-
-    Returns
-    -------
-    bool
-        True if any limit has been exceeded, False otherwise.
-    """
-    if depth > MAX_DIRECTORY_DEPTH:
-        print(f"Maximum depth limit ({MAX_DIRECTORY_DEPTH}) reached")
-        return True
-
-    if stats.total_files >= MAX_FILES:
-        print(f"Maximum file limit ({MAX_FILES}) reached")
-        return True  # TODO: end recursion
-
-    if stats.total_size >= MAX_TOTAL_SIZE_BYTES:
-        print(f"Maximum total size limit ({MAX_TOTAL_SIZE_BYTES/1024/1024:.1f}MB) reached")
-        return True  # TODO: end recursion
-
-    return False
-
-
-
-================================================
-FILE: src/gitingest/output_formatters.py
-================================================
-"""Functions to ingest and analyze a codebase directory or single file."""
-
-from typing import Optional, Tuple
-
-import tiktoken
-
-from gitingest.query_parsing import IngestionQuery
-from gitingest.schemas import FileSystemNode, FileSystemNodeType
-
-
-def format_node(node: FileSystemNode, query: IngestionQuery) -> Tuple[str, str, str]:
-    """
-    Generate a summary, directory structure, and file contents for a given file system node.
-
-    If the node represents a directory, the function will recursively process its contents.
-
-    Parameters
-    ----------
-    node : FileSystemNode
-        The file system node to be summarized.
-    query : IngestionQuery
-        The parsed query object containing information about the repository and query parameters.
-
-    Returns
-    -------
-    Tuple[str, str, str]
-        A tuple containing the summary, directory structure, and file contents.
-    """
-    is_single_file = node.type == FileSystemNodeType.FILE
-    summary = _create_summary_prefix(query, single_file=is_single_file)
-
-    if node.type == FileSystemNodeType.DIRECTORY:
-        summary += f"Files analyzed: {node.file_count}\n"
-    elif node.type == FileSystemNodeType.FILE:
-        summary += f"File: {node.name}\n"
-        summary += f"Lines: {len(node.content.splitlines()):,}\n"
-
-    tree = "Directory structure:\n" + _create_tree_structure(query, node)
-
-    content = _gather_file_contents(node)
-
-    token_estimate = _format_token_count(tree + content)
-    if token_estimate:
-        summary += f"\nEstimated tokens: {token_estimate}"
-
-    return summary, tree, content
-
-
-def _create_summary_prefix(query: IngestionQuery, single_file: bool = False) -> str:
-    """
-    Create a prefix string for summarizing a repository or local directory.
-
-    Includes repository name (if provided), commit/branch details, and subpath if relevant.
-
-    Parameters
-    ----------
-    query : IngestionQuery
-        The parsed query object containing information about the repository and query parameters.
-    single_file : bool
-        A flag indicating whether the summary is for a single file, by default False.
-
-    Returns
-    -------
-    str
-        A summary prefix string containing repository, commit, branch, and subpath details.
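-
-    Examples
-    --------
-    For an illustrative remote query of ``user/repo`` on branch ``dev`` (assumed values), the prefix would be::
-
-        Repository: user/repo
-        Branch: dev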
- """ - parts = [] - - if query.user_name: - parts.append(f"Repository: {query.user_name}/{query.repo_name}") - else: - # Local scenario - parts.append(f"Directory: {query.slug}") - - if query.commit: - parts.append(f"Commit: {query.commit}") - elif query.branch and query.branch not in ("main", "master"): - parts.append(f"Branch: {query.branch}") - - if query.subpath != "/" and not single_file: - parts.append(f"Subpath: {query.subpath}") - - return "\n".join(parts) + "\n" - - -def _gather_file_contents(node: FileSystemNode) -> str: - """ - Recursively gather contents of all files under the given node. - - This function recursively processes a directory node and gathers the contents of all files - under that node. It returns the concatenated content of all files as a single string. - - Parameters - ---------- - node : FileSystemNode - The current directory or file node being processed. - - Returns - ------- - str - The concatenated content of all files under the given node. - """ - if node.type != FileSystemNodeType.DIRECTORY: - return node.content_string - - # Recursively gather contents of all files under the current directory - return "\n".join(_gather_file_contents(child) for child in node.children) - - -def _create_tree_structure(query: IngestionQuery, node: FileSystemNode, prefix: str = "", is_last: bool = True) -> str: - """ - Generate a tree-like string representation of the file structure. - - This function generates a string representation of the directory structure, formatted - as a tree with appropriate indentation for nested directories and files. - - Parameters - ---------- - query : IngestionQuery - The parsed query object containing information about the repository and query parameters. - node : FileSystemNode - The current directory or file node being processed. - prefix : str - A string used for indentation and formatting of the tree structure, by default "". - is_last : bool - A flag indicating whether the current node is the last in its directory, by default True. - - Returns - ------- - str - A string representing the directory structure formatted as a tree. - """ - if not node.name: - # If no name is present, use the slug as the top-level directory name - node.name = query.slug - - tree_str = "" - current_prefix = "└── " if is_last else "├── " - - # Indicate directories with a trailing slash - display_name = node.name - if node.type == FileSystemNodeType.DIRECTORY: - display_name += "/" - elif node.type == FileSystemNodeType.SYMLINK: - display_name += " -> " + node.path.readlink().name - - tree_str += f"{prefix}{current_prefix}{display_name}\n" - - if node.type == FileSystemNodeType.DIRECTORY and node.children: - prefix += " " if is_last else "│ " - for i, child in enumerate(node.children): - tree_str += _create_tree_structure(query, node=child, prefix=prefix, is_last=i == len(node.children) - 1) - return tree_str - - -def _format_token_count(text: str) -> Optional[str]: - """ - Return a human-readable string representing the token count of the given text. - - E.g., '120' -> '120', '1200' -> '1.2k', '1200000' -> '1.2M'. - - Parameters - ---------- - text : str - The text string for which the token count is to be estimated. - - Returns - ------- - str, optional - The formatted number of tokens as a string (e.g., '1.2k', '1.2M'), or `None` if an error occurs. 
-    """
-    try:
-        encoding = tiktoken.get_encoding("o200k_base")  # gpt-4o, gpt-4o-mini
-        total_tokens = len(encoding.encode(text, disallowed_special=()))
-    except (ValueError, UnicodeEncodeError) as exc:
-        print(exc)
-        return None
-
-    if total_tokens >= 1_000_000:
-        return f"{total_tokens / 1_000_000:.1f}M"
-
-    if total_tokens >= 1_000:
-        return f"{total_tokens / 1_000:.1f}k"
-
-    return str(total_tokens)
-
-
-
-================================================
-FILE: src/gitingest/query_parsing.py
-================================================
-"""This module contains functions to parse and validate input sources and patterns."""
-
-import re
-import uuid
-import warnings
-from pathlib import Path
-from typing import List, Optional, Set, Union
-from urllib.parse import unquote, urlparse
-
-from gitingest.config import TMP_BASE_PATH
-from gitingest.schemas import IngestionQuery
-from gitingest.utils.exceptions import InvalidPatternError
-from gitingest.utils.git_utils import check_repo_exists, fetch_remote_branch_list
-from gitingest.utils.ignore_patterns import DEFAULT_IGNORE_PATTERNS
-from gitingest.utils.query_parser_utils import (
-    KNOWN_GIT_HOSTS,
-    _get_user_and_repo_from_path,
-    _is_valid_git_commit_hash,
-    _is_valid_pattern,
-    _normalize_pattern,
-    _validate_host,
-    _validate_url_scheme,
-)
-
-
-async def parse_query(
-    source: str,
-    max_file_size: int,
-    from_web: bool,
-    include_patterns: Optional[Union[str, Set[str]]] = None,
-    ignore_patterns: Optional[Union[str, Set[str]]] = None,
-    token: Optional[str] = None,
-) -> IngestionQuery:
-    """
-    Parse the input source (URL or path) to extract relevant details for the query.
-
-    This function parses the input source to extract details such as the username, repository name,
-    commit hash, branch name, and other relevant information. It also processes the include and ignore
-    patterns to filter the files and directories to include or exclude from the query.
-
-    Parameters
-    ----------
-    source : str
-        The source URL or file path to parse.
-    max_file_size : int
-        The maximum file size in bytes to include.
-    from_web : bool
-        Flag indicating whether the source is a web URL.
-    include_patterns : Union[str, Set[str]], optional
-        Patterns to include, by default None. Can be a set of strings or a single string.
-    ignore_patterns : Union[str, Set[str]], optional
-        Patterns to ignore, by default None. Can be a set of strings or a single string.
-    token : str, optional
-        GitHub personal-access token (PAT). Needed when *source* refers to a
-        **private** repository. Can also be set via the ``GITHUB_TOKEN`` env var.
-        Must start with 'github_pat_' or 'ghp_' for GitHub repositories.
-
-    Returns
-    -------
-    IngestionQuery
-        An ``IngestionQuery`` model containing the parsed details of the repository or file path.
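-
-    Examples
-    --------
-    Illustrative usage from within an event loop (values assumed)::
-
-        query = await parse_query("https://github.com/user/repo", max_file_size=10**6, from_web=True)
-        assert query.slug == "user-repo"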
- """ - - # Determine the parsing method based on the source type - if from_web or urlparse(source).scheme in ("https", "http") or any(h in source for h in KNOWN_GIT_HOSTS): - # We either have a full URL or a domain-less slug - query = await _parse_remote_repo(source, token=token) - else: - # Local path scenario - query = _parse_local_dir_path(source) - - # Combine default ignore patterns + custom patterns - ignore_patterns_set = DEFAULT_IGNORE_PATTERNS.copy() - if ignore_patterns: - ignore_patterns_set.update(_parse_patterns(ignore_patterns)) - - # Process include patterns and override ignore patterns accordingly - if include_patterns: - parsed_include = _parse_patterns(include_patterns) - # Override ignore patterns with include patterns - ignore_patterns_set = set(ignore_patterns_set) - set(parsed_include) - else: - parsed_include = None - - return IngestionQuery( - user_name=query.user_name, - repo_name=query.repo_name, - url=query.url, - subpath=query.subpath, - local_path=query.local_path, - slug=query.slug, - id=query.id, - type=query.type, - branch=query.branch, - commit=query.commit, - max_file_size=max_file_size, - ignore_patterns=ignore_patterns_set, - include_patterns=parsed_include, - ) - - -async def _parse_remote_repo(source: str, token: Optional[str] = None) -> IngestionQuery: - """ - Parse a repository URL into a structured query dictionary. - - If source is: - - A fully qualified URL (https://gitlab.com/...), parse & verify that domain - - A URL missing 'https://' (gitlab.com/...), add 'https://' and parse - - A 'slug' (like 'pandas-dev/pandas'), attempt known domains until we find one that exists. - - Parameters - ---------- - source : str - The URL or domain-less slug to parse. - token : str, optional - GitHub personal-access token (PAT). Needed when *source* refers to a - **private** repository. Can also be set via the ``GITHUB_TOKEN`` env var. - - Returns - ------- - IngestionQuery - A dictionary containing the parsed details of the repository. - """ - source = unquote(source) - - # Attempt to parse - parsed_url = urlparse(source) - - if parsed_url.scheme: - _validate_url_scheme(parsed_url.scheme) - _validate_host(parsed_url.netloc.lower()) - - else: # Will be of the form 'host/user/repo' or 'user/repo' - tmp_host = source.split("/")[0].lower() - if "." in tmp_host: - _validate_host(tmp_host) - else: - # No scheme, no domain => user typed "user/repo", so we'll guess the domain. - host = await try_domains_for_user_and_repo(*_get_user_and_repo_from_path(source), token=token) - source = f"{host}/{source}" - - source = "https://" + source - parsed_url = urlparse(source) - - host = parsed_url.netloc.lower() - user_name, repo_name = _get_user_and_repo_from_path(parsed_url.path) - - _id = str(uuid.uuid4()) - slug = f"{user_name}-{repo_name}" - local_path = TMP_BASE_PATH / _id / slug - url = f"https://{host}/{user_name}/{repo_name}" - - parsed = IngestionQuery( - user_name=user_name, - repo_name=repo_name, - url=url, - local_path=local_path, - slug=slug, - id=_id, - ) - - remaining_parts = parsed_url.path.strip("/").split("/")[2:] - - if not remaining_parts: - return parsed - - possible_type = remaining_parts.pop(0) # e.g. 
'issues', 'pull', 'tree', 'blob' - - # If no extra path parts, just return - if not remaining_parts: - return parsed - - # If this is an issues page or pull requests, return early without processing subpath - if remaining_parts and possible_type in ("issues", "pull"): - return parsed - - parsed.type = possible_type - - # Commit or branch - commit_or_branch = remaining_parts[0] - if _is_valid_git_commit_hash(commit_or_branch): - parsed.commit = commit_or_branch - remaining_parts.pop(0) - else: - parsed.branch = await _configure_branch_and_subpath(remaining_parts, url) - - # Subpath if anything left - if remaining_parts: - parsed.subpath += "/".join(remaining_parts) - - return parsed - - -async def _configure_branch_and_subpath(remaining_parts: List[str], url: str) -> Optional[str]: - """ - Configure the branch and subpath based on the remaining parts of the URL. - Parameters - ---------- - remaining_parts : List[str] - The remaining parts of the URL path. - url : str - The URL of the repository. - Returns - ------- - str, optional - The branch name if found, otherwise None. - - """ - try: - # Fetch the list of branches from the remote repository - branches: List[str] = await fetch_remote_branch_list(url) - except RuntimeError as exc: - warnings.warn(f"Warning: Failed to fetch branch list: {exc}", RuntimeWarning) - return remaining_parts.pop(0) - - branch = [] - while remaining_parts: - branch.append(remaining_parts.pop(0)) - branch_name = "/".join(branch) - if branch_name in branches: - return branch_name - - return None - - -def _parse_patterns(pattern: Union[str, Set[str]]) -> Set[str]: - """ - Parse and validate file/directory patterns for inclusion or exclusion. - - Takes either a single pattern string or set of pattern strings and processes them into a normalized list. - Patterns are split on commas and spaces, validated for allowed characters, and normalized. - - Parameters - ---------- - pattern : Set[str] | str - Pattern(s) to parse - either a single string or set of strings - - Returns - ------- - Set[str] - A set of normalized patterns. - - Raises - ------ - InvalidPatternError - If any pattern contains invalid characters. Only alphanumeric characters, - dash (-), underscore (_), dot (.), forward slash (/), plus (+), and - asterisk (*) are allowed. - """ - patterns = pattern if isinstance(pattern, set) else {pattern} - - parsed_patterns: Set[str] = set() - for p in patterns: - parsed_patterns = parsed_patterns.union(set(re.split(",| ", p))) - - # Remove empty string if present - parsed_patterns = parsed_patterns - {""} - - # Normalize Windows paths to Unix-style paths - parsed_patterns = {p.replace("\\", "/") for p in parsed_patterns} - - # Validate and normalize each pattern - for p in parsed_patterns: - if not _is_valid_pattern(p): - raise InvalidPatternError(p) - - return {_normalize_pattern(p) for p in parsed_patterns} - - -def _parse_local_dir_path(path_str: str) -> IngestionQuery: - """ - Parse the given file path into a structured query dictionary. - - Parameters - ---------- - path_str : str - The file path to parse. - - Returns - ------- - IngestionQuery - A dictionary containing the parsed details of the file path. - """ - path_obj = Path(path_str).resolve() - slug = path_obj.name if path_str == "." 
else path_str.strip("/")
-    return IngestionQuery(
-        user_name=None,
-        repo_name=None,
-        url=None,
-        local_path=path_obj,
-        slug=slug,
-        id=str(uuid.uuid4()),
-    )
-
-
-async def try_domains_for_user_and_repo(user_name: str, repo_name: str, token: Optional[str] = None) -> str:
-    """
-    Attempt to find a valid repository host for the given user_name and repo_name.
-
-    Parameters
-    ----------
-    user_name : str
-        The username or owner of the repository.
-    repo_name : str
-        The name of the repository.
-    token : str, optional
-        GitHub personal-access token (PAT). Needed when *source* refers to a
-        **private** repository. Can also be set via the ``GITHUB_TOKEN`` env var.
-
-    Returns
-    -------
-    str
-        The domain of the valid repository host.
-
-    Raises
-    ------
-    ValueError
-        If no valid repository host is found for the given user_name and repo_name.
-    """
-    for domain in KNOWN_GIT_HOSTS:
-        candidate = f"https://{domain}/{user_name}/{repo_name}"
-        if await check_repo_exists(candidate, token=token if domain == "github.com" else None):
-            return domain
-    raise ValueError(f"Could not find a valid repository host for '{user_name}/{repo_name}'.")
-
-
-
-================================================
-FILE: src/gitingest/schemas/__init__.py
-================================================
-"""This module contains the schemas for the Gitingest package."""
-
-from gitingest.schemas.filesystem_schema import FileSystemNode, FileSystemNodeType, FileSystemStats
-from gitingest.schemas.ingestion_schema import CloneConfig, IngestionQuery
-
-__all__ = ["FileSystemNode", "FileSystemNodeType", "FileSystemStats", "CloneConfig", "IngestionQuery"]
-
-
-
-================================================
-FILE: src/gitingest/schemas/filesystem_schema.py
-================================================
-"""Define the schema for the filesystem representation."""
-
-from __future__ import annotations
-
-import os
-from dataclasses import dataclass, field
-from enum import Enum, auto
-from pathlib import Path
-
-from gitingest.utils.file_utils import get_preferred_encodings, is_text_file
-from gitingest.utils.notebook_utils import process_notebook
-
-SEPARATOR = "=" * 48  # Tiktoken, the tokenizer openai uses, counts 2 tokens if we have more than 48
-
-
-class FileSystemNodeType(Enum):
-    """Enum representing the type of a file system node (directory or file)."""
-
-    DIRECTORY = auto()
-    FILE = auto()
-    SYMLINK = auto()
-
-
-@dataclass
-class FileSystemStats:
-    """Class for tracking statistics during file system traversal."""
-
-    visited: set[Path] = field(default_factory=set)
-    total_files: int = 0
-    total_size: int = 0
-
-
-@dataclass
-class FileSystemNode:  # pylint: disable=too-many-instance-attributes
-    """
-    Class representing a node in the file system (either a file or directory).
-
-    Tracks properties of files/directories for comprehensive analysis.
-    """
-
-    name: str
-    type: FileSystemNodeType
-    path_str: str
-    path: Path
-    size: int = 0
-    file_count: int = 0
-    dir_count: int = 0
-    depth: int = 0
-    children: list[FileSystemNode] = field(default_factory=list)
-
-    def sort_children(self) -> None:
-        """
-        Sort the children nodes of a directory according to a specific order.
-
-        Order of sorting:
-        1. README files first
-        2. Regular files (not starting with dot)
-        3. Hidden files (starting with dot)
-        4. Regular directories (not starting with dot)
-        5. Hidden directories (starting with dot)
-
-        All groups are sorted alphanumerically within themselves.
-
-        Raises
-        ------
-        ValueError
-            If the node is not a directory.
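-
-        Example
-        -------
-        An illustrative resulting order: ``["README.md", "main.py", ".env", "src/", ".github/"]``.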
- """ - if self.type != FileSystemNodeType.DIRECTORY: - raise ValueError("Cannot sort children of a non-directory node") - - def _sort_key(child: FileSystemNode) -> tuple[int, str]: - # returns the priority order for the sort function, 0 is first - # Groups: 0=README, 1=regular file, 2=hidden file, 3=regular dir, 4=hidden dir - name = child.name.lower() - if child.type == FileSystemNodeType.FILE: - if name == "readme.md": - return (0, name) - return (1 if not name.startswith(".") else 2, name) - return (3 if not name.startswith(".") else 4, name) - - self.children.sort(key=_sort_key) - - @property - def content_string(self) -> str: - """ - Return the content of the node as a string, including path and content. - - Returns - ------- - str - A string representation of the node's content. - """ - parts = [ - SEPARATOR, - f"{self.type.name}: {str(self.path_str).replace(os.sep, '/')}" - + (f" -> {self.path.readlink().name}" if self.type == FileSystemNodeType.SYMLINK else ""), - SEPARATOR, - f"{self.content}", - ] - - return "\n".join(parts) + "\n\n" - - @property - def content(self) -> str: # pylint: disable=too-many-return-statements - """ - Read the content of a file if it's text (or a notebook). Return an error message otherwise. - - Returns - ------- - str - The content of the file, or an error message if the file could not be read. - - Raises - ------ - ValueError - If the node is a directory. - """ - if self.type == FileSystemNodeType.DIRECTORY: - raise ValueError("Cannot read content of a directory node") - - if self.type == FileSystemNodeType.SYMLINK: - return "" - - if not is_text_file(self.path): - return "[Non-text file]" - - if self.path.suffix == ".ipynb": - try: - return process_notebook(self.path) - except Exception as exc: - return f"Error processing notebook: {exc}" - - # Try multiple encodings - for encoding in get_preferred_encodings(): - try: - with self.path.open(encoding=encoding) as f: - return f.read() - except UnicodeDecodeError: - continue - except UnicodeError: - continue - except OSError as exc: - return f"Error reading file: {exc}" - - return "Error: Unable to decode file with available encodings" - - - -================================================ -FILE: src/gitingest/schemas/ingestion_schema.py -================================================ -"""This module contains the dataclasses for the ingestion process.""" - -from dataclasses import dataclass -from pathlib import Path -from typing import Optional, Set - -from pydantic import BaseModel, ConfigDict, Field - -from gitingest.config import MAX_FILE_SIZE - - -@dataclass -class CloneConfig: - """ - Configuration for cloning a Git repository. - - This class holds the necessary parameters for cloning a repository to a local path, including - the repository's URL, the target local path, and optional parameters for a specific commit or branch. - - Attributes - ---------- - url : str - The URL of the Git repository to clone. - local_path : str - The local directory where the repository will be cloned. - commit : str, optional - The specific commit hash to check out after cloning (default is None). - branch : str, optional - The branch to clone (default is None). - subpath : str - The subpath to clone from the repository (default is "/"). - blob: bool - Whether the repository is a blob (default is False). 
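-
-    Example
-    -------
-    Illustrative values only::
-
-        CloneConfig(url="https://github.com/user/repo", local_path="/tmp/gitingest/<id>/user-repo")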
- """ - - url: str - local_path: str - commit: Optional[str] = None - branch: Optional[str] = None - subpath: str = "/" - blob: bool = False - - -class IngestionQuery(BaseModel): # pylint: disable=too-many-instance-attributes - """ - Pydantic model to store the parsed details of the repository or file path. - """ - - user_name: Optional[str] = None - repo_name: Optional[str] = None - local_path: Path - url: Optional[str] = None - slug: str - id: str - subpath: str = "/" - type: Optional[str] = None - branch: Optional[str] = None - commit: Optional[str] = None - max_file_size: int = Field(default=MAX_FILE_SIZE) - ignore_patterns: Optional[Set[str]] = None - include_patterns: Optional[Set[str]] = None - - model_config = ConfigDict(arbitrary_types_allowed=True) - - def extract_clone_config(self) -> CloneConfig: - """ - Extract the relevant fields for the CloneConfig object. - - Returns - ------- - CloneConfig - A CloneConfig object containing the relevant fields. - - Raises - ------ - ValueError - If the 'url' parameter is not provided. - """ - if not self.url: - raise ValueError("The 'url' parameter is required.") - - return CloneConfig( - url=self.url, - local_path=str(self.local_path), - commit=self.commit, - branch=self.branch, - subpath=self.subpath, - blob=self.type == "blob", - ) - - - -================================================ -FILE: src/gitingest/utils/__init__.py -================================================ - - - -================================================ -FILE: src/gitingest/utils/exceptions.py -================================================ -"""Custom exceptions for the Gitingest package.""" - - -class InvalidPatternError(ValueError): - """ - Exception raised when a pattern contains invalid characters. - This exception is used to signal that a pattern provided for some operation - contains characters that are not allowed. The valid characters for the pattern - include alphanumeric characters, dash (-), underscore (_), dot (.), forward slash (/), - plus (+), and asterisk (*). - Parameters - ---------- - pattern : str - The invalid pattern that caused the error. - """ - - def __init__(self, pattern: str) -> None: - super().__init__( - f"Pattern '{pattern}' contains invalid characters. Only alphanumeric characters, dash (-), " - "underscore (_), dot (.), forward slash (/), plus (+), and asterisk (*) are allowed." - ) - - -class AsyncTimeoutError(Exception): - """ - Exception raised when an async operation exceeds its timeout limit. - - This exception is used by the `async_timeout` decorator to signal that the wrapped - asynchronous function has exceeded the specified time limit for execution. - """ - - -class InvalidNotebookError(Exception): - """Exception raised when a Jupyter notebook is invalid or cannot be processed.""" - - def __init__(self, message: str) -> None: - super().__init__(message) - - -class InvalidGitHubTokenError(ValueError): - """Exception raised when a GitHub Personal Access Token is malformed.""" - - def __init__(self) -> None: - super().__init__( - "Invalid GitHub token format. Token should start with 'github_pat_' or 'ghp_' " - "followed by at least 36 characters of letters, numbers, and underscores." 
- ) - - - -================================================ -FILE: src/gitingest/utils/file_utils.py -================================================ -"""Utility functions for working with files and directories.""" - -import locale -import platform -from pathlib import Path -from typing import List - -try: - locale.setlocale(locale.LC_ALL, "") -except locale.Error: - locale.setlocale(locale.LC_ALL, "C") - - -def get_preferred_encodings() -> List[str]: - """ - Get list of encodings to try, prioritized for the current platform. - - Returns - ------- - List[str] - List of encoding names to try in priority order, starting with the - platform's default encoding followed by common fallback encodings. - """ - encodings = [locale.getpreferredencoding(), "utf-8", "utf-16", "utf-16le", "utf-8-sig", "latin"] - if platform.system() == "Windows": - encodings += ["cp1252", "iso-8859-1"] - return encodings - - -def is_text_file(path: Path) -> bool: - """ - Determine if the file is likely a text file by trying to decode a small chunk - with multiple encodings, and checking for common binary markers. - - Parameters - ---------- - path : Path - The path to the file to check. - - Returns - ------- - bool - True if the file is likely textual; False if it appears to be binary. - """ - - # Attempt to read a portion of the file in binary mode - try: - with path.open("rb") as f: - chunk = f.read(1024) - except OSError: - return False - - # If file is empty, treat as text - if not chunk: - return True - - # Check obvious binary bytes - if b"\x00" in chunk or b"\xff" in chunk: - return False - - # Attempt multiple encodings - for enc in get_preferred_encodings(): - try: - with path.open(encoding=enc) as f: - f.read() - return True - except UnicodeDecodeError: - continue - except UnicodeError: - continue - except OSError: - return False - - return False - - - -================================================ -FILE: src/gitingest/utils/git_utils.py -================================================ -"""Utility functions for interacting with Git repositories.""" - -import asyncio -import base64 -import re -from typing import List, Optional, Tuple - -from gitingest.utils.exceptions import InvalidGitHubTokenError - -GITHUB_PAT_PATTERN = r"^(?:github_pat_|ghp_)[A-Za-z0-9_]{36,}$" - - -async def run_command(*args: str) -> Tuple[bytes, bytes]: - """ - Execute a shell command asynchronously and return (stdout, stderr) bytes. - - Parameters - ---------- - *args : str - The command and its arguments to execute. - - Returns - ------- - Tuple[bytes, bytes] - A tuple containing the stdout and stderr of the command. - - Raises - ------ - RuntimeError - If command exits with a non-zero status. - """ - # Execute the requested command - proc = await asyncio.create_subprocess_exec( - *args, - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - ) - stdout, stderr = await proc.communicate() - if proc.returncode != 0: - error_message = stderr.decode().strip() - raise RuntimeError(f"Command failed: {' '.join(args)}\nError: {error_message}") - - return stdout, stderr - - -async def ensure_git_installed() -> None: - """ - Ensure Git is installed and accessible on the system. - - Raises - ------ - RuntimeError - If Git is not installed or not accessible. - """ - try: - await run_command("git", "--version") - except RuntimeError as exc: - raise RuntimeError("Git is not installed or not accessible. 
Please install Git first.") from exc - - -async def check_repo_exists(url: str, token: Optional[str] = None) -> bool: - """ - Check if a Git repository exists at the provided URL. - - Parameters - ---------- - url : str - The URL of the Git repository to check. - token : str, optional - GitHub personal-access token (PAT). Needed when *source* refers to a - **private** repository. Can also be set via the ``GITHUB_TOKEN`` env var. - - Returns - ------- - bool - True if the repository exists, False otherwise. - - Raises - ------ - RuntimeError - If the curl command returns an unexpected status code. - """ - if token and "github.com" in url: - return await _check_github_repo_exists(url, token) - - proc = await asyncio.create_subprocess_exec( - "curl", - "-I", - url, - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - ) - stdout, _ = await proc.communicate() - - if proc.returncode != 0: - return False # likely unreachable or private - - response = stdout.decode() - status_line = response.splitlines()[0].strip() - parts = status_line.split(" ") - if len(parts) >= 2: - status_code_str = parts[1] - if status_code_str in ("200", "301"): - return True - if status_code_str in ("302", "404"): - return False - raise RuntimeError(f"Unexpected status line: {status_line}") - - -async def _check_github_repo_exists(url: str, token: Optional[str] = None) -> bool: - """ - Return True iff the authenticated user can see `url`. - - Parameters - ---------- - url : str - The URL of the GitHub repository to check. - token : str, optional - GitHub personal-access token (PAT). Needed when *source* refers to a - **private** repository. Can also be set via the ``GITHUB_TOKEN`` env var. - - Returns - ------- - bool - True if the repository exists, False otherwise. - - Raises - ------ - ValueError - If the URL is not a valid GitHub repository URL. - RuntimeError - If the repository is not found, if the provided URL is invalid, or if the token format is invalid. - """ - m = re.match(r"https?://github\.com/([^/]+)/([^/]+?)(?:\.git)?/?$", url) - if not m: - raise ValueError(f"Un-recognised GitHub URL: {url!r}") - owner, repo = m.groups() - - api = f"https://api.github.com/repos/{owner}/{repo}" - cmd = [ - "curl", - "--silent", - "--location", - "--write-out", - "%{http_code}", - "-o", - "/dev/null", - "-H", - "Accept: application/vnd.github+json", - ] - if token: - cmd += ["-H", f"Authorization: Bearer {token}"] - cmd.append(api) - - proc = await asyncio.create_subprocess_exec( - *cmd, - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - ) - stdout, _ = await proc.communicate() - status = stdout.decode()[-3:] # just the %{http_code} - - if status == "200": - return True - if status == "404": - return False - if status in ("401", "403"): - raise RuntimeError("Token invalid or lacks permissions") - raise RuntimeError(f"GitHub API returned unexpected HTTP {status}") - - -async def fetch_remote_branch_list(url: str, token: Optional[str] = None) -> List[str]: - """ - Fetch the list of branches from a remote Git repository. - - Parameters - ---------- - url : str - The URL of the Git repository to fetch branches from. - token : str, optional - GitHub personal-access token (PAT). Needed when *source* refers to a - **private** repository. Can also be set via the ``GITHUB_TOKEN`` env var. - - Returns - ------- - List[str] - A list of branch names available in the remote repository. 
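
    Examples
    --------
    A sketch only; the call hits the network, so the URL and the returned
    branch names are illustrative::

        >>> import asyncio
        >>> asyncio.run(fetch_remote_branch_list("https://github.com/user/repo"))  # doctest: +SKIP
        ['main', 'dev']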
- """ - fetch_branches_command = ["git"] - - # Add authentication if needed - if token and "github.com" in url: - fetch_branches_command += ["-c", create_git_auth_header(token)] - - fetch_branches_command += ["ls-remote", "--heads", url] - - await ensure_git_installed() - stdout, _ = await run_command(*fetch_branches_command) - stdout_decoded = stdout.decode() - - return [ - line.split("refs/heads/", 1)[1] - for line in stdout_decoded.splitlines() - if line.strip() and "refs/heads/" in line - ] - - -def create_git_command(base_cmd: List[str], local_path: str, url: str, token: Optional[str] = None) -> List[str]: - """Create a git command with authentication if needed. - - Parameters - ---------- - base_cmd : List[str] - The base git command to start with - local_path : str - The local path where the git command should be executed - url : str - The repository URL to check if it's a GitHub repository - token : Optional[str] - GitHub personal access token for authentication - - Returns - ------- - List[str] - The git command with authentication if needed - """ - cmd = base_cmd + ["-C", local_path] - if token and url.startswith("https://github.com"): - validate_github_token(token) - cmd += ["-c", create_git_auth_header(token)] - return cmd - - -def create_git_auth_header(token: str) -> str: - """Create a Basic authentication header for GitHub git operations. - - Parameters - ---------- - token : str - GitHub personal access token - - Returns - ------- - str - The git config command for setting the authentication header - """ - basic = base64.b64encode(f"x-oauth-basic:{token}".encode()).decode() - return f"http.https://github.com/.extraheader=Authorization: Basic {basic}" - - -def validate_github_token(token: str) -> None: - """Validate the format of a GitHub Personal Access Token. 
- - Parameters - ---------- - token : str - The GitHub token to validate - - Raises - ------ - InvalidGitHubTokenError - If the token format is invalid - """ - if not re.match(GITHUB_PAT_PATTERN, token): - raise InvalidGitHubTokenError() - - - -================================================ -FILE: src/gitingest/utils/ignore_patterns.py -================================================ -"""Default ignore patterns for Gitingest.""" - -import os -from pathlib import Path -from typing import Set - -DEFAULT_IGNORE_PATTERNS: Set[str] = { - # Python - "*.pyc", - "*.pyo", - "*.pyd", - "__pycache__", - ".pytest_cache", - ".coverage", - ".tox", - ".nox", - ".mypy_cache", - ".ruff_cache", - ".hypothesis", - "poetry.lock", - "Pipfile.lock", - # JavaScript/FileSystemNode - "node_modules", - "bower_components", - "package-lock.json", - "yarn.lock", - ".npm", - ".yarn", - ".pnpm-store", - "bun.lock", - "bun.lockb", - # Java - "*.class", - "*.jar", - "*.war", - "*.ear", - "*.nar", - ".gradle/", - "build/", - ".settings/", - ".classpath", - "gradle-app.setting", - "*.gradle", - # IDEs and editors / Java - ".project", - # C/C++ - "*.o", - "*.obj", - "*.dll", - "*.dylib", - "*.exe", - "*.lib", - "*.out", - "*.a", - "*.pdb", - # Swift/Xcode - ".build/", - "*.xcodeproj/", - "*.xcworkspace/", - "*.pbxuser", - "*.mode1v3", - "*.mode2v3", - "*.perspectivev3", - "*.xcuserstate", - "xcuserdata/", - ".swiftpm/", - # Ruby - "*.gem", - ".bundle/", - "vendor/bundle", - "Gemfile.lock", - ".ruby-version", - ".ruby-gemset", - ".rvmrc", - # Rust - "Cargo.lock", - "**/*.rs.bk", - # Java / Rust - "target/", - # Go - "pkg/", - # .NET/C# - "obj/", - "*.suo", - "*.user", - "*.userosscache", - "*.sln.docstates", - "packages/", - "*.nupkg", - # Go / .NET / C# - "bin/", - # Version control - ".git", - ".svn", - ".hg", - ".gitignore", - ".gitattributes", - ".gitmodules", - # Images and media - "*.svg", - "*.png", - "*.jpg", - "*.jpeg", - "*.gif", - "*.ico", - "*.pdf", - "*.mov", - "*.mp4", - "*.mp3", - "*.wav", - # Virtual environments - "venv", - ".venv", - "env", - ".env", - "virtualenv", - # IDEs and editors - ".idea", - ".vscode", - ".vs", - "*.swo", - "*.swn", - ".settings", - "*.sublime-*", - # Temporary and cache files - "*.log", - "*.bak", - "*.swp", - "*.tmp", - "*.temp", - ".cache", - ".sass-cache", - ".eslintcache", - ".DS_Store", - "Thumbs.db", - "desktop.ini", - # Build directories and artifacts - "build", - "dist", - "target", - "out", - "*.egg-info", - "*.egg", - "*.whl", - "*.so", - # Documentation - "site-packages", - ".docusaurus", - ".next", - ".nuxt", - # Other common patterns - ## Minified files - "*.min.js", - "*.min.css", - ## Source maps - "*.map", - ## Terraform - ".terraform", - "*.tfstate*", - ## Dependencies in various languages - "vendor/", - # Gitingest - "digest.txt", -} - - -def load_gitignore_patterns(root: Path) -> Set[str]: - """ - Recursively load ignore patterns from all .gitignore files under the given root directory. - - Parameters - ---------- - root : Path - The root directory to search for .gitignore files. - - Returns - ------- - Set[str] - A set of ignore patterns extracted from all .gitignore files found under the root directory. 
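
    Examples
    --------
    A minimal sketch using a throwaway directory::

        >>> import tempfile
        >>> root = Path(tempfile.mkdtemp())
        >>> _ = (root / ".gitignore").write_text("*.log")
        >>> load_gitignore_patterns(root)
        {'*.log'}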
- """ - patterns: Set[str] = set() - for dirpath, _, filenames in os.walk(root): - if ".gitignore" not in filenames: - continue - - gitignore_path = Path(dirpath) / ".gitignore" - with gitignore_path.open("r", encoding="utf-8") as f: - for line in f: - stripped = line.strip() - - if not stripped or stripped.startswith("#"): - continue - - negated = stripped.startswith("!") - if negated: - stripped = stripped[1:] - - rel_dir = os.path.relpath(dirpath, root) - if stripped.startswith("/"): - pattern_body = os.path.join(rel_dir, stripped.lstrip("/")) - else: - pattern_body = os.path.join(rel_dir, stripped) if rel_dir != "." else stripped - - pattern_body = pattern_body.replace("\\", "/") - pattern = f"!{pattern_body}" if negated else pattern_body - patterns.add(pattern) - - return patterns - - - -================================================ -FILE: src/gitingest/utils/ingestion_utils.py -================================================ -"""Utility functions for the ingestion process.""" - -from pathlib import Path -from typing import Set - -from pathspec import PathSpec - - -def _should_include(path: Path, base_path: Path, include_patterns: Set[str]) -> bool: - """ - Determine if the given file or directory path matches any of the include patterns. - - This function checks whether the relative path of a file or directory matches any of the specified patterns. If a - match is found, it returns `True`, indicating that the file or directory should be included in further processing. - - Parameters - ---------- - path : Path - The absolute path of the file or directory to check. - base_path : Path - The base directory from which the relative path is calculated. - include_patterns : Set[str] - A set of patterns to check against the relative path. - - Returns - ------- - bool - `True` if the path matches any of the include patterns, `False` otherwise. - """ - try: - rel_path = path.relative_to(base_path) - except ValueError: - # If path is not under base_path at all - return False - - rel_str = str(rel_path) - - # if path is a directory, include it by default - if path.is_dir(): - return True - - spec = PathSpec.from_lines("gitwildmatch", include_patterns) - return spec.match_file(rel_str) - - -def _should_exclude(path: Path, base_path: Path, ignore_patterns: Set[str]) -> bool: - """ - Determine if the given file or directory path matches any of the ignore patterns. - - This function checks whether the relative path of a file or directory matches - any of the specified ignore patterns. If a match is found, it returns `True`, indicating - that the file or directory should be excluded from further processing. - - Parameters - ---------- - path : Path - The absolute path of the file or directory to check. - base_path : Path - The base directory from which the relative path is calculated. - ignore_patterns : Set[str] - A set of patterns to check against the relative path. - - Returns - ------- - bool - `True` if the path matches any of the ignore patterns, `False` otherwise. 
- """ - try: - rel_path = path.relative_to(base_path) - except ValueError: - # If path is not under base_path at all - return True - - rel_str = str(rel_path) - spec = PathSpec.from_lines("gitwildmatch", ignore_patterns) - return spec.match_file(rel_str) - - - -================================================ -FILE: src/gitingest/utils/notebook_utils.py -================================================ -"""Utilities for processing Jupyter notebooks.""" - -import json -import warnings -from itertools import chain -from pathlib import Path -from typing import Any, Dict, List, Optional - -from gitingest.utils.exceptions import InvalidNotebookError - - -def process_notebook(file: Path, include_output: bool = True) -> str: - """ - Process a Jupyter notebook file and return an executable Python script as a string. - - Parameters - ---------- - file : Path - The path to the Jupyter notebook file. - include_output : bool - Whether to include cell outputs in the generated script, by default True. - - Returns - ------- - str - The executable Python script as a string. - - Raises - ------ - InvalidNotebookError - If the notebook file is invalid or cannot be processed. - """ - try: - with file.open(encoding="utf-8") as f: - notebook: Dict[str, Any] = json.load(f) - except json.JSONDecodeError as exc: - raise InvalidNotebookError(f"Invalid JSON in notebook: {file}") from exc - - # Check if the notebook contains worksheets - worksheets = notebook.get("worksheets") - if worksheets: - warnings.warn( - "Worksheets are deprecated as of IPEP-17. Consider updating the notebook. " - "(See: https://github.com/jupyter/nbformat and " - "https://github.com/ipython/ipython/wiki/IPEP-17:-Notebook-Format-4#remove-multiple-worksheets " - "for more information.)", - DeprecationWarning, - ) - - if len(worksheets) > 1: - warnings.warn("Multiple worksheets detected. Combining all worksheets into a single script.", UserWarning) - - cells = list(chain.from_iterable(ws["cells"] for ws in worksheets)) - - else: - cells = notebook["cells"] - - result = ["# Jupyter notebook converted to Python script."] - - for cell in cells: - cell_str = _process_cell(cell, include_output=include_output) - if cell_str: - result.append(cell_str) - - return "\n\n".join(result) + "\n" - - -def _process_cell(cell: Dict[str, Any], include_output: bool) -> Optional[str]: - """ - Process a Jupyter notebook cell and return the cell content as a string. - - Parameters - ---------- - cell : Dict[str, Any] - The cell dictionary from a Jupyter notebook. - include_output : bool - Whether to include cell outputs in the generated script - - Returns - ------- - str, optional - The cell content as a string, or None if the cell is empty. - - Raises - ------ - ValueError - If an unexpected cell type is encountered. 
- """ - cell_type = cell["cell_type"] - - # Validate cell type and handle unexpected types - if cell_type not in ("markdown", "code", "raw"): - raise ValueError(f"Unknown cell type: {cell_type}") - - cell_str = "".join(cell["source"]) - - # Skip empty cells - if not cell_str: - return None - - # Convert Markdown and raw cells to multi-line comments - if cell_type in ("markdown", "raw"): - return f'"""\n{cell_str}\n"""' - - # Add cell output as comments - outputs = cell.get("outputs") - if include_output and outputs: - - # Include cell outputs as comments - output_lines = [] - - for output in outputs: - output_lines += _extract_output(output) - - for output_line in output_lines: - if not output_line.endswith("\n"): - output_line += "\n" - - cell_str += "\n# Output:\n# " + "\n# ".join(output_lines) - - return cell_str - - -def _extract_output(output: Dict[str, Any]) -> List[str]: - """ - Extract the output from a Jupyter notebook cell. - - Parameters - ---------- - output : Dict[str, Any] - The output dictionary from a Jupyter notebook cell. - - Returns - ------- - List[str] - The output as a list of strings. - - Raises - ------ - ValueError - If an unknown output type is encountered. - """ - output_type = output["output_type"] - - if output_type == "stream": - return output["text"] - - if output_type in ("execute_result", "display_data"): - return output["data"]["text/plain"] - - if output_type == "error": - return [f"Error: {output['ename']}: {output['evalue']}"] - - raise ValueError(f"Unknown output type: {output_type}") - - - -================================================ -FILE: src/gitingest/utils/os_utils.py -================================================ -"""Utility functions for working with the operating system.""" - -import os -from pathlib import Path - - -async def ensure_directory(path: Path) -> None: - """ - Ensure the directory exists, creating it if necessary. - - Parameters - ---------- - path : Path - The path to ensure exists - - Raises - ------ - OSError - If the directory cannot be created - """ - try: - os.makedirs(path, exist_ok=True) - except OSError as exc: - raise OSError(f"Failed to create directory {path}: {exc}") from exc - - - -================================================ -FILE: src/gitingest/utils/path_utils.py -================================================ -"""Utility functions for working with file paths.""" - -import os -import platform -from pathlib import Path - - -def _is_safe_symlink(symlink_path: Path, base_path: Path) -> bool: - """ - Check if a symlink points to a location within the base directory. - - This function resolves the target of a symlink and ensures it is within the specified - base directory, returning `True` if it is safe, or `False` if the symlink points outside - the base directory. - - Parameters - ---------- - symlink_path : Path - The path of the symlink to check. - base_path : Path - The base directory to ensure the symlink points within. - - Returns - ------- - bool - `True` if the symlink points within the base directory, `False` otherwise. 
- """ - try: - if platform.system() == "Windows": - if not os.path.islink(str(symlink_path)): - return False - - target_path = symlink_path.resolve() - base_resolved = base_path.resolve() - - return base_resolved in target_path.parents or target_path == base_resolved - except (OSError, ValueError): - # If there's any error resolving the paths, consider it unsafe - return False - - - -================================================ -FILE: src/gitingest/utils/query_parser_utils.py -================================================ -"""Utility functions for parsing and validating query parameters.""" - -import os -import string -from typing import List, Set, Tuple - -HEX_DIGITS: Set[str] = set(string.hexdigits) - - -KNOWN_GIT_HOSTS: List[str] = [ - "github.com", - "gitlab.com", - "bitbucket.org", - "gitea.com", - "codeberg.org", - "gist.github.com", -] - - -def _is_valid_git_commit_hash(commit: str) -> bool: - """ - Validate if the provided string is a valid Git commit hash. - - This function checks if the commit hash is a 40-character string consisting only - of hexadecimal digits, which is the standard format for Git commit hashes. - - Parameters - ---------- - commit : str - The string to validate as a Git commit hash. - - Returns - ------- - bool - True if the string is a valid 40-character Git commit hash, otherwise False. - """ - return len(commit) == 40 and all(c in HEX_DIGITS for c in commit) - - -def _is_valid_pattern(pattern: str) -> bool: - """ - Validate if the given pattern contains only valid characters. - - This function checks if the pattern contains only alphanumeric characters or one - of the following allowed characters: dash (`-`), underscore (`_`), dot (`.`), - forward slash (`/`), plus (`+`), asterisk (`*`), or the at sign (`@`). - - Parameters - ---------- - pattern : str - The pattern to validate. - - Returns - ------- - bool - True if the pattern is valid, otherwise False. - """ - return all(c.isalnum() or c in "-_./+*@" for c in pattern) - - -def _validate_host(host: str) -> None: - """ - Validate a hostname. - - The host is accepted if it is either present in the hard-coded `KNOWN_GIT_HOSTS` list or if it satisfies the - simple heuristics in `_looks_like_git_host`, which try to recognise common self-hosted Git services (e.g. GitLab - instances on sub-domains such as `gitlab.example.com` or `git.example.com`). - - Parameters - ---------- - host : str - Hostname (case-insensitive). - - Raises - ------ - ValueError - If the host cannot be recognised as a probable Git hosting domain. - """ - host = host.lower() - if host not in KNOWN_GIT_HOSTS and not _looks_like_git_host(host): - raise ValueError(f"Unknown domain '{host}' in URL") - - -def _looks_like_git_host(host: str) -> bool: - """ - Check if the given host looks like a Git host. - - The current heuristic returns `True` when the host starts with `git.` (e.g. `git.example.com`) or starts with - `gitlab.` (e.g. `gitlab.company.com`). - - Parameters - ---------- - host : str - Hostname (case-insensitive). - - Returns - ------- - bool - True if the host looks like a Git host, otherwise False. - """ - host = host.lower() - return host.startswith(("git.", "gitlab.")) - - -def _validate_url_scheme(scheme: str) -> None: - """ - Validate the given scheme against the known schemes. - - Parameters - ---------- - scheme : str - The scheme to validate. - - Raises - ------ - ValueError - If the scheme is not 'http' or 'https'. 
- """ - scheme = scheme.lower() - if scheme not in ("https", "http"): - raise ValueError(f"Invalid URL scheme '{scheme}' in URL") - - -def _get_user_and_repo_from_path(path: str) -> Tuple[str, str]: - """ - Extract the user and repository names from a given path. - - Parameters - ---------- - path : str - The path to extract the user and repository names from. - - Returns - ------- - Tuple[str, str] - A tuple containing the user and repository names. - - Raises - ------ - ValueError - If the path does not contain at least two parts. - """ - path_parts = path.lower().strip("/").split("/") - if len(path_parts) < 2: - raise ValueError(f"Invalid repository URL '{path}'") - return path_parts[0], path_parts[1] - - -def _normalize_pattern(pattern: str) -> str: - """ - Normalize the given pattern by removing leading separators and appending a wildcard. - - This function processes the pattern string by stripping leading directory separators - and appending a wildcard (`*`) if the pattern ends with a separator. - - Parameters - ---------- - pattern : str - The pattern to normalize. - - Returns - ------- - str - The normalized pattern. - """ - pattern = pattern.lstrip(os.sep) - if pattern.endswith(os.sep): - pattern += "*" - return pattern - - - -================================================ -FILE: src/gitingest/utils/timeout_wrapper.py -================================================ -"""Utility functions for the Gitingest package.""" - -import asyncio -import functools -from typing import Any, Awaitable, Callable, TypeVar - -from gitingest.utils.exceptions import AsyncTimeoutError - -T = TypeVar("T") - - -def async_timeout(seconds) -> Callable[[Callable[..., Awaitable[T]]], Callable[..., Awaitable[T]]]: - """ - Async Timeout decorator. - - This decorator wraps an asynchronous function and ensures it does not run for - longer than the specified number of seconds. If the function execution exceeds - this limit, it raises an `AsyncTimeoutError`. - - Parameters - ---------- - seconds : int - The maximum allowed time (in seconds) for the asynchronous function to complete. - - Returns - ------- - Callable[[Callable[..., Awaitable[T]]], Callable[..., Awaitable[T]]] - A decorator that, when applied to an async function, ensures the function - completes within the specified time limit. If the function takes too long, - an `AsyncTimeoutError` is raised. 
- """ - - def decorator(func: Callable[..., Awaitable[T]]) -> Callable[..., Awaitable[T]]: - @functools.wraps(func) - async def wrapper(*args: Any, **kwargs: Any) -> T: - try: - return await asyncio.wait_for(func(*args, **kwargs), timeout=seconds) - except asyncio.TimeoutError as exc: - raise AsyncTimeoutError(f"Operation timed out after {seconds} seconds") from exc - - return wrapper - - return decorator - - - -================================================ -FILE: src/server/__init__.py -================================================ - - - -================================================ -FILE: src/server/main.py -================================================ -"""Main module for the FastAPI application.""" - -import os -from pathlib import Path -from typing import Dict - -from dotenv import load_dotenv -from fastapi import FastAPI, Request -from fastapi.responses import FileResponse, HTMLResponse -from fastapi.staticfiles import StaticFiles -from slowapi.errors import RateLimitExceeded -from starlette.middleware.trustedhost import TrustedHostMiddleware - -from server.routers import download, dynamic, index -from server.server_config import templates -from server.server_utils import lifespan, limiter, rate_limit_exception_handler - -# Load environment variables from .env file -load_dotenv() - -# Initialize the FastAPI application with lifespan -app = FastAPI(lifespan=lifespan) -app.state.limiter = limiter - -# Register the custom exception handler for rate limits -app.add_exception_handler(RateLimitExceeded, rate_limit_exception_handler) - - -# Mount static files dynamically to serve CSS, JS, and other static assets -static_dir = Path(__file__).parent.parent / "static" -app.mount("/static", StaticFiles(directory=static_dir), name="static") - - -# Fetch allowed hosts from the environment or use the default values -allowed_hosts = os.getenv("ALLOWED_HOSTS") -if allowed_hosts: - allowed_hosts = allowed_hosts.split(",") -else: - # Define the default allowed hosts for the application - default_allowed_hosts = ["gitingest.com", "*.gitingest.com", "localhost", "127.0.0.1"] - allowed_hosts = default_allowed_hosts - -# Add middleware to enforce allowed hosts -app.add_middleware(TrustedHostMiddleware, allowed_hosts=allowed_hosts) - - -@app.get("/health") -async def health_check() -> Dict[str, str]: - """ - Health check endpoint to verify that the server is running. - - Returns - ------- - Dict[str, str] - A JSON object with a "status" key indicating the server's health status. - """ - return {"status": "healthy"} - - -@app.head("/") -async def head_root() -> HTMLResponse: - """ - Respond to HTTP HEAD requests for the root URL. - - Mirrors the headers and status code of the index page. - - Returns - ------- - HTMLResponse - An empty HTML response with appropriate headers. - """ - return HTMLResponse(content=None, headers={"content-type": "text/html; charset=utf-8"}) - - -@app.get("/api/", response_class=HTMLResponse) -@app.get("/api", response_class=HTMLResponse) -async def api_docs(request: Request) -> HTMLResponse: - """ - Render the API documentation page. - - Parameters - ---------- - request : Request - The incoming HTTP request. - - Returns - ------- - HTMLResponse - A rendered HTML page displaying API documentation. - """ - return templates.TemplateResponse("api.jinja", {"request": request}) - - -@app.get("/robots.txt") -async def robots() -> FileResponse: - """ - Serve the `robots.txt` file to guide search engine crawlers. 
- - Returns - ------- - FileResponse - The `robots.txt` file located in the static directory. - """ - return FileResponse("static/robots.txt") - - -@app.get("/llm.txt") -async def llm_txt() -> FileResponse: - """ - Serve the `llm.txt` file to provide information about the site to LLMs. - - Returns - ------- - FileResponse - The `llm.txt` file located in the static directory. - """ - return FileResponse("static/llm.txt") - - -# Include routers for modular endpoints -app.include_router(index) -app.include_router(download) -app.include_router(dynamic) - - - -================================================ -FILE: src/server/query_processor.py -================================================ -"""Process a query by parsing input, cloning a repository, and generating a summary.""" - -from functools import partial -from typing import Optional - -from fastapi import Request -from starlette.templating import _TemplateResponse - -from gitingest.cloning import clone_repo -from gitingest.ingestion import ingest_query -from gitingest.query_parsing import IngestionQuery, parse_query -from server.server_config import EXAMPLE_REPOS, MAX_DISPLAY_SIZE, templates -from server.server_utils import Colors, log_slider_to_size - - -async def process_query( - request: Request, - input_text: str, - slider_position: int, - pattern_type: str = "exclude", - pattern: str = "", - is_index: bool = False, - token: Optional[str] = None, -) -> _TemplateResponse: - """ - Process a query by parsing input, cloning a repository, and generating a summary. - - Handle user input, process Git repository data, and prepare - a response for rendering a template with the processed results or an error message. - - Parameters - ---------- - request : Request - The HTTP request object. - input_text : str - Input text provided by the user, typically a Git repository URL or slug. - slider_position : int - Position of the slider, representing the maximum file size in the query. - pattern_type : str - Type of pattern to use, either "include" or "exclude" (default is "exclude"). - pattern : str - Pattern to include or exclude in the query, depending on the pattern type. - is_index : bool - Flag indicating whether the request is for the index page (default is False). - token : str, optional - GitHub personal-access token (PAT). Needed when *input_text* refers to a - **private** repository. - - Returns - ------- - _TemplateResponse - Rendered template response containing the processed results or an error message. - - Raises - ------ - ValueError - If an invalid pattern type is provided. 
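
    Examples
    --------
    Invoked from a route handler; the ``request`` object and the arguments
    shown are illustrative::

        >>> response = await process_query(  # doctest: +SKIP
        ...     request, input_text="https://github.com/user/repo", slider_position=243
        ... )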
- """ - if pattern_type == "include": - include_patterns = pattern - exclude_patterns = None - elif pattern_type == "exclude": - exclude_patterns = pattern - include_patterns = None - else: - raise ValueError(f"Invalid pattern type: {pattern_type}") - - template = "index.jinja" if is_index else "git.jinja" - template_response = partial(templates.TemplateResponse, name=template) - max_file_size = log_slider_to_size(slider_position) - - context = { - "request": request, - "repo_url": input_text, - "examples": EXAMPLE_REPOS if is_index else [], - "default_file_size": slider_position, - "pattern_type": pattern_type, - "pattern": pattern, - "token": token, - } - - try: - query: IngestionQuery = await parse_query( - source=input_text, - max_file_size=max_file_size, - from_web=True, - include_patterns=include_patterns, - ignore_patterns=exclude_patterns, - token=token, - ) - if not query.url: - raise ValueError("The 'url' parameter is required.") - - # Sets the "/" for the page title - context["short_repo_url"] = f"{query.user_name}/{query.repo_name}" - - clone_config = query.extract_clone_config() - await clone_repo(clone_config, token=token) - summary, tree, content = ingest_query(query) - with open(f"{clone_config.local_path}.txt", "w", encoding="utf-8") as f: - f.write(tree + "\n" + content) - except Exception as exc: - # hack to print error message when query is not defined - if "query" in locals() and query is not None and isinstance(query, dict): - _print_error(query["url"], exc, max_file_size, pattern_type, pattern) - else: - print(f"{Colors.BROWN}WARN{Colors.END}: {Colors.RED}<- {Colors.END}", end="") - print(f"{Colors.RED}{exc}{Colors.END}") - - context["error_message"] = f"Error: {exc}" - if "405" in str(exc): - context["error_message"] = ( - "Repository not found. Please make sure it is public (private repositories will be supported soon)" - ) - return template_response(context=context) - - if len(content) > MAX_DISPLAY_SIZE: - content = ( - f"(Files content cropped to {int(MAX_DISPLAY_SIZE / 1_000)}k characters, " - "download full ingest to see more)\n" + content[:MAX_DISPLAY_SIZE] - ) - - _print_success( - url=query.url, - max_file_size=max_file_size, - pattern_type=pattern_type, - pattern=pattern, - summary=summary, - ) - - context.update( - { - "result": True, - "summary": summary, - "tree": tree, - "content": content, - "ingest_id": query.id, - } - ) - - return template_response(context=context) - - -def _print_query(url: str, max_file_size: int, pattern_type: str, pattern: str) -> None: - """ - Print a formatted summary of the query details, including the URL, file size, - and pattern information, for easier debugging or logging. - - Parameters - ---------- - url : str - The URL associated with the query. - max_file_size : int - The maximum file size allowed for the query, in bytes. - pattern_type : str - Specifies the type of pattern to use, either "include" or "exclude". - pattern : str - The actual pattern string to include or exclude in the query. 
- """ - print(f"{Colors.WHITE}{url:<20}{Colors.END}", end="") - if int(max_file_size / 1024) != 50: - print(f" | {Colors.YELLOW}Size: {int(max_file_size/1024)}kb{Colors.END}", end="") - if pattern_type == "include" and pattern != "": - print(f" | {Colors.YELLOW}Include {pattern}{Colors.END}", end="") - elif pattern_type == "exclude" and pattern != "": - print(f" | {Colors.YELLOW}Exclude {pattern}{Colors.END}", end="") - - -def _print_error(url: str, e: Exception, max_file_size: int, pattern_type: str, pattern: str) -> None: - """ - Print a formatted error message including the URL, file size, pattern details, and the exception encountered, - for debugging or logging purposes. - - Parameters - ---------- - url : str - The URL associated with the query that caused the error. - e : Exception - The exception raised during the query or process. - max_file_size : int - The maximum file size allowed for the query, in bytes. - pattern_type : str - Specifies the type of pattern to use, either "include" or "exclude". - pattern : str - The actual pattern string to include or exclude in the query. - """ - print(f"{Colors.BROWN}WARN{Colors.END}: {Colors.RED}<- {Colors.END}", end="") - _print_query(url, max_file_size, pattern_type, pattern) - print(f" | {Colors.RED}{e}{Colors.END}") - - -def _print_success(url: str, max_file_size: int, pattern_type: str, pattern: str, summary: str) -> None: - """ - Print a formatted success message, including the URL, file size, pattern details, and a summary with estimated - tokens, for debugging or logging purposes. - - Parameters - ---------- - url : str - The URL associated with the successful query. - max_file_size : int - The maximum file size allowed for the query, in bytes. - pattern_type : str - Specifies the type of pattern to use, either "include" or "exclude". - pattern : str - The actual pattern string to include or exclude in the query. - summary : str - A summary of the query result, including details like estimated tokens. 
- """ - estimated_tokens = summary[summary.index("Estimated tokens:") + len("Estimated ") :] - print(f"{Colors.GREEN}INFO{Colors.END}: {Colors.GREEN}<- {Colors.END}", end="") - _print_query(url, max_file_size, pattern_type, pattern) - print(f" | {Colors.PURPLE}{estimated_tokens}{Colors.END}") - - - -================================================ -FILE: src/server/server_config.py -================================================ -"""Configuration for the server.""" - -from typing import Dict, List - -from fastapi.templating import Jinja2Templates - -MAX_DISPLAY_SIZE: int = 300_000 -DELETE_REPO_AFTER: int = 60 * 60 # In seconds - - -EXAMPLE_REPOS: List[Dict[str, str]] = [ - {"name": "Gitingest", "url": "https://github.com/cyclotruc/gitingest"}, - {"name": "FastAPI", "url": "https://github.com/tiangolo/fastapi"}, - {"name": "Flask", "url": "https://github.com/pallets/flask"}, - {"name": "Excalidraw", "url": "https://github.com/excalidraw/excalidraw"}, - {"name": "ApiAnalytics", "url": "https://github.com/tom-draper/api-analytics"}, -] - -templates = Jinja2Templates(directory="server/templates") - - - -================================================ -FILE: src/server/server_utils.py -================================================ -"""Utility functions for the server.""" - -import asyncio -import math -import shutil -import time -from contextlib import asynccontextmanager -from pathlib import Path - -from fastapi import FastAPI, Request -from fastapi.responses import Response -from slowapi import Limiter, _rate_limit_exceeded_handler -from slowapi.errors import RateLimitExceeded -from slowapi.util import get_remote_address - -from gitingest.config import TMP_BASE_PATH -from server.server_config import DELETE_REPO_AFTER - -# Initialize a rate limiter -limiter = Limiter(key_func=get_remote_address) - - -async def rate_limit_exception_handler(request: Request, exc: Exception) -> Response: - """ - Custom exception handler for rate-limiting errors. - - Parameters - ---------- - request : Request - The incoming HTTP request. - exc : Exception - The exception raised, expected to be RateLimitExceeded. - - Returns - ------- - Response - A response indicating that the rate limit has been exceeded. - - Raises - ------ - exc - If the exception is not a RateLimitExceeded error, it is re-raised. - """ - if isinstance(exc, RateLimitExceeded): - # Delegate to the default rate limit handler - return _rate_limit_exceeded_handler(request, exc) - # Re-raise other exceptions - raise exc - - -@asynccontextmanager -async def lifespan(_: FastAPI): - """ - Lifecycle manager for handling startup and shutdown events for the FastAPI application. - - Parameters - ---------- - _ : FastAPI - The FastAPI application instance (unused). - - Yields - ------- - None - Yields control back to the FastAPI application while the background task runs. - """ - task = asyncio.create_task(_remove_old_repositories()) - - yield - # Cancel the background task on shutdown - task.cancel() - try: - await task - except asyncio.CancelledError: - pass - - -async def _remove_old_repositories(): - """ - Periodically remove old repository folders. - - Background task that runs periodically to clean up old repository directories. 

    This task:
    - Scans the TMP_BASE_PATH directory every 60 seconds
    - Removes directories older than DELETE_REPO_AFTER seconds
    - Before deletion, logs repository URLs to history.txt if a matching .txt file exists
    - Handles errors gracefully if deletion fails

    The repository URL is extracted from the first .txt file in each directory,
    assuming the filename format: "owner-repository.txt"
    """
    while True:
        try:
            if not TMP_BASE_PATH.exists():
                await asyncio.sleep(60)
                continue

            current_time = time.time()

            for folder in TMP_BASE_PATH.iterdir():
                # Skip if folder is not old enough
                if current_time - folder.stat().st_ctime <= DELETE_REPO_AFTER:
                    continue

                await _process_folder(folder)

        except Exception as exc:
            print(f"Error in _remove_old_repositories: {exc}")

        await asyncio.sleep(60)


async def _process_folder(folder: Path) -> None:
    """
    Process a single folder for deletion and logging.

    Parameters
    ----------
    folder : Path
        The path to the folder to be processed.
    """
    # Try to log the repository URL before deletion
    try:
        txt_files = [f for f in folder.iterdir() if f.suffix == ".txt"]

        # Extract the owner and repository name from the filename ("owner-repository.txt");
        # guard against folders without any .txt file before indexing into the list
        if txt_files and "-" in txt_files[0].stem:
            owner, repo = txt_files[0].stem.split("-", 1)
            repo_url = f"{owner}/{repo}"

            with open("history.txt", mode="a", encoding="utf-8") as history:
                history.write(f"{repo_url}\n")

    except Exception as exc:
        print(f"Error logging repository URL for {folder}: {exc}")

    # Delete the folder
    try:
        shutil.rmtree(folder)
    except Exception as exc:
        print(f"Error deleting {folder}: {exc}")


def log_slider_to_size(position: int) -> int:
    """
    Convert a slider position to a file size in bytes using a logarithmic scale.

    Parameters
    ----------
    position : int
        Slider position ranging from 0 to 500.

    Returns
    -------
    int
        File size in bytes corresponding to the slider position.
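
    Examples
    --------
    The scale runs from 1 KiB at position 0 up to 100 MiB at position 500::

        >>> log_slider_to_size(0)
        1024
        >>> log_slider_to_size(500)
        104857600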
- """ - maxp = 500 - minv = math.log(1) - maxv = math.log(102_400) - return round(math.exp(minv + (maxv - minv) * pow(position / maxp, 1.5))) * 1024 - - -## Color printing utility -class Colors: - """ANSI color codes""" - - BLACK = "\033[0;30m" - RED = "\033[0;31m" - GREEN = "\033[0;32m" - BROWN = "\033[0;33m" - BLUE = "\033[0;34m" - PURPLE = "\033[0;35m" - CYAN = "\033[0;36m" - LIGHT_GRAY = "\033[0;37m" - DARK_GRAY = "\033[1;30m" - LIGHT_RED = "\033[1;31m" - LIGHT_GREEN = "\033[1;32m" - YELLOW = "\033[1;33m" - LIGHT_BLUE = "\033[1;34m" - LIGHT_PURPLE = "\033[1;35m" - LIGHT_CYAN = "\033[1;36m" - WHITE = "\033[1;37m" - BOLD = "\033[1m" - FAINT = "\033[2m" - ITALIC = "\033[3m" - UNDERLINE = "\033[4m" - BLINK = "\033[5m" - NEGATIVE = "\033[7m" - CROSSED = "\033[9m" - END = "\033[0m" - - - -================================================ -FILE: src/server/routers/__init__.py -================================================ -"""This module contains the routers for the FastAPI application.""" - -from server.routers.download import router as download -from server.routers.dynamic import router as dynamic -from server.routers.index import router as index - -__all__ = ["download", "dynamic", "index"] - - - -================================================ -FILE: src/server/routers/download.py -================================================ -"""This module contains the FastAPI router for downloading a digest file.""" - -from fastapi import APIRouter, HTTPException -from fastapi.responses import Response - -from gitingest.config import TMP_BASE_PATH - -router = APIRouter() - - -@router.get("/download/{digest_id}") -async def download_ingest(digest_id: str) -> Response: - """ - Download a .txt file associated with a given digest ID. - - This function searches for a `.txt` file in a directory corresponding to the provided - digest ID. If a file is found, it is read and returned as a downloadable attachment. - If no `.txt` file is found, an error is raised. - - Parameters - ---------- - digest_id : str - The unique identifier for the digest. It is used to find the corresponding directory - and locate the .txt file within that directory. - - Returns - ------- - Response - A FastAPI Response object containing the content of the found `.txt` file. The file is - sent with the appropriate media type (`text/plain`) and the correct `Content-Disposition` - header to prompt a file download. - - Raises - ------ - HTTPException - If the digest directory is not found or if no `.txt` file exists in the directory. 
- """ - directory = TMP_BASE_PATH / digest_id - - try: - if not directory.exists(): - raise FileNotFoundError("Directory not found") - - txt_files = [f for f in directory.iterdir() if f.suffix == ".txt"] - if not txt_files: - raise FileNotFoundError("No .txt file found") - - except FileNotFoundError as exc: - raise HTTPException(status_code=404, detail="Digest not found") from exc - - # Find the first .txt file in the directory - first_file = txt_files[0] - - with first_file.open(encoding="utf-8") as f: - content = f.read() - - return Response( - content=content, - media_type="text/plain", - headers={"Content-Disposition": f"attachment; filename={first_file.name}"}, - ) - - - -================================================ -FILE: src/server/routers/dynamic.py -================================================ -"""This module defines the dynamic router for handling dynamic path requests.""" - -from fastapi import APIRouter, Form, Request -from fastapi.responses import HTMLResponse - -from server.query_processor import process_query -from server.server_config import templates -from server.server_utils import limiter - -router = APIRouter() - - -@router.get("/{full_path:path}") -async def catch_all(request: Request, full_path: str) -> HTMLResponse: - """ - Render a page with a Git URL based on the provided path. - - This endpoint catches all GET requests with a dynamic path, constructs a Git URL - using the `full_path` parameter, and renders the `git.jinja` template with that URL. - - Parameters - ---------- - request : Request - The incoming request object, which provides context for rendering the response. - full_path : str - The full path extracted from the URL, which is used to build the Git URL. - - Returns - ------- - HTMLResponse - An HTML response containing the rendered template, with the Git URL - and other default parameters such as loading state and file size. - """ - return templates.TemplateResponse( - "git.jinja", - { - "request": request, - "repo_url": full_path, - "loading": True, - "default_file_size": 243, - }, - ) - - -@router.post("/{full_path:path}", response_class=HTMLResponse) -@limiter.limit("10/minute") -async def process_catch_all( - request: Request, - input_text: str = Form(...), - max_file_size: int = Form(...), - pattern_type: str = Form(...), - pattern: str = Form(...), - token: str = Form(...), -) -> HTMLResponse: - """ - Process the form submission with user input for query parameters. - - This endpoint handles POST requests, processes the input parameters (e.g., text, file size, pattern), - and calls the `process_query` function to handle the query logic, returning the result as an HTML response. - - Parameters - ---------- - request : Request - The incoming request object, which provides context for rendering the response. - input_text : str - The input text provided by the user for processing, by default taken from the form. - max_file_size : int - The maximum allowed file size for the input, specified by the user. - pattern_type : str - The type of pattern used for the query, specified by the user. - pattern : str - The pattern string used in the query, specified by the user. - token : str - GitHub personal-access token (PAT). Needed when *input_text* refers to a - **private** repository. - Returns - ------- - HTMLResponse - An HTML response generated after processing the form input and query logic, - which will be rendered and returned to the user. 
- """ - resolved_token = None if token == "" else token - return await process_query( - request, - input_text, - max_file_size, - pattern_type, - pattern, - is_index=False, - token=resolved_token, - ) - - - -================================================ -FILE: src/server/routers/index.py -================================================ -"""This module defines the FastAPI router for the home page of the application.""" - -from fastapi import APIRouter, Form, Request -from fastapi.responses import HTMLResponse - -from server.query_processor import process_query -from server.server_config import EXAMPLE_REPOS, templates -from server.server_utils import limiter - -router = APIRouter() - - -@router.get("/", response_class=HTMLResponse) -async def home(request: Request) -> HTMLResponse: - """ - Render the home page with example repositories and default parameters. - - This endpoint serves the home page of the application, rendering the `index.jinja` template - and providing it with a list of example repositories and default file size values. - - Parameters - ---------- - request : Request - The incoming request object, which provides context for rendering the response. - - Returns - ------- - HTMLResponse - An HTML response containing the rendered home page template, with example repositories - and other default parameters such as file size. - """ - return templates.TemplateResponse( - "index.jinja", - { - "request": request, - "examples": EXAMPLE_REPOS, - "default_file_size": 243, - }, - ) - - -@router.post("/", response_class=HTMLResponse) -@limiter.limit("10/minute") -async def index_post( - request: Request, - input_text: str = Form(...), - max_file_size: int = Form(...), - pattern_type: str = Form(...), - pattern: str = Form(...), - token: str = Form(...), -) -> HTMLResponse: - """ - Process the form submission with user input for query parameters. - - This endpoint handles POST requests from the home page form. It processes the user-submitted - input (e.g., text, file size, pattern type) and invokes the `process_query` function to handle - the query logic, returning the result as an HTML response. - - Parameters - ---------- - request : Request - The incoming request object, which provides context for rendering the response. - input_text : str - The input text provided by the user for processing, by default taken from the form. - max_file_size : int - The maximum allowed file size for the input, specified by the user. - pattern_type : str - The type of pattern used for the query, specified by the user. - pattern : str - The pattern string used in the query, specified by the user. - token : str - GitHub personal-access token (PAT). Needed when *input_text* refers to a - **private** repository. - Returns - ------- - HTMLResponse - An HTML response containing the results of processing the form input and query logic, - which will be rendered and returned to the user. - """ - resolved_token = None if token == "" else token - return await process_query( - request, - input_text, - max_file_size, - pattern_type, - pattern, - is_index=True, - token=resolved_token, - ) - - - -================================================ -FILE: tests/__init__.py -================================================ - - - -================================================ -FILE: tests/conftest.py -================================================ -""" -Fixtures for tests. 
- -This file provides shared fixtures for creating sample queries, a temporary directory structure, and a helper function -to write `.ipynb` notebooks for testing notebook utilities. -""" - -import json -from pathlib import Path -from typing import Any, Callable, Dict, List -from unittest.mock import AsyncMock - -import pytest -from pytest_mock import MockerFixture - -from gitingest.query_parsing import IngestionQuery - -WriteNotebookFunc = Callable[[str, Dict[str, Any]], Path] - -DEMO_URL = "https://github.com/user/repo" -LOCAL_REPO_PATH = "/tmp/repo" - - -@pytest.fixture -def sample_query() -> IngestionQuery: - """ - Provide a default `IngestionQuery` object for use in tests. - - This fixture returns a `IngestionQuery` pre-populated with typical fields and some default ignore patterns. - - Returns - ------- - IngestionQuery - The sample `IngestionQuery` object. - """ - return IngestionQuery( - user_name="test_user", - repo_name="test_repo", - url=None, - subpath="/", - local_path=Path("/tmp/test_repo").resolve(), - slug="test_user/test_repo", - id="id", - branch="main", - max_file_size=1_000_000, - ignore_patterns={"*.pyc", "__pycache__", ".git"}, - include_patterns=None, - ) - - -@pytest.fixture -def temp_directory(tmp_path: Path) -> Path: - """ - Create a temporary directory structure for testing repository scanning. - - The structure includes: - test_repo/ - ├── file1.txt - ├── file2.py - ├── src/ - │ ├── subfile1.txt - │ ├── subfile2.py - │ └── subdir/ - │ ├── file_subdir.txt - │ └── file_subdir.py - ├── dir1/ - │ └── file_dir1.txt - └── dir2/ - └── file_dir2.txt - - Parameters - ---------- - tmp_path : Path - The temporary directory path provided by the `tmp_path` fixture. - - Returns - ------- - Path - The path to the created `test_repo` directory. - """ - test_dir = tmp_path / "test_repo" - test_dir.mkdir() - - # Root files - (test_dir / "file1.txt").write_text("Hello World") - (test_dir / "file2.py").write_text("print('Hello')") - - # src directory and its files - src_dir = test_dir / "src" - src_dir.mkdir() - (src_dir / "subfile1.txt").write_text("Hello from src") - (src_dir / "subfile2.py").write_text("print('Hello from src')") - - # src/subdir and its files - subdir = src_dir / "subdir" - subdir.mkdir() - (subdir / "file_subdir.txt").write_text("Hello from subdir") - (subdir / "file_subdir.py").write_text("print('Hello from subdir')") - - # dir1 and its file - dir1 = test_dir / "dir1" - dir1.mkdir() - (dir1 / "file_dir1.txt").write_text("Hello from dir1") - - # dir2 and its file - dir2 = test_dir / "dir2" - dir2.mkdir() - (dir2 / "file_dir2.txt").write_text("Hello from dir2") - - return test_dir - - -@pytest.fixture -def write_notebook(tmp_path: Path) -> WriteNotebookFunc: - """ - Provide a helper function to write a `.ipynb` notebook file with the given content. - - Parameters - ---------- - tmp_path : Path - The temporary directory path provided by the `tmp_path` fixture. - - Returns - ------- - WriteNotebookFunc - A callable that accepts a filename and a dictionary (representing JSON notebook data), writes it to a `.ipynb` - file, and returns the path to the file. 
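
    Examples
    --------
    Inside a test, a notebook file can be produced like this (sketch)::

        >>> nb_path = write_notebook("demo.ipynb", {"cells": []})  # doctest: +SKIP
        >>> nb_path.suffix  # doctest: +SKIP
        '.ipynb'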
- """ - - def _write_notebook(name: str, content: Dict[str, Any]) -> Path: - notebook_path = tmp_path / name - with notebook_path.open(mode="w", encoding="utf-8") as f: - json.dump(content, f) - return notebook_path - - return _write_notebook - - -@pytest.fixture -def stub_branches(mocker: MockerFixture) -> Callable[[List[str]], None]: - """Return a function that stubs git branch discovery to *branches*.""" - - def _factory(branches: List[str]) -> None: - mocker.patch( - "gitingest.utils.git_utils.run_command", - new_callable=AsyncMock, - return_value=("\n".join(f"refs/heads/{b}" for b in branches).encode() + b"\n", b""), - ) - mocker.patch( - "gitingest.utils.git_utils.fetch_remote_branch_list", - new_callable=AsyncMock, - return_value=branches, - ) - - return _factory - - -@pytest.fixture -def repo_exists_true(mocker: MockerFixture) -> AsyncMock: - """Patch `gitingest.cloning.check_repo_exists` to always return ``True``. - - Many cloning-related tests assume that the remote repository exists. This fixture centralises - that behaviour so individual tests no longer need to repeat the same ``mocker.patch`` call. - The mock object is returned so that tests can make assertions on how it was used or override - its behaviour when needed. - """ - return mocker.patch("gitingest.cloning.check_repo_exists", return_value=True) - - -@pytest.fixture -def run_command_mock(mocker: MockerFixture) -> AsyncMock: - """Patch `gitingest.cloning.run_command` with an ``AsyncMock``. - - The mocked function returns a dummy process whose ``communicate`` method yields generic - *stdout* / *stderr* bytes. Tests can still access / tweak the mock via the fixture argument. - """ - mock_exec = mocker.patch("gitingest.cloning.run_command", new_callable=AsyncMock) - - # Provide a default dummy process so most tests don't have to create one. - dummy_process = AsyncMock() - dummy_process.communicate.return_value = (b"output", b"error") - mock_exec.return_value = dummy_process - - return mock_exec - - - -================================================ -FILE: tests/test_cli.py -================================================ -"""Tests for the Gitingest CLI.""" - -import os -from inspect import signature -from pathlib import Path -from typing import List - -import pytest -from _pytest.monkeypatch import MonkeyPatch -from click.testing import CliRunner, Result - -from gitingest.cli import main -from gitingest.config import MAX_FILE_SIZE, OUTPUT_FILE_NAME - - -@pytest.mark.parametrize( - "cli_args, expect_file", - [ - pytest.param(["./"], True, id="default-options"), - pytest.param( - [ - "./", - "--output", - str(OUTPUT_FILE_NAME), - "--max-size", - str(MAX_FILE_SIZE), - "--exclude-pattern", - "tests/", - "--include-pattern", - "src/", - ], - True, - id="custom-options", - ), - ], -) -def test_cli_writes_file(tmp_path: Path, monkeypatch: MonkeyPatch, cli_args: List[str], expect_file: bool) -> None: - """Run the CLI and verify that the SARIF file is created (or not).""" - # Work inside an isolated temp directory - monkeypatch.chdir(tmp_path) - - result = _invoke_isolated_cli_runner(cli_args) - - assert result.exit_code == 0, result.stderr - - # Summary line should be on STDOUT - stdout_lines = result.stdout.splitlines() - assert f"Analysis complete! 
Output written to: {OUTPUT_FILE_NAME}" in stdout_lines - - # File side-effect - digest_file = tmp_path / OUTPUT_FILE_NAME - assert digest_file.exists() is expect_file, f"{OUTPUT_FILE_NAME} existence did not match expectation" - - -def test_cli_with_stdout_output() -> None: - """Test CLI invocation with output directed to STDOUT.""" - # Clean up any existing digest.txt file before test - if os.path.exists(OUTPUT_FILE_NAME): - os.remove(OUTPUT_FILE_NAME) - - try: - result = _invoke_isolated_cli_runner(["./", "--output", "-", "--exclude-pattern", "tests/"]) - - # ─── core expectations (stdout) ────────────────────────────────────- - assert result.exit_code == 0, f"CLI exited with code {result.exit_code}, stderr: {result.stderr}" - assert "---" in result.stdout, "Expected file separator '---' not found in STDOUT" - assert ( - "src/gitingest/cli.py" in result.stdout - ), "Expected content (e.g., src/gitingest/cli.py) not found in STDOUT" - assert not os.path.exists(OUTPUT_FILE_NAME), f"Output file {OUTPUT_FILE_NAME} was unexpectedly created." - - # ─── the summary must *not* pollute STDOUT, must appear on STDERR ─── - summary = "Analysis complete! Output sent to stdout." - stdout_lines = result.stdout.splitlines() - stderr_lines = result.stderr.splitlines() - assert summary not in stdout_lines, "Unexpected summary message found in STDOUT" - assert summary in stderr_lines, "Expected summary message not found in STDERR" - assert f"Output written to: {OUTPUT_FILE_NAME}" not in stderr_lines - finally: - # Clean up any digest.txt file that might have been created during test - if os.path.exists(OUTPUT_FILE_NAME): - os.remove(OUTPUT_FILE_NAME) - - -def _invoke_isolated_cli_runner(args: List[str]) -> Result: - """Return a CliRunner that keeps stderr apart on Click 8.0-8.1.""" - kwargs = {} - if "mix_stderr" in signature(CliRunner.__init__).parameters: - kwargs["mix_stderr"] = False # Click 8.0–8.1 - runner = CliRunner(**kwargs) - return runner.invoke(main, args) - - - -================================================ -FILE: tests/test_flow_integration.py -================================================ -"""Integration tests covering core functionalities, edge cases, and concurrency handling.""" - -import shutil -from concurrent.futures import ThreadPoolExecutor -from pathlib import Path -from typing import Generator - -import pytest -from fastapi.testclient import TestClient -from pytest import FixtureRequest -from pytest_mock import MockerFixture - -from src.server.main import app - -BASE_DIR = Path(__file__).resolve().parent.parent -TEMPLATE_DIR = BASE_DIR / "src" / "templates" - - -@pytest.fixture(scope="module") -def test_client() -> Generator[TestClient, None, None]: - """Create a test client fixture.""" - with TestClient(app) as client_instance: - client_instance.headers.update({"Host": "localhost"}) - yield client_instance - - -@pytest.fixture(autouse=True) -def mock_static_files(mocker: MockerFixture) -> Generator[None, None, None]: - """Mock the static file mount to avoid directory errors.""" - mock_static = mocker.patch("src.server.main.StaticFiles", autospec=True) - mock_static.return_value = None - yield mock_static - - -@pytest.fixture(autouse=True) -def mock_templates(mocker: MockerFixture) -> Generator[None, None, None]: - """Mock Jinja2 template rendering to bypass actual file loading.""" - mock_template = mocker.patch("starlette.templating.Jinja2Templates.TemplateResponse", autospec=True) - mock_template.return_value = "Mocked Template Response" - yield mock_template - - 
-@pytest.fixture(scope="module", autouse=True) -def cleanup_tmp_dir() -> Generator[None, None, None]: - """Remove /tmp/gitingest after this test-module is done.""" - yield # run tests - temp_dir = Path("/tmp/gitingest") - if temp_dir.exists(): - try: - shutil.rmtree(temp_dir) - except PermissionError as exc: - print(f"Error cleaning up {temp_dir}: {exc}") - - -@pytest.mark.asyncio -async def test_remote_repository_analysis(request: FixtureRequest) -> None: - """Test the complete flow of analyzing a remote repository.""" - client = request.getfixturevalue("test_client") - form_data = { - "input_text": "https://github.com/octocat/Hello-World", - "max_file_size": "243", - "pattern_type": "exclude", - "pattern": "", - "token": "", - } - - response = client.post("/", data=form_data) - assert response.status_code == 200, f"Form submission failed: {response.text}" - assert "Mocked Template Response" in response.text - - -@pytest.mark.asyncio -async def test_invalid_repository_url(request: FixtureRequest) -> None: - """Test handling of an invalid repository URL.""" - client = request.getfixturevalue("test_client") - form_data = { - "input_text": "https://github.com/nonexistent/repo", - "max_file_size": "243", - "pattern_type": "exclude", - "pattern": "", - "token": "", - } - - response = client.post("/", data=form_data) - assert response.status_code == 200, f"Request failed: {response.text}" - assert "Mocked Template Response" in response.text - - -@pytest.mark.asyncio -async def test_large_repository(request: FixtureRequest) -> None: - """Simulate analysis of a large repository with nested folders.""" - client = request.getfixturevalue("test_client") - form_data = { - "input_text": "https://github.com/large/repo-with-many-files", - "max_file_size": "243", - "pattern_type": "exclude", - "pattern": "", - "token": "", - } - - response = client.post("/", data=form_data) - assert response.status_code == 200, f"Request failed: {response.text}" - assert "Mocked Template Response" in response.text - - -@pytest.mark.asyncio -async def test_concurrent_requests(request: FixtureRequest) -> None: - """Test handling of multiple concurrent requests.""" - client = request.getfixturevalue("test_client") - - def make_request(): - form_data = { - "input_text": "https://github.com/octocat/Hello-World", - "max_file_size": "243", - "pattern_type": "exclude", - "pattern": "", - "token": "", - } - response = client.post("/", data=form_data) - assert response.status_code == 200, f"Request failed: {response.text}" - assert "Mocked Template Response" in response.text - - with ThreadPoolExecutor(max_workers=5) as executor: - futures = [executor.submit(make_request) for _ in range(5)] - for future in futures: - future.result() - - -@pytest.mark.asyncio -async def test_large_file_handling(request: FixtureRequest) -> None: - """Test handling of repositories with large files.""" - client = request.getfixturevalue("test_client") - form_data = { - "input_text": "https://github.com/octocat/Hello-World", - "max_file_size": "1", - "pattern_type": "exclude", - "pattern": "", - "token": "", - } - - response = client.post("/", data=form_data) - assert response.status_code == 200, f"Request failed: {response.text}" - assert "Mocked Template Response" in response.text - - -@pytest.mark.asyncio -async def test_repository_with_patterns(request: FixtureRequest) -> None: - """Test repository analysis with include/exclude patterns.""" - client = request.getfixturevalue("test_client") - form_data = { - "input_text": 
"https://github.com/octocat/Hello-World", - "max_file_size": "243", - "pattern_type": "include", - "pattern": "*.md", - "token": "", - } - - response = client.post("/", data=form_data) - assert response.status_code == 200, f"Request failed: {response.text}" - assert "Mocked Template Response" in response.text - - - -================================================ -FILE: tests/test_git_utils.py -================================================ -""" -Tests for the `git_utils` module. - -These tests validate the `validate_github_token` function, which ensures that -GitHub personal access tokens (PATs) are properly formatted. -""" - -import base64 - -import pytest - -from gitingest.utils.exceptions import InvalidGitHubTokenError -from gitingest.utils.git_utils import ( - create_git_auth_header, - create_git_command, - validate_github_token, -) - - -@pytest.mark.parametrize( - "token", - [ - # Valid tokens: correct prefixes and at least 36 allowed characters afterwards - "github_pat_" + "a" * 36, - "ghp_" + "A" * 36, - "github_pat_1234567890abcdef1234567890abcdef1234", - ], -) -def test_validate_github_token_valid(token): - """validate_github_token should accept properly-formatted tokens.""" - # Should not raise any exception - validate_github_token(token) - - -@pytest.mark.parametrize( - "token", - [ - "github_pat_short", # Too short after prefix - "ghp_" + "b" * 35, # one character short - "invalidprefix_" + "c" * 36, # Wrong prefix - "github_pat_" + "!" * 36, # Disallowed characters - "", # Empty string - ], -) -def test_validate_github_token_invalid(token): - """validate_github_token should raise ValueError on malformed tokens.""" - with pytest.raises(InvalidGitHubTokenError): - validate_github_token(token) - - -@pytest.mark.parametrize( - "base_cmd, local_path, url, token, expected_suffix", - [ - ( - ["git", "clone"], - "/some/path", - "https://github.com/owner/repo.git", - None, - [], # No auth header expected when token is None - ), - ( - ["git", "clone"], - "/some/path", - "https://github.com/owner/repo.git", - "ghp_" + "d" * 36, - [ - "-c", - create_git_auth_header("ghp_" + "d" * 36), - ], # Auth header expected for GitHub URL + token - ), - ( - ["git", "clone"], - "/some/path", - "https://gitlab.com/owner/repo.git", - "ghp_" + "e" * 36, - [], # No auth header for non-GitHub URL even if token provided - ), - ], -) -def test_create_git_command(base_cmd, local_path, url, token, expected_suffix): - """create_git_command should build the correct command list based on inputs.""" - cmd = create_git_command(base_cmd, local_path, url, token) - - # The command should start with base_cmd and the -C option - expected_prefix = base_cmd + ["-C", local_path] - assert cmd[: len(expected_prefix)] == expected_prefix - - # The suffix (anything after prefix) should match expected - assert cmd[len(expected_prefix) :] == expected_suffix - - -def test_create_git_command_invalid_token(): - """Supplying an invalid token for a GitHub URL should raise ValueError.""" - with pytest.raises(InvalidGitHubTokenError): - create_git_command( - ["git", "clone"], - "/some/path", - "https://github.com/owner/repo.git", - "invalid_token", - ) - - -@pytest.mark.parametrize( - "token", - [ - "ghp_abcdefghijklmnopqrstuvwxyz012345", # typical ghp_ token - "github_pat_1234567890abcdef1234567890abcdef1234", - ], -) -def test_create_git_auth_header(token): - """create_git_auth_header should produce correct base64-encoded header.""" - header = create_git_auth_header(token) - expected_basic = 
base64.b64encode(f"x-oauth-basic:{token}".encode()).decode() - expected = f"http.https://github.com/.extraheader=Authorization: Basic {expected_basic}" - assert header == expected - - -@pytest.mark.parametrize( - "url, token, should_call", - [ - ("https://github.com/foo/bar.git", "ghp_" + "f" * 36, True), - ("https://github.com/foo/bar.git", None, False), - ("https://gitlab.com/foo/bar.git", "ghp_" + "g" * 36, False), - ], -) -def test_create_git_command_helper_calls(mocker, url, token, should_call): - """Verify validate_github_token & create_git_auth_header are invoked only when appropriate.""" - - validate_mock = mocker.patch("gitingest.utils.git_utils.validate_github_token") - header_mock = mocker.patch("gitingest.utils.git_utils.create_git_auth_header", return_value="HEADER") - - cmd = create_git_command(["git", "clone"], "/tmp", url, token) - - if should_call: - validate_mock.assert_called_once_with(token) - header_mock.assert_called_once_with(token) - assert "HEADER" in cmd - else: - validate_mock.assert_not_called() - header_mock.assert_not_called() - # HEADER should not be included in command list - assert "HEADER" not in cmd - - - -================================================ -FILE: tests/test_gitignore_feature.py -================================================ -""" -Tests for the gitignore functionality in Gitingest. -""" - -from pathlib import Path - -import pytest - -from gitingest.entrypoint import ingest_async -from gitingest.utils.ignore_patterns import load_gitignore_patterns - - -@pytest.fixture(name="repo_path") -def repo_fixture(tmp_path: Path) -> Path: - """ - Create a temporary repository structure with: - - A .gitignore that excludes 'exclude.txt' - - 'include.txt' (should be processed) - - 'exclude.txt' (should be skipped when gitignore rules are respected) - """ - # Create a .gitignore file that excludes 'exclude.txt' - gitignore_file = tmp_path / ".gitignore" - gitignore_file.write_text("exclude.txt\n") - - # Create a file that should be included - include_file = tmp_path / "include.txt" - include_file.write_text("This file should be included.") - - # Create a file that should be excluded - exclude_file = tmp_path / "exclude.txt" - exclude_file.write_text("This file should be excluded.") - - return tmp_path - - -def test_load_gitignore_patterns(tmp_path: Path): - """ - Test that load_gitignore_patterns() correctly loads patterns from a .gitignore file. - """ - gitignore = tmp_path / ".gitignore" - # Write some sample patterns with a comment line included - gitignore.write_text("exclude.txt\n*.log\n# a comment\n") - - patterns = load_gitignore_patterns(tmp_path) - - # Check that the expected patterns are loaded - assert "exclude.txt" in patterns - assert "*.log" in patterns - # Ensure that comment lines are not added - for pattern in patterns: - assert not pattern.startswith("#") - - -@pytest.mark.asyncio -async def test_ingest_with_gitignore(repo_path: Path): - """ - Integration test for ingest_async() respecting .gitignore rules. - - When ``include_gitignored`` is ``False`` (default), the content of 'exclude.txt' should be omitted. - When ``include_gitignored`` is ``True``, both files should be present. - """ - # Run ingestion with the gitignore functionality enabled. - _, _, content_with_ignore = await ingest_async(source=str(repo_path)) - # 'exclude.txt' should be skipped. - assert "This file should be excluded." not in content_with_ignore - # 'include.txt' should be processed. - assert "This file should be included." 
in content_with_ignore - - # Run ingestion with the gitignore functionality disabled. - _, _, content_without_ignore = await ingest_async(source=str(repo_path), include_gitignored=True) - # Now both files should be present. - assert "This file should be excluded." in content_without_ignore - assert "This file should be included." in content_without_ignore - - - -================================================ -FILE: tests/test_ingestion.py -================================================ -""" -Tests for the `query_ingestion` module. - -These tests validate directory scanning, file content extraction, notebook handling, and the overall ingestion logic, -including filtering patterns and subpaths. -""" - -import re -from pathlib import Path -from typing import Set, TypedDict - -import pytest - -from gitingest.ingestion import ingest_query -from gitingest.query_parsing import IngestionQuery - - -def test_run_ingest_query(temp_directory: Path, sample_query: IngestionQuery) -> None: - """ - Test `ingest_query` to ensure it processes the directory and returns expected results. - - Given a directory with .txt and .py files: - When `ingest_query` is invoked, - Then it should produce a summary string listing the files analyzed and a combined content string. - """ - sample_query.local_path = temp_directory - sample_query.subpath = "/" - sample_query.type = None - - summary, _, content = ingest_query(sample_query) - - assert "Repository: test_user/test_repo" in summary - assert "Files analyzed: 8" in summary - - # Check presence of key files in the content - assert "src/subfile1.txt" in content - assert "src/subfile2.py" in content - assert "src/subdir/file_subdir.txt" in content - assert "src/subdir/file_subdir.py" in content - assert "file1.txt" in content - assert "file2.py" in content - assert "dir1/file_dir1.txt" in content - assert "dir2/file_dir2.txt" in content - - -# TODO: Additional tests: -# - Multiple include patterns, e.g. ["*.txt", "*.py"] or ["/src/*", "*.txt"]. -# - Edge cases with weird file names or deep subdirectory structures. 
-# TODO : def test_include_nonexistent_extension - - -class PatternScenario(TypedDict): - include_patterns: Set[str] - ignore_patterns: Set[str] - expected_num_files: int - expected_content: Set[str] - expected_structure: Set[str] - expected_not_structure: Set[str] - - -@pytest.mark.parametrize( - "pattern_scenario", - [ - pytest.param( - PatternScenario( - { - "include_patterns": {"file2.py", "dir2/file_dir2.txt"}, - "ignore_patterns": {*()}, - "expected_num_files": 2, - "expected_content": {"file2.py", "dir2/file_dir2.txt"}, - "expected_structure": {"test_repo/", "dir2/"}, - "expected_not_structure": {"src/", "subdir/", "dir1/"}, - } - ), - id="include-explicit-files", - ), - pytest.param( - PatternScenario( - { - "include_patterns": { - "file1.txt", - "file2.py", - "file_dir1.txt", - "*/file_dir2.txt", - }, - "ignore_patterns": {*()}, - "expected_num_files": 4, - "expected_content": {"file1.txt", "file2.py", "dir1/file_dir1.txt", "dir2/file_dir2.txt"}, - "expected_structure": {"test_repo/", "dir1/", "dir2/"}, - "expected_not_structure": {"src/", "subdir/"}, - } - ), - id="include-wildcard-directory", - ), - pytest.param( - PatternScenario( - { - "include_patterns": {"*.py"}, - "ignore_patterns": {*()}, - "expected_num_files": 3, - "expected_content": { - "file2.py", - "src/subfile2.py", - "src/subdir/file_subdir.py", - }, - "expected_structure": {"test_repo/", "src/", "subdir/"}, - "expected_not_structure": {"dir1/", "dir2/"}, - } - ), - id="include-wildcard-files", - ), - pytest.param( - PatternScenario( - { - "include_patterns": {"**/file_dir2.txt", "src/**/*.py"}, - "ignore_patterns": {*()}, - "expected_num_files": 3, - "expected_content": { - "dir2/file_dir2.txt", - "src/subfile2.py", - "src/subdir/file_subdir.py", - }, - "expected_structure": {"test_repo/", "dir2/", "src/", "subdir/"}, - "expected_not_structure": {"dir1/"}, - } - ), - id="include-recursive-wildcard", - ), - pytest.param( - PatternScenario( - { - "include_patterns": {*()}, - "ignore_patterns": {"file2.py", "dir2/file_dir2.txt"}, - "expected_num_files": 6, - "expected_content": { - "file1.txt", - "src/subfile1.txt", - "src/subfile2.py", - "src/subdir/file_subdir.txt", - "src/subdir/file_subdir.py", - "dir1/file_dir1.txt", - }, - "expected_structure": {"test_repo/", "src/", "subdir/", "dir1/"}, - "expected_not_structure": {"dir2/"}, - } - ), - id="exclude-explicit-files", - ), - pytest.param( - PatternScenario( - { - "include_patterns": {*()}, - "ignore_patterns": {"file1.txt", "file2.py", "*/file_dir1.txt"}, - "expected_num_files": 5, - "expected_content": { - "src/subfile1.txt", - "src/subfile2.py", - "src/subdir/file_subdir.txt", - "src/subdir/file_subdir.py", - "dir2/file_dir2.txt", - }, - "expected_structure": {"test_repo/", "src/", "subdir/", "dir2/"}, - "expected_not_structure": {"dir1/"}, - } - ), - id="exclude-wildcard-directory", - ), - pytest.param( - PatternScenario( - { - "include_patterns": {*()}, - "ignore_patterns": {"src/**/*.py"}, - "expected_num_files": 6, - "expected_content": { - "file1.txt", - "file2.py", - "src/subfile1.txt", - "src/subdir/file_subdir.txt", - "dir1/file_dir1.txt", - "dir2/file_dir2.txt", - }, - "expected_structure": { - "test_repo/", - "dir1/", - "dir2/", - "src/", - "subdir/", - }, - "expected_not_structure": {*()}, - } - ), - id="exclude-recursive-wildcard", - ), - ], -) -def test_include_ignore_patterns( - temp_directory: Path, - sample_query: IngestionQuery, - pattern_scenario: PatternScenario, -) -> None: - """ - Test `ingest_query` to ensure included and ignored paths are 
included and ignored respectively. - - Given a directory with .txt and .py files, and a set of include patterns or a set of ignore patterns: - When `ingest_query` is invoked, - Then it should produce a summary string listing the files analyzed and a combined content string. - """ - - sample_query.local_path = temp_directory - sample_query.subpath = "/" - sample_query.type = None - sample_query.include_patterns = pattern_scenario["include_patterns"] or None - sample_query.ignore_patterns = pattern_scenario["ignore_patterns"] or None - - summary, structure, content = ingest_query(sample_query) - - assert "Repository: test_user/test_repo" in summary - num_files_regex = re.compile(r"^Files analyzed: (\d+)$", re.MULTILINE) - assert (num_files_match := num_files_regex.search(summary)) is not None - assert int(num_files_match.group(1)) == pattern_scenario["expected_num_files"] - - # Check presence of key files in the content - for expected_content_item in pattern_scenario["expected_content"]: - assert expected_content_item in content - - # check presence of included directories in structure - for expected_structure_item in pattern_scenario["expected_structure"]: - assert expected_structure_item in structure - - # check non-presence of non-included directories in structure - for expected_not_structure_item in pattern_scenario["expected_not_structure"]: - assert expected_not_structure_item not in structure - - - -================================================ -FILE: tests/test_notebook_utils.py -================================================ -""" -Tests for the `notebook_utils` module. - -These tests validate how notebooks are processed into Python-like output, ensuring that markdown/raw cells are -converted to triple-quoted blocks, code cells remain executable code, and various edge cases (multiple worksheets, -empty cells, outputs, etc.) are handled appropriately. -""" - -import pytest - -from gitingest.utils.notebook_utils import process_notebook - from tests.conftest import WriteNotebookFunc - - -def test_process_notebook_all_cells(write_notebook: WriteNotebookFunc) -> None: - """ - Test processing a notebook containing markdown, code, and raw cells. - - Given a notebook with: - - One markdown cell - - One code cell - - One raw cell - When `process_notebook` is invoked, - Then markdown and raw cells should appear in triple-quoted blocks, and code cells remain as normal code. - """ - notebook_content = { - "cells": [ - {"cell_type": "markdown", "source": ["# Markdown cell"]}, - {"cell_type": "code", "source": ['print("Hello Code")']}, - {"cell_type": "raw", "source": ["<raw content>"]}, - ] - } - nb_path = write_notebook("all_cells.ipynb", notebook_content) - result = process_notebook(nb_path) - - assert result.count('"""') == 4, "Two non-code cells => 2 triple-quoted blocks => 4 total triple quotes." - - # Ensure markdown and raw cells are in triple quotes - assert "# Markdown cell" in result - assert "<raw content>" in result - - # Ensure code cell is not in triple quotes - assert 'print("Hello Code")' in result - assert '"""\nprint("Hello Code")\n"""' not in result - - -def test_process_notebook_with_worksheets(write_notebook: WriteNotebookFunc) -> None: - """ - Test a notebook containing the (as of IPEP-17 deprecated) 'worksheets' key. - - Given a notebook that uses the 'worksheets' key with a single worksheet, - When `process_notebook` is called, - Then a `DeprecationWarning` should be raised, and the content should match an equivalent notebook - that has top-level 'cells'. 
- """ - with_worksheets = { - "worksheets": [ - { - "cells": [ - {"cell_type": "markdown", "source": ["# Markdown cell"]}, - {"cell_type": "code", "source": ['print("Hello Code")']}, - {"cell_type": "raw", "source": [""]}, - ] - } - ] - } - without_worksheets = with_worksheets["worksheets"][0] # same, but no 'worksheets' key - - nb_with = write_notebook("with_worksheets.ipynb", with_worksheets) - nb_without = write_notebook("without_worksheets.ipynb", without_worksheets) - - with pytest.warns(DeprecationWarning, match="Worksheets are deprecated as of IPEP-17."): - result_with = process_notebook(nb_with) - - # Should not raise a warning - result_without = process_notebook(nb_without) - - assert result_with == result_without, "Content from the single worksheet should match the top-level equivalent." - - -def test_process_notebook_multiple_worksheets(write_notebook: WriteNotebookFunc) -> None: - """ - Test a notebook containing multiple 'worksheets'. - - Given a notebook with two worksheets: - - First with a markdown cell - - Second with a code cell - When `process_notebook` is called, - Then a warning about multiple worksheets should be raised, and the second worksheet's content should appear - in the final output. - """ - multi_worksheets = { - "worksheets": [ - {"cells": [{"cell_type": "markdown", "source": ["# First Worksheet"]}]}, - {"cells": [{"cell_type": "code", "source": ["# Second Worksheet"]}]}, - ] - } - - single_worksheet = { - "worksheets": [ - {"cells": [{"cell_type": "markdown", "source": ["# First Worksheet"]}]}, - ] - } - - nb_multi = write_notebook("multiple_worksheets.ipynb", multi_worksheets) - nb_single = write_notebook("single_worksheet.ipynb", single_worksheet) - - # Expect DeprecationWarning + UserWarning - with pytest.warns( - DeprecationWarning, match="Worksheets are deprecated as of IPEP-17. Consider updating the notebook." - ): - with pytest.warns( - UserWarning, match="Multiple worksheets detected. Combining all worksheets into a single script." - ): - result_multi = process_notebook(nb_multi) - - # Expect DeprecationWarning only - with pytest.warns( - DeprecationWarning, match="Worksheets are deprecated as of IPEP-17. Consider updating the notebook." - ): - result_single = process_notebook(nb_single) - - assert result_multi != result_single, "Two worksheets should produce more content than one." - assert len(result_multi) > len(result_single), "The multi-worksheet notebook should have extra code content." - assert "# First Worksheet" in result_single - assert "# Second Worksheet" not in result_single - assert "# First Worksheet" in result_multi - assert "# Second Worksheet" in result_multi - - -def test_process_notebook_code_only(write_notebook: WriteNotebookFunc) -> None: - """ - Test a notebook containing only code cells. - - Given a notebook with code cells only: - When `process_notebook` is called, - Then no triple quotes should appear in the output. - """ - notebook_content = { - "cells": [ - {"cell_type": "code", "source": ["print('Code Cell 1')"]}, - {"cell_type": "code", "source": ["x = 42"]}, - ] - } - nb_path = write_notebook("code_only.ipynb", notebook_content) - result = process_notebook(nb_path) - - assert '"""' not in result, "No triple quotes expected when there are only code cells." - assert "print('Code Cell 1')" in result - assert "x = 42" in result - - -def test_process_notebook_markdown_only(write_notebook: WriteNotebookFunc) -> None: - """ - Test a notebook with only markdown cells. 
- - Given a notebook with two markdown cells: - When `process_notebook` is called, - Then each markdown cell should become a triple-quoted block (2 blocks => 4 triple quotes total). - """ - notebook_content = { - "cells": [ - {"cell_type": "markdown", "source": ["# Markdown Header"]}, - {"cell_type": "markdown", "source": ["Some more markdown."]}, - ] - } - nb_path = write_notebook("markdown_only.ipynb", notebook_content) - result = process_notebook(nb_path) - - assert result.count('"""') == 4, "Two markdown cells => 2 blocks => 4 triple quotes total." - assert "# Markdown Header" in result - assert "Some more markdown." in result - - -def test_process_notebook_raw_only(write_notebook: WriteNotebookFunc) -> None: - """ - Test a notebook with only raw cells. - - Given two raw cells: - When `process_notebook` is called, - Then each raw cell should become a triple-quoted block (2 blocks => 4 triple quotes total). - """ - notebook_content = { - "cells": [ - {"cell_type": "raw", "source": ["Raw content line 1"]}, - {"cell_type": "raw", "source": ["Raw content line 2"]}, - ] - } - nb_path = write_notebook("raw_only.ipynb", notebook_content) - result = process_notebook(nb_path) - - assert result.count('"""') == 4, "Two raw cells => 2 blocks => 4 triple quotes." - assert "Raw content line 1" in result - assert "Raw content line 2" in result - - -def test_process_notebook_empty_cells(write_notebook: WriteNotebookFunc) -> None: - """ - Test that cells with an empty 'source' are skipped. - - Given a notebook with 4 cells, 3 of which have empty `source`: - When `process_notebook` is called, - Then only the non-empty cell should appear in the output (1 block => 2 triple quotes). - """ - notebook_content = { - "cells": [ - {"cell_type": "markdown", "source": []}, - {"cell_type": "code", "source": []}, - {"cell_type": "raw", "source": []}, - {"cell_type": "markdown", "source": ["# Non-empty markdown"]}, - ] - } - nb_path = write_notebook("empty_cells.ipynb", notebook_content) - result = process_notebook(nb_path) - - assert result.count('"""') == 2, "Only one non-empty cell => 1 block => 2 triple quotes" - assert "# Non-empty markdown" in result - - -def test_process_notebook_invalid_cell_type(write_notebook: WriteNotebookFunc) -> None: - """ - Test a notebook with an unknown cell type. - - Given a notebook cell whose `cell_type` is unrecognized: - When `process_notebook` is called, - Then a ValueError should be raised. - """ - notebook_content = { - "cells": [ - {"cell_type": "markdown", "source": ["# Valid markdown"]}, - {"cell_type": "unknown", "source": ["Unrecognized cell type"]}, - ] - } - nb_path = write_notebook("invalid_cell_type.ipynb", notebook_content) - - with pytest.raises(ValueError, match="Unknown cell type: unknown"): - process_notebook(nb_path) - - -def test_process_notebook_with_output(write_notebook: WriteNotebookFunc) -> None: - """ - Test a notebook that has code cells with outputs. - - Given a code cell and multiple output objects: - When `process_notebook` is called with `include_output=True`, - Then the outputs should be appended as commented lines under the code. 
- """ - notebook_content = { - "cells": [ - { - "cell_type": "code", - "source": [ - "import matplotlib.pyplot as plt\n", - "print('my_data')\n", - "my_data = [1, 2, 3, 4, 5]\n", - "plt.plot(my_data)\n", - "my_data", - ], - "outputs": [ - {"output_type": "stream", "text": ["my_data"]}, - {"output_type": "execute_result", "data": {"text/plain": ["[1, 2, 3, 4, 5]"]}}, - {"output_type": "display_data", "data": {"text/plain": ["
"]}}, - ], - } - ] - } - - nb_path = write_notebook("with_output.ipynb", notebook_content) - with_output = process_notebook(nb_path, include_output=True) - without_output = process_notebook(nb_path, include_output=False) - - expected_source = "\n".join( - [ - "# Jupyter notebook converted to Python script.\n", - "import matplotlib.pyplot as plt", - "print('my_data')", - "my_data = [1, 2, 3, 4, 5]", - "plt.plot(my_data)", - "my_data\n", - ] - ) - expected_output = "\n".join( - [ - "# Output:", - "# my_data", - "# [1, 2, 3, 4, 5]", - "#
\n", - ] - ) - - expected_combined = expected_source + expected_output - - assert with_output == expected_combined, "Should include source code and comment-ified output." - assert without_output == expected_source, "Should include only the source code without output." - - - -================================================ -FILE: tests/test_repository_clone.py -================================================ -""" -Tests for the `cloning` module. - -These tests cover various scenarios for cloning repositories, verifying that the appropriate Git commands are invoked -and handling edge cases such as nonexistent URLs, timeouts, redirects, and specific commits or branches. -""" - -import asyncio -import os -from pathlib import Path -from unittest.mock import AsyncMock - -import pytest -from pytest_mock import MockerFixture - -from gitingest.cloning import clone_repo -from gitingest.schemas import CloneConfig -from gitingest.utils.exceptions import AsyncTimeoutError -from gitingest.utils.git_utils import check_repo_exists -from tests.conftest import DEMO_URL, LOCAL_REPO_PATH - -# All cloning-related tests assume (unless explicitly overridden) that the repository exists. -# Apply the check-repo patch automatically so individual tests don't need to repeat it. -pytestmark = pytest.mark.usefixtures("repo_exists_true") - - -@pytest.mark.asyncio -async def test_clone_with_commit(repo_exists_true: AsyncMock, run_command_mock: AsyncMock) -> None: - """ - Test cloning a repository with a specific commit hash. - - Given a valid URL and a commit hash: - When `clone_repo` is called, - Then the repository should be cloned and checked out at that commit. - """ - clone_config = CloneConfig( - url=DEMO_URL, - local_path=LOCAL_REPO_PATH, - commit="a" * 40, # Simulating a valid commit hash - branch="main", - ) - - await clone_repo(clone_config) - - repo_exists_true.assert_called_once_with(clone_config.url, token=None) - assert run_command_mock.call_count == 2 # Clone and checkout calls - - -@pytest.mark.asyncio -async def test_clone_without_commit(repo_exists_true: AsyncMock, run_command_mock: AsyncMock) -> None: - """ - Test cloning a repository when no commit hash is provided. - - Given a valid URL and no commit hash: - When `clone_repo` is called, - Then only the clone_repo operation should be performed (no checkout). - """ - clone_config = CloneConfig(url=DEMO_URL, local_path=LOCAL_REPO_PATH, commit=None, branch="main") - - await clone_repo(clone_config) - - repo_exists_true.assert_called_once_with(clone_config.url, token=None) - assert run_command_mock.call_count == 1 # Only clone call - - -@pytest.mark.asyncio -async def test_clone_nonexistent_repository(repo_exists_true: AsyncMock) -> None: - """ - Test cloning a nonexistent repository URL. - - Given an invalid or nonexistent URL: - When `clone_repo` is called, - Then a ValueError should be raised with an appropriate error message. 
- """ - clone_config = CloneConfig( - url="https://github.com/user/nonexistent-repo", - local_path=LOCAL_REPO_PATH, - commit=None, - branch="main", - ) - # Override the default fixture behaviour for this test - repo_exists_true.return_value = False - - with pytest.raises(ValueError, match="Repository not found"): - await clone_repo(clone_config) - - repo_exists_true.assert_called_once_with(clone_config.url, token=None) - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - "mock_stdout, return_code, expected", - [ - (b"HTTP/1.1 200 OK\n", 0, True), # Existing repo - (b"HTTP/1.1 404 Not Found\n", 0, False), # Non-existing repo - (b"HTTP/1.1 200 OK\n", 1, False), # Failed request - ], -) -async def test_check_repo_exists(mock_stdout: bytes, return_code: int, expected: bool, mocker: MockerFixture) -> None: - """ - Test the `check_repo_exists` function with different Git HTTP responses. - - Given various stdout lines and return codes: - When `check_repo_exists` is called, - Then it should correctly indicate whether the repository exists. - """ - mock_exec = mocker.patch("asyncio.create_subprocess_exec", new_callable=AsyncMock) - mock_process = AsyncMock() - mock_process.communicate.return_value = (mock_stdout, b"") - mock_process.returncode = return_code - mock_exec.return_value = mock_process - - repo_exists = await check_repo_exists(DEMO_URL) - - assert repo_exists is expected - - -@pytest.mark.asyncio -async def test_clone_with_custom_branch(run_command_mock: AsyncMock) -> None: - """ - Test cloning a repository with a specified custom branch. - - Given a valid URL and a branch: - When `clone_repo` is called, - Then the repository should be cloned shallowly to that branch. - """ - clone_config = CloneConfig(url=DEMO_URL, local_path=LOCAL_REPO_PATH, branch="feature-branch") - - await clone_repo(clone_config) - - run_command_mock.assert_called_once_with( - "git", - "clone", - "--single-branch", - "--depth=1", - "--branch", - "feature-branch", - clone_config.url, - clone_config.local_path, - ) - - -@pytest.mark.asyncio -async def test_git_command_failure(run_command_mock: AsyncMock) -> None: - """ - Test cloning when the Git command fails during execution. - - Given a valid URL, but `run_command` raises a RuntimeError: - When `clone_repo` is called, - Then a RuntimeError should be raised with the correct message. - """ - clone_config = CloneConfig(url=DEMO_URL, local_path=LOCAL_REPO_PATH) - - run_command_mock.side_effect = RuntimeError("Git command failed") - - with pytest.raises(RuntimeError, match="Git command failed"): - await clone_repo(clone_config) - - -@pytest.mark.asyncio -async def test_clone_default_shallow_clone(run_command_mock: AsyncMock) -> None: - """ - Test cloning a repository with the default shallow clone options. - - Given a valid URL and no branch or commit: - When `clone_repo` is called, - Then the repository should be cloned with `--depth=1` and `--single-branch`. - """ - clone_config = CloneConfig(url=DEMO_URL, local_path=LOCAL_REPO_PATH) - - await clone_repo(clone_config) - - run_command_mock.assert_called_once_with( - "git", - "clone", - "--single-branch", - "--depth=1", - clone_config.url, - clone_config.local_path, - ) - - -@pytest.mark.asyncio -async def test_clone_commit_without_branch(run_command_mock: AsyncMock) -> None: - """ - Test cloning when a commit hash is provided but no branch is specified. - - Given a valid URL and a commit hash (but no branch): - When `clone_repo` is called, - Then the repository should be cloned and checked out at that commit. 
- """ - # Simulating a valid commit hash - clone_config = CloneConfig(url=DEMO_URL, local_path=LOCAL_REPO_PATH, commit="a" * 40) - - await clone_repo(clone_config) - - assert run_command_mock.call_count == 2 # Clone and checkout calls - run_command_mock.assert_any_call("git", "clone", "--single-branch", clone_config.url, clone_config.local_path) - run_command_mock.assert_any_call("git", "-C", clone_config.local_path, "checkout", clone_config.commit) - - -@pytest.mark.asyncio -async def test_check_repo_exists_with_redirect(mocker: MockerFixture) -> None: - """ - Test `check_repo_exists` when a redirect (302) is returned. - - Given a URL that responds with "302 Found": - When `check_repo_exists` is called, - Then it should return `False`, indicating the repo is inaccessible. - """ - mock_exec = mocker.patch("asyncio.create_subprocess_exec", new_callable=AsyncMock) - mock_process = AsyncMock() - mock_process.communicate.return_value = (b"HTTP/1.1 302 Found\n", b"") - mock_process.returncode = 0 # Simulate successful request - mock_exec.return_value = mock_process - - repo_exists = await check_repo_exists(DEMO_URL) - - assert repo_exists is False - - -@pytest.mark.asyncio -async def test_check_repo_exists_with_permanent_redirect(mocker: MockerFixture) -> None: - """ - Test `check_repo_exists` when a permanent redirect (301) is returned. - - Given a URL that responds with "301 Found": - When `check_repo_exists` is called, - Then it should return `True`, indicating the repo may exist at the new location. - """ - mock_exec = mocker.patch("asyncio.create_subprocess_exec", new_callable=AsyncMock) - mock_process = AsyncMock() - mock_process.communicate.return_value = (b"HTTP/1.1 301 Found\n", b"") - mock_process.returncode = 0 # Simulate successful request - mock_exec.return_value = mock_process - - repo_exists = await check_repo_exists(DEMO_URL) - - assert repo_exists - - -@pytest.mark.asyncio -async def test_clone_with_timeout(run_command_mock: AsyncMock) -> None: - """ - Test cloning a repository when a timeout occurs. - - Given a valid URL, but `run_command` times out: - When `clone_repo` is called, - Then an `AsyncTimeoutError` should be raised to indicate the operation exceeded time limits. - """ - clone_config = CloneConfig(url=DEMO_URL, local_path=LOCAL_REPO_PATH) - - run_command_mock.side_effect = asyncio.TimeoutError - - with pytest.raises(AsyncTimeoutError, match="Operation timed out after"): - await clone_repo(clone_config) - - -@pytest.mark.asyncio -async def test_clone_specific_branch(tmp_path: Path) -> None: - """ - Test cloning a specific branch of a repository. - - Given a valid repository URL and a branch name: - When `clone_repo` is called, - Then the repository should be cloned and checked out at that branch. - """ - repo_url = "https://github.com/cyclotruc/gitingest.git" - branch_name = "main" - local_path = tmp_path / "gitingest" - clone_config = CloneConfig(url=repo_url, local_path=str(local_path), branch=branch_name) - - await clone_repo(clone_config) - - assert local_path.exists(), "The repository was not cloned successfully." - assert local_path.is_dir(), "The cloned repository path is not a directory." - current_branch = os.popen(f"git -C {local_path} branch --show-current").read().strip() - assert current_branch == branch_name, f"Expected branch '{branch_name}', got '{current_branch}'." - - -@pytest.mark.asyncio -async def test_clone_branch_with_slashes(tmp_path: Path, run_command_mock: AsyncMock) -> None: - """ - Test cloning a branch with slashes in the name. 
- - Given a valid repository URL and a branch name with slashes: - When `clone_repo` is called, - Then the repository should be cloned and checked out at that branch. - """ - branch_name = "fix/in-operator" - local_path = tmp_path / "gitingest" - clone_config = CloneConfig(url=DEMO_URL, local_path=str(local_path), branch=branch_name) - - await clone_repo(clone_config) - - run_command_mock.assert_called_once_with( - "git", - "clone", - "--single-branch", - "--depth=1", - "--branch", - "fix/in-operator", - clone_config.url, - clone_config.local_path, - ) - - -@pytest.mark.asyncio -async def test_clone_creates_parent_directory(tmp_path: Path, run_command_mock: AsyncMock) -> None: - """ - Test that clone_repo creates parent directories if they don't exist. - - Given a local path with non-existent parent directories: - When `clone_repo` is called, - Then it should create the parent directories before attempting to clone. - """ - nested_path = tmp_path / "deep" / "nested" / "path" / "repo" - clone_config = CloneConfig(url=DEMO_URL, local_path=str(nested_path)) - - await clone_repo(clone_config) - - assert nested_path.parent.exists() - run_command_mock.assert_called_once_with( - "git", - "clone", - "--single-branch", - "--depth=1", - clone_config.url, - str(nested_path), - ) - - -@pytest.mark.asyncio -async def test_clone_with_specific_subpath(run_command_mock: AsyncMock) -> None: - """ - Test cloning a repository with a specific subpath. - - Given a valid repository URL and a specific subpath: - When `clone_repo` is called, - Then the repository should be cloned with sparse checkout enabled and the specified subpath. - """ - clone_config = CloneConfig(url=DEMO_URL, local_path=LOCAL_REPO_PATH, subpath="src/docs") - - await clone_repo(clone_config) - - # Verify the clone command includes sparse checkout flags - run_command_mock.assert_any_call( - "git", - "clone", - "--single-branch", - "--filter=blob:none", - "--sparse", - "--depth=1", - clone_config.url, - clone_config.local_path, - ) - - # Verify the sparse-checkout command sets the correct path - run_command_mock.assert_any_call("git", "-C", clone_config.local_path, "sparse-checkout", "set", "src/docs") - - assert run_command_mock.call_count == 2 - - -@pytest.mark.asyncio -async def test_clone_with_commit_and_subpath(run_command_mock: AsyncMock) -> None: - """ - Test cloning a repository with both a specific commit and subpath. - - Given a valid repository URL, commit hash, and subpath: - When `clone_repo` is called, - Then the repository should be cloned with sparse checkout enabled, - checked out at the specific commit, and only include the specified subpath. 
- """ - # Simulating a valid commit hash - clone_config = CloneConfig(url=DEMO_URL, local_path=LOCAL_REPO_PATH, commit="a" * 40, subpath="src/docs") - - await clone_repo(clone_config) - - # Verify the clone command includes sparse checkout flags - run_command_mock.assert_any_call( - "git", - "clone", - "--single-branch", - "--filter=blob:none", - "--sparse", - clone_config.url, - clone_config.local_path, - ) - - # Verify sparse-checkout set - run_command_mock.assert_any_call( - "git", - "-C", - clone_config.local_path, - "sparse-checkout", - "set", - "src/docs", - ) - - # Verify checkout commit - run_command_mock.assert_any_call( - "git", - "-C", - clone_config.local_path, - "checkout", - clone_config.commit, - ) - - assert run_command_mock.call_count == 3 - - - -================================================ -FILE: tests/query_parser/__init__.py -================================================ - - - -================================================ -FILE: tests/query_parser/test_git_host_agnostic.py -================================================ -""" -Tests to verify that the query parser is Git host agnostic. - -These tests confirm that `parse_query` correctly identifies user/repo pairs and canonical URLs for GitHub, GitLab, -Bitbucket, Gitea, and Codeberg, even if the host is omitted. -""" - -from typing import List, Tuple - -import pytest - -from gitingest.query_parsing import parse_query -from gitingest.utils.query_parser_utils import KNOWN_GIT_HOSTS - -# Repository matrix: (host, user, repo) -_REPOS: List[Tuple[str, str, str]] = [ - ("github.com", "tiangolo", "fastapi"), - ("gitlab.com", "gitlab-org", "gitlab-runner"), - ("bitbucket.org", "na-dna", "llm-knowledge-share"), - ("gitea.com", "xorm", "xorm"), - ("codeberg.org", "forgejo", "forgejo"), - ("git.rwth-aachen.de", "medialab", "19squared"), - ("gitlab.alpinelinux.org", "alpine", "apk-tools"), -] - - -# Generate cartesian product of repository tuples with URL variants. -@pytest.mark.parametrize("host, user, repo", _REPOS, ids=[f"{h}:{u}/{r}" for h, u, r in _REPOS]) -@pytest.mark.parametrize("variant", ["full", "noscheme", "slug"]) -@pytest.mark.asyncio -async def test_parse_query_without_host( - host: str, - user: str, - repo: str, - variant: str, -) -> None: - """Verify that `parse_query` handles URLs, host-omitted URLs and raw slugs.""" - - # Build the input URL based on the selected variant - if variant == "full": - url = f"https://{host}/{user}/{repo}" - elif variant == "noscheme": - url = f"{host}/{user}/{repo}" - else: # "slug" - url = f"{user}/{repo}" - - expected_url = f"https://{host}/{user}/{repo}" - - # For slug form with a custom host (not in KNOWN_GIT_HOSTS) we expect a failure, - # because the parser cannot guess which domain to use. - if variant == "slug" and host not in KNOWN_GIT_HOSTS: - with pytest.raises(ValueError): - await parse_query(url, max_file_size=50, from_web=True) - return - - query = await parse_query(url, max_file_size=50, from_web=True) - - # Compare against the canonical dict while ignoring unpredictable fields. 
- actual = query.model_dump(exclude={"id", "local_path", "ignore_patterns"}) - - expected = { - "user_name": user, - "repo_name": repo, - "url": expected_url, - "slug": f"{user}-{repo}", - "subpath": "/", - "type": None, - "branch": None, - "commit": None, - "max_file_size": 50, - "include_patterns": None, - } - - assert actual == expected - - - -================================================ -FILE: tests/query_parser/test_query_parser.py -================================================ -""" -Tests for the `query_parsing` module. - -These tests cover URL parsing, pattern parsing, and handling of branches/subpaths for HTTP(S) repositories and local -paths. -""" - -from pathlib import Path -from typing import Callable, List, Optional -from unittest.mock import AsyncMock - -import pytest -from pytest_mock import MockerFixture - -from gitingest.query_parsing import _parse_patterns, _parse_remote_repo, parse_query -from gitingest.schemas.ingestion_schema import IngestionQuery -from gitingest.utils.ignore_patterns import DEFAULT_IGNORE_PATTERNS -from tests.conftest import DEMO_URL - -URLS_HTTPS: List[str] = [ - DEMO_URL, - "https://gitlab.com/user/repo", - "https://bitbucket.org/user/repo", - "https://gitea.com/user/repo", - "https://codeberg.org/user/repo", - "https://gist.github.com/user/repo", - "https://git.example.com/user/repo", - "https://gitlab.example.com/user/repo", - "https://gitlab.example.se/user/repo", -] - -URLS_HTTP: List[str] = [url.replace("https://", "http://") for url in URLS_HTTPS] - - -@pytest.mark.parametrize("url", URLS_HTTPS, ids=lambda u: u) -@pytest.mark.asyncio -async def test_parse_url_valid_https(url: str) -> None: - """Valid HTTPS URLs parse correctly and `query.url` equals the input.""" - query = await _assert_basic_repo_fields(url) - - assert query.url == url # HTTPS: canonical URL should equal input - - -@pytest.mark.parametrize("url", URLS_HTTP, ids=lambda u: u) -@pytest.mark.asyncio -async def test_parse_url_valid_http(url: str) -> None: - """Valid HTTP URLs parse correctly (slug check only).""" - await _assert_basic_repo_fields(url) - - -@pytest.mark.asyncio -async def test_parse_url_invalid() -> None: - """ - Test `_parse_remote_repo` with an invalid URL. - - Given an HTTPS URL lacking a repository structure (e.g., "https://github.com"), - When `_parse_remote_repo` is called, - Then a ValueError should be raised indicating an invalid repository URL. - """ - url = "https://github.com" - - with pytest.raises(ValueError, match="Invalid repository URL"): - await _parse_remote_repo(url) - - -@pytest.mark.asyncio -@pytest.mark.parametrize("url", [DEMO_URL, "https://gitlab.com/user/repo"]) -async def test_parse_query_basic(url: str) -> None: - """ - Test `parse_query` with a basic valid repository URL. - - Given an HTTPS URL and ignore_patterns="*.txt": - When `parse_query` is called, - Then user/repo, URL, and ignore patterns should be parsed correctly. - """ - query = await parse_query(source=url, max_file_size=50, from_web=True, ignore_patterns="*.txt") - - assert query.user_name == "user" - assert query.repo_name == "repo" - assert query.url == url - assert query.ignore_patterns - assert "*.txt" in query.ignore_patterns - - -@pytest.mark.asyncio -async def test_parse_query_mixed_case() -> None: - """ - Test `parse_query` with mixed-case URLs. - - Given a URL with mixed-case parts (e.g. "Https://GitHub.COM/UsEr/rEpO"): - When `parse_query` is called, - Then the user and repo names should be normalized to lowercase. 
- """ - url = "Https://GitHub.COM/UsEr/rEpO" - query = await parse_query(url, max_file_size=50, from_web=True) - - assert query.user_name == "user" - assert query.repo_name == "repo" - - -@pytest.mark.asyncio -async def test_parse_query_include_pattern() -> None: - """ - Test `parse_query` with a specified include pattern. - - Given a URL and include_patterns="*.py": - When `parse_query` is called, - Then the include pattern should be set, and default ignore patterns remain applied. - """ - query = await parse_query(DEMO_URL, max_file_size=50, from_web=True, include_patterns="*.py") - - assert query.include_patterns == {"*.py"} - assert query.ignore_patterns == DEFAULT_IGNORE_PATTERNS - - -@pytest.mark.asyncio -async def test_parse_query_invalid_pattern() -> None: - """ - Test `parse_query` with an invalid pattern. - - Given an include pattern containing special characters (e.g., "*.py;rm -rf"): - When `parse_query` is called, - Then a ValueError should be raised indicating invalid characters. - """ - with pytest.raises(ValueError, match="Pattern.*contains invalid characters"): - await parse_query(DEMO_URL, max_file_size=50, from_web=True, include_patterns="*.py;rm -rf") - - -@pytest.mark.asyncio -async def test_parse_url_with_subpaths(stub_branches: Callable[[List[str]], None]) -> None: - """ - Test `_parse_remote_repo` with a URL containing branch and subpath. - - Given a URL referencing a branch ("main") and a subdir ("subdir/file"): - When `_parse_remote_repo` is called with remote branch fetching, - Then user, repo, branch, and subpath should be identified correctly. - """ - url = DEMO_URL + "/tree/main/subdir/file" - - stub_branches(["main", "dev", "feature-branch"]) - - query = await _assert_basic_repo_fields(url) - - assert query.user_name == "user" - assert query.repo_name == "repo" - assert query.branch == "main" - assert query.subpath == "/subdir/file" - - -@pytest.mark.asyncio -async def test_parse_url_invalid_repo_structure() -> None: - """ - Test `_parse_remote_repo` with a URL missing a repository name. - - Given a URL like "https://github.com/user": - When `_parse_remote_repo` is called, - Then a ValueError should be raised indicating an invalid repository URL. - """ - url = "https://github.com/user" - - with pytest.raises(ValueError, match="Invalid repository URL"): - await _parse_remote_repo(url) - - -def test_parse_patterns_valid() -> None: - """ - Test `_parse_patterns` with valid comma-separated patterns. - - Given patterns like "*.py, *.md, docs/*": - When `_parse_patterns` is called, - Then it should return a set of parsed strings. - """ - patterns = "*.py, *.md, docs/*" - parsed_patterns = _parse_patterns(patterns) - - assert parsed_patterns == {"*.py", "*.md", "docs/*"} - - -def test_parse_patterns_invalid_characters() -> None: - """ - Test `_parse_patterns` with invalid characters. - - Given a pattern string containing special characters (e.g. "*.py;rm -rf"): - When `_parse_patterns` is called, - Then a ValueError should be raised indicating invalid pattern syntax. - """ - patterns = "*.py;rm -rf" - - with pytest.raises(ValueError, match="Pattern.*contains invalid characters"): - _parse_patterns(patterns) - - -@pytest.mark.asyncio -async def test_parse_query_with_large_file_size() -> None: - """ - Test `parse_query` with a very large file size limit. - - Given a URL and max_file_size=10**9: - When `parse_query` is called, - Then `max_file_size` should be set correctly and default ignore patterns remain unchanged. 
- """ - query = await parse_query(DEMO_URL, max_file_size=10**9, from_web=True) - - assert query.max_file_size == 10**9 - assert query.ignore_patterns == DEFAULT_IGNORE_PATTERNS - - -@pytest.mark.asyncio -async def test_parse_query_empty_patterns() -> None: - """ - Test `parse_query` with empty patterns. - - Given empty include_patterns and ignore_patterns: - When `parse_query` is called, - Then include_patterns becomes None and default ignore patterns apply. - """ - query = await parse_query(DEMO_URL, max_file_size=50, from_web=True, include_patterns="", ignore_patterns="") - - assert query.include_patterns is None - assert query.ignore_patterns == DEFAULT_IGNORE_PATTERNS - - -@pytest.mark.asyncio -async def test_parse_query_include_and_ignore_overlap() -> None: - """ - Test `parse_query` with overlapping patterns. - - Given include="*.py" and ignore={"*.py", "*.txt"}: - When `parse_query` is called, - Then "*.py" should be removed from ignore patterns. - """ - query = await parse_query( - DEMO_URL, - max_file_size=50, - from_web=True, - include_patterns="*.py", - ignore_patterns={"*.py", "*.txt"}, - ) - - assert query.include_patterns == {"*.py"} - assert query.ignore_patterns is not None - assert "*.py" not in query.ignore_patterns - assert "*.txt" in query.ignore_patterns - - -@pytest.mark.asyncio -async def test_parse_query_local_path() -> None: - """ - Test `parse_query` with a local file path. - - Given "/home/user/project" and from_web=False: - When `parse_query` is called, - Then the local path should be set, id generated, and slug formed accordingly. - """ - path = "/home/user/project" - query = await parse_query(path, max_file_size=100, from_web=False) - tail = Path("home/user/project") - - assert query.local_path.parts[-len(tail.parts) :] == tail.parts - assert query.id is not None - assert query.slug == "home/user/project" - - -@pytest.mark.asyncio -async def test_parse_query_relative_path() -> None: - """ - Test `parse_query` with a relative path. - - Given "./project" and from_web=False: - When `parse_query` is called, - Then local_path resolves relatively, and slug ends with "project". - """ - path = "./project" - query = await parse_query(path, max_file_size=100, from_web=False) - tail = Path("project") - - assert query.local_path.parts[-len(tail.parts) :] == tail.parts - assert query.slug.endswith("project") - - -@pytest.mark.asyncio -async def test_parse_query_empty_source() -> None: - """ - Test `parse_query` with an empty string. - - Given an empty source string: - When `parse_query` is called, - Then a ValueError should be raised indicating an invalid repository URL. - """ - url = "" - - with pytest.raises(ValueError, match="Invalid repository URL"): - await parse_query(url, max_file_size=100, from_web=True) - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - "path, expected_branch, expected_commit", - [ - ("/tree/main", "main", None), - ("/tree/abcd1234abcd1234abcd1234abcd1234abcd1234", None, "abcd1234abcd1234abcd1234abcd1234abcd1234"), - ], -) -async def test_parse_url_branch_and_commit_distinction( - path: str, - expected_branch: str, - expected_commit: str, - stub_branches: Callable[[List[str]], None], -) -> None: - """ - Test `_parse_remote_repo` distinguishing branch vs. commit hash. - - Given either a branch URL (e.g., ".../tree/main") or a 40-character commit URL: - When `_parse_remote_repo` is called with branch fetching, - Then the function should correctly set `branch` or `commit` based on the URL content. 
- """ - stub_branches(["main", "dev", "feature-branch"]) - - url = DEMO_URL + path - query = await _assert_basic_repo_fields(url) - - assert query.branch == expected_branch - assert query.commit == expected_commit - - -@pytest.mark.asyncio -async def test_parse_query_uuid_uniqueness() -> None: - """ - Test `parse_query` for unique UUID generation. - - Given the same path twice: - When `parse_query` is called repeatedly, - Then each call should produce a different query id. - """ - path = "/home/user/project" - query_1 = await parse_query(path, max_file_size=100, from_web=False) - query_2 = await parse_query(path, max_file_size=100, from_web=False) - - assert query_1.id != query_2.id - - -@pytest.mark.asyncio -async def test_parse_url_with_query_and_fragment() -> None: - """ - Test `_parse_remote_repo` with query parameters and a fragment. - - Given a URL like "https://github.com/user/repo?arg=value#fragment": - When `_parse_remote_repo` is called, - Then those parts should be stripped, leaving a clean user/repo URL. - """ - url = DEMO_URL + "?arg=value#fragment" - query = await _parse_remote_repo(url) - - assert query.user_name == "user" - assert query.repo_name == "repo" - assert query.url == DEMO_URL # URL should be cleaned - - -@pytest.mark.asyncio -async def test_parse_url_unsupported_host() -> None: - """ - Test `_parse_remote_repo` with an unsupported host. - - Given "https://only-domain.com": - When `_parse_remote_repo` is called, - Then a ValueError should be raised for the unknown domain. - """ - url = "https://only-domain.com" - - with pytest.raises(ValueError, match="Unknown domain 'only-domain.com' in URL"): - await _parse_remote_repo(url) - - -@pytest.mark.asyncio -async def test_parse_query_with_branch() -> None: - """ - Test `parse_query` when a branch is specified in a blob path. - - Given "https://github.com/pandas-dev/pandas/blob/2.2.x/...": - When `parse_query` is called, - Then the branch should be identified, subpath set, and commit remain None. - """ - url = "https://github.com/pandas-dev/pandas/blob/2.2.x/.github/ISSUE_TEMPLATE/documentation_improvement.yaml" - query = await parse_query(url, max_file_size=10**9, from_web=True) - - assert query.user_name == "pandas-dev" - assert query.repo_name == "pandas" - assert query.url == "https://github.com/pandas-dev/pandas" - assert query.slug == "pandas-dev-pandas" - assert query.id is not None - assert query.subpath == "/.github/ISSUE_TEMPLATE/documentation_improvement.yaml" - assert query.branch == "2.2.x" - assert query.commit is None - assert query.type == "blob" - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - "path, expected_branch, expected_subpath", - [ - ("/tree/main/src", "main", "/src"), - ("/tree/fix1", "fix1", "/"), - ("/tree/nonexistent-branch/src", "nonexistent-branch", "/src"), - ], -) -async def test_parse_repo_source_with_failed_git_command( - path: str, - expected_branch: str, - expected_subpath: str, - mocker: MockerFixture, -) -> None: - """ - Test `_parse_remote_repo` when git fetch fails. - - Given a URL referencing a branch, but Git fetching fails: - When `_parse_remote_repo` is called, - Then it should fall back to path components for branch identification. 
- """ - url = DEMO_URL + path - - mock_fetch_branches = mocker.patch("gitingest.utils.git_utils.fetch_remote_branch_list", new_callable=AsyncMock) - mock_fetch_branches.side_effect = Exception("Failed to fetch branch list") - - with pytest.warns( - RuntimeWarning, - match="Warning: Failed to fetch branch list: Command failed: " - "git ls-remote --heads https://github.com/user/repo", - ): - query = await _parse_remote_repo(url) - - assert query.branch == expected_branch - assert query.subpath == expected_subpath - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - ("path", "expected_branch", "expected_subpath"), - [ - ("/tree/feature/fix1/src", "feature/fix1", "/src"), - ("/tree/main/src", "main", "/src"), - ("", None, "/"), - ("/tree/nonexistent-branch/src", None, "/"), - ("/tree/fix", "fix", "/"), - ("/blob/fix/page.html", "fix", "/page.html"), - ], -) -async def test_parse_repo_source_with_various_url_patterns( - path: str, - expected_branch: Optional[str], - expected_subpath: str, - stub_branches: Callable[[List[str]], None], -) -> None: - """ - `_parse_remote_repo` should detect (or reject) a branch and resolve the - sub-path for various GitHub-style URL permutations. - - Branch discovery is stubbed so that only names passed to `stub_branches` are considered "remote". - """ - stub_branches(["feature/fix1", "main", "feature-branch", "fix"]) - - url = DEMO_URL + path - query = await _assert_basic_repo_fields(url) - - assert query.branch == expected_branch - assert query.subpath == expected_subpath - - -async def _assert_basic_repo_fields(url: str) -> IngestionQuery: - """Run _parse_remote_repo and assert user, repo and slug are parsed.""" - - query = await _parse_remote_repo(url) - - assert query.user_name == "user" - assert query.repo_name == "repo" - assert query.slug == "user-repo" - - return query - - From c818dc10d5d253a5a2771a4c5a8d3cb60ece26a1 Mon Sep 17 00:00:00 2001 From: Sendi John Date: Sat, 28 Jun 2025 19:15:26 +0100 Subject: [PATCH 4/9] fix: ensure all files end with a single newline --- src/gitingest/cli.py | 2 +- test.txt | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/gitingest/cli.py b/src/gitingest/cli.py index 8573acf6..e1a0e0c3 100644 --- a/src/gitingest/cli.py +++ b/src/gitingest/cli.py @@ -212,4 +212,4 @@ async def _async_main( if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/test.txt b/test.txt index 0e2d7c2c..26eb34ed 100644 --- a/test.txt +++ b/test.txt @@ -5924,5 +5924,3 @@ async def _assert_basic_repo_fields(url: str) -> IngestionQuery: assert query.slug == "user-repo" return query - - From 8af650a63e6cd041938575d6f512c713e90166bb Mon Sep 17 00:00:00 2001 From: Filip Christiansen <22807962+filipchristiansen@users.noreply.github.com> Date: Tue, 1 Jul 2025 05:00:22 +0200 Subject: [PATCH 5/9] Update src/gitingest/cli.py --- src/gitingest/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gitingest/cli.py b/src/gitingest/cli.py index e5295d60..d9320068 100644 --- a/src/gitingest/cli.py +++ b/src/gitingest/cli.py @@ -195,4 +195,4 @@ async def _async_main( if __name__ == "__main__": - main() \ No newline at end of file + main() From 2a49b20436bc4554da9388259bfd952cb2b8caa9 Mon Sep 17 00:00:00 2001 From: Filip Christiansen <22807962+filipchristiansen@users.noreply.github.com> Date: Tue, 1 Jul 2025 05:25:34 +0200 Subject: [PATCH 6/9] docs --- src/gitingest/cli.py | 46 +++++++++++++++----------------------------- 1 file changed, 16 insertions(+), 30 deletions(-) diff --git 
a/src/gitingest/cli.py b/src/gitingest/cli.py index d9320068..57476631 100644 --- a/src/gitingest/cli.py +++ b/src/gitingest/cli.py @@ -26,12 +26,6 @@ class _CLIArgs(TypedDict): @click.command() @click.argument("source", type=str, default=".") -@click.option( - "--output", - "-o", - default=None, - help="Output file path (default: digest.txt in current directory). Use '-' for stdout.", -) @click.option( "--max-size", "-s", @@ -63,32 +57,23 @@ class _CLIArgs(TypedDict): "If omitted, the CLI will look for the GITHUB_TOKEN environment variable." ), ) +@click.option( + "--output", + "-o", + default=None, + help="Output file path (default: digest.txt in current directory). Use '-' for stdout.", +) def main(**cli_kwargs: Unpack[_CLIArgs]) -> None: - """ - Main entry point for the CLI. This function is called when the CLI is run as a script. - - It calls the async main function to run the command. + """Run the CLI entry point to analyze a repo / directory and dump its contents. Parameters ---------- - source : str - A directory path or a Git repository URL. - output : str, optional - The path where the output file will be written. If not specified, the output will be written - to a file named `digest.txt` in the current directory. Use '-' to output to stdout. - max_size : int - Maximum file size (in bytes) to consider. - exclude_pattern : Tuple[str, ...] - Glob patterns for pruning the file set. - include_pattern : Tuple[str, ...] - Glob patterns for including files in the output. - branch : str, optional - Specific branch to ingest (defaults to the repository's default). - include_gitignored : bool - If provided, include files normally ignored by .gitignore. - token: str, optional - GitHub personal-access token (PAT). Needed when *source* refers to a - **private** repository. Can also be set via the ``GITHUB_TOKEN`` env var. + **cli_kwargs : Unpack[_CLIArgs] + A dictionary of keyword arguments forwarded to ``ingest_async``. + + Notes + ----- + See ``ingest_async`` for a detailed description of each argument. Examples -------- @@ -108,6 +93,7 @@ def main(**cli_kwargs: Unpack[_CLIArgs]) -> None: Private repositories: $ gitingest https://github.com/user/private-repo -t ghp_token $ GITHUB_TOKEN=ghp_token gitingest https://github.com/user/private-repo + """ asyncio.run(_async_main(**cli_kwargs)) @@ -147,8 +133,8 @@ async def _async_main( GitHub personal access token (PAT) for accessing private repositories. Can also be set via the ``GITHUB_TOKEN`` environment variable. output : str | None - The path where the output file will be written. If not specified, the output will be written - to a file named `digest.txt` in the current directory. Use '-' to output to stdout. + The path where the output file will be written (default: ``digest.txt`` in current directory). + Use ``"-"`` to write to ``stdout``. 
Raises ------ From e387d7b4a4e7e5e1dc5f4c213a20a32ec23f8b2c Mon Sep 17 00:00:00 2001 From: Filip Christiansen <22807962+filipchristiansen@users.noreply.github.com> Date: Tue, 1 Jul 2025 05:27:31 +0200 Subject: [PATCH 7/9] docs --- src/static/llm.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/static/llm.txt b/src/static/llm.txt index bf09c404..2c302f9b 100644 --- a/src/static/llm.txt +++ b/src/static/llm.txt @@ -184,12 +184,12 @@ gitingest https://github.com/user/repo -i "*.py" -s 51200 -o - ``` **Key Parameters for AI Agents**: -- `-o` / `--output`: Stream to STDOUT with `-` (default saves to `digest.txt`) - `-s` / `--max-size`: Maximum file size in bytes to process (default: no limit) - `-i` / `--include-pattern`: Include files matching Unix shell-style wildcards - `-e` / `--exclude-pattern`: Exclude files matching Unix shell-style wildcards - `-b` / `--branch`: Specify branch to analyze (defaults to repository's default branch) - `-t` / `--token`: GitHub personal access token for private repositories +- `-o` / `--output`: Stream to STDOUT with `-` (default saves to `digest.txt`) ### 4.2 Python Package (Best for Code Integration) ```python From 6ef39d92451ca5e8828742dfcdb02ed984e0be68 Mon Sep 17 00:00:00 2001 From: Filip Christiansen <22807962+filipchristiansen@users.noreply.github.com> Date: Tue, 1 Jul 2025 05:30:09 +0200 Subject: [PATCH 8/9] docs --- src/gitingest/cli.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/gitingest/cli.py b/src/gitingest/cli.py index 57476631..64ef463c 100644 --- a/src/gitingest/cli.py +++ b/src/gitingest/cli.py @@ -78,17 +78,17 @@ def main(**cli_kwargs: Unpack[_CLIArgs]) -> None: Examples -------- Basic usage: - $ gitingest . + $ gitingest $ gitingest /path/to/repo $ gitingest https://github.com/user/repo Output to stdout: - $ gitingest . -o - + $ gitingest -o - $ gitingest https://github.com/user/repo --output - With filtering: - $ gitingest . -i "*.py" -e "*.log" - $ gitingest . --include-pattern "*.js" --exclude-pattern "node_modules/*" + $ gitingest -i "*.py" -e "*.log" + $ gitingest --include-pattern "*.js" --exclude-pattern "node_modules/*" Private repositories: $ gitingest https://github.com/user/private-repo -t ghp_token From 774da4c343bee035cfb6fa7f7b44c3baf783e86c Mon Sep 17 00:00:00 2001 From: Filip Christiansen <22807962+filipchristiansen@users.noreply.github.com> Date: Tue, 1 Jul 2025 09:05:53 +0200 Subject: [PATCH 9/9] fix --- src/gitingest/utils/git_utils.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/gitingest/utils/git_utils.py b/src/gitingest/utils/git_utils.py index 52fd319e..70f27185 100644 --- a/src/gitingest/utils/git_utils.py +++ b/src/gitingest/utils/git_utils.py @@ -102,7 +102,6 @@ async def check_repo_exists(url: str, token: str | None = None) -> bool: If the curl command returns an unexpected status code. 
""" - expected_path_length = 2 if token and is_github_host(url): return await _check_github_repo_exists(url, token=token) @@ -121,11 +120,13 @@ async def check_repo_exists(url: str, token: str | None = None) -> bool: response = stdout.decode() status_line = response.splitlines()[0].strip() parts = status_line.split(" ") + + expected_path_length = 2 if len(parts) >= expected_path_length: - status_code_str = parts[1] - if status_code_str in ("200", "301"): + status = parts[1] + if status in ("200", "301"): return True - if status_code_str in ("302", "404"): + if status in ("302", "404"): return False msg = f"Unexpected status line: {status_line}" raise RuntimeError(msg)