diff --git a/src/gitingest/cli.py b/src/gitingest/cli.py index 549b5945..237957a4 100644 --- a/src/gitingest/cli.py +++ b/src/gitingest/cli.py @@ -1,21 +1,23 @@ -""" Command-line interface for the Gitingest package. """ +"""Command-line interface for the Gitingest package.""" # pylint: disable=no-value-for-parameter - import asyncio +from pathlib import Path import click -from gitingest.config import MAX_FILE_SIZE, OUTPUT_FILE_PATH +from gitingest.config import MAX_FILE_SIZE +from gitingest.query_parser import _parse_patterns, parse_ignore_file from gitingest.repository_ingest import ingest @click.command() @click.argument("source", type=str, default=".") -@click.option("--output", "-o", default=None, help="Output file path (default: .txt in current directory)") +@click.option("--output", "-o", default=None, help="Output file path (default: digest.txt)") @click.option("--max-size", "-s", default=MAX_FILE_SIZE, help="Maximum file size to process in bytes") -@click.option("--exclude-pattern", "-e", multiple=True, help="Patterns to exclude") -@click.option("--include-pattern", "-i", multiple=True, help="Patterns to include") +@click.option("--exclude-pattern", "-e", multiple=True, help="Patterns to exclude (space-separated patterns allowed)") +@click.option("--include-pattern", "-i", multiple=True, help="Patterns to include (space-separated patterns allowed)") +@click.option("--ignore-file", default=".gitingestignore", help="Path to ignore file (default: .gitingestignore)") @click.option("--branch", "-b", default=None, help="Branch to clone and ingest") def main( source: str, @@ -23,60 +25,64 @@ def main( max_size: int, exclude_pattern: tuple[str, ...], include_pattern: tuple[str, ...], + ignore_file: str, branch: str | None, ): """ - Main entry point for the CLI. This function is called when the CLI is run as a script. - - It calls the async main function to run the command. + Main entry point for the CLI. Parameters ---------- source : str The source directory or repository to analyze. output : str | None - The path where the output file will be written. If not specified, the output will be written - to a file named `.txt` in the current directory. + The path where the output file will be written. If not specified, the output + will be written to a file named `digest.txt` in the current directory. max_size : int The maximum file size to process, in bytes. Files larger than this size will be ignored. exclude_pattern : tuple[str, ...] A tuple of patterns to exclude during the analysis. Files matching these patterns will be ignored. include_pattern : tuple[str, ...] A tuple of patterns to include during the analysis. Only files matching these patterns will be processed. + ignore_file : str + Path to the ignore file containing additional patterns to exclude. branch : str | None The branch to clone (optional). """ - # Main entry point for the CLI. This function is called when the CLI is run as a script. - asyncio.run(_async_main(source, output, max_size, exclude_pattern, include_pattern, branch)) + asyncio.run(async_main(source, output, max_size, exclude_pattern, include_pattern, ignore_file, branch)) -async def _async_main( +async def async_main( source: str, output: str | None, max_size: int, exclude_pattern: tuple[str, ...], include_pattern: tuple[str, ...], + ignore_file: str, branch: str | None, ) -> None: """ Analyze a directory or repository and create a text dump of its contents. - This command analyzes the contents of a specified source directory or repository, applies custom include and - exclude patterns, and generates a text summary of the analysis which is then written to an output file. + This command analyzes the contents of a specified source directory or repository, + applies custom include and exclude patterns, and generates a text summary of the + analysis which is then written to an output file. Parameters ---------- source : str The source directory or repository to analyze. output : str | None - The path where the output file will be written. If not specified, the output will be written - to a file named `.txt` in the current directory. + The path where the output file will be written. If not specified, the output + will be written to a file named `digest.txt` in the current directory. max_size : int The maximum file size to process, in bytes. Files larger than this size will be ignored. exclude_pattern : tuple[str, ...] A tuple of patterns to exclude during the analysis. Files matching these patterns will be ignored. include_pattern : tuple[str, ...] A tuple of patterns to include during the analysis. Only files matching these patterns will be processed. + ignore_file : str + Path to the ignore file containing additional patterns to exclude. branch : str | None The branch to clone (optional). @@ -86,21 +92,39 @@ async def _async_main( If there is an error during the execution of the command, this exception is raised to abort the process. """ try: - # Combine default and custom ignore patterns - exclude_patterns = set(exclude_pattern) - include_patterns = set(include_pattern) - + # Set default output filename if not provided if not output: - output = OUTPUT_FILE_PATH - summary, _, _ = await ingest(source, max_size, include_patterns, exclude_patterns, branch, output=output) + output = "digest.txt" + + # Parse command line patterns + exclude_patterns = _parse_patterns(exclude_pattern) + include_patterns = _parse_patterns(include_pattern) + + # Read and add patterns from ignore file + ignore_file_path = Path(source) / ignore_file + ignore_patterns = parse_ignore_file(ignore_file_path) + exclude_patterns.update(ignore_patterns) + # Perform the ingest operation with branch support + summary, *_ = await ingest(source, max_size, include_patterns, exclude_patterns, branch=branch, output=output) + + # Display results click.echo(f"Analysis complete! Output written to: {output}") click.echo("\nSummary:") click.echo(summary) - except Exception as e: - click.echo(f"Error: {e}", err=True) + except FileNotFoundError as e: + click.echo(f"Error: Source directory not found - {e}", err=True) raise click.Abort() + except PermissionError as e: + click.echo(f"Error: Permission denied - {e}", err=True) + raise click.Abort() + except Exception as e: + click.echo(f"Warning: An error occurred - {e}", err=True) + # For non-critical errors, we might want to continue rather than abort + if isinstance(e, (OSError, IOError)): + raise click.Abort() + return if __name__ == "__main__": diff --git a/src/gitingest/ignore_patterns.py b/src/gitingest/ignore_patterns.py index 5741ab15..7f610715 100644 --- a/src/gitingest/ignore_patterns.py +++ b/src/gitingest/ignore_patterns.py @@ -155,4 +155,6 @@ "*.tfstate*", ## Dependencies in various languages "vendor/", + ## gitingestignore file + ".gitingestignore", } diff --git a/src/gitingest/query_parser.py b/src/gitingest/query_parser.py index be8602f2..3d06289e 100644 --- a/src/gitingest/query_parser.py +++ b/src/gitingest/query_parser.py @@ -48,6 +48,30 @@ class ParsedQuery: # pylint: disable=too-many-instance-attributes pattern_type: str | None = None +def parse_ignore_file(ignore_file_path: Path) -> set[str]: + """ + Parse the .gitingestignore file and return a set of patterns to ignore. + + Parameters + ---------- + ignore_file_path : Path + Path to the .gitingestignore file + + Returns + ------- + set[str] + Set of patterns to ignore + """ + if not ignore_file_path.exists(): + return set() + + with open(ignore_file_path, encoding="utf-8") as f: + # Read lines, strip whitespace, and filter out empty lines and comments + patterns = {line.strip() for line in f if line.strip() and not line.startswith("#")} + + return patterns + + async def parse_query( source: str, max_file_size: int, @@ -81,6 +105,12 @@ async def parse_query( A dataclass object containing the parsed details of the repository or file path. """ + # Convert string patterns to set if necessary + if isinstance(ignore_patterns, str): + ignore_patterns = {ignore_patterns} if ignore_patterns else None + if isinstance(include_patterns, str): + include_patterns = {include_patterns} if include_patterns else None + # Determine the parsing method based on the source type if from_web or urlparse(source).scheme in ("https", "http") or any(h in source for h in KNOWN_GIT_HOSTS): # We either have a full URL or a domain-less slug @@ -89,6 +119,14 @@ async def parse_query( # Local path scenario parsed_query = _parse_path(source) + # Look for .gitingestignore file in the local path + ignore_file_path = Path(parsed_query.local_path) / ".gitingestignore" + additional_ignore_patterns = parse_ignore_file(ignore_file_path) + if ignore_patterns: + ignore_patterns.update(additional_ignore_patterns) + else: + ignore_patterns = additional_ignore_patterns + # Combine default ignore patterns + custom patterns ignore_patterns_set = DEFAULT_IGNORE_PATTERNS.copy() if ignore_patterns: @@ -283,17 +321,18 @@ def _normalize_pattern(pattern: str) -> str: return pattern -def _parse_patterns(pattern: set[str] | str) -> set[str]: +def _parse_patterns(patterns: tuple[str, ...] | set[str] | str) -> set[str]: """ Parse and validate file/directory patterns for inclusion or exclusion. - Takes either a single pattern string or set of pattern strings and processes them into a normalized list. - Patterns are split on commas and spaces, validated for allowed characters, and normalized. + Takes either a single pattern string, a tuple of pattern strings, or a set of pattern strings + and processes them into a normalized list. Patterns are split on commas and spaces, validated + for allowed characters, and normalized. Parameters ---------- - pattern : set[str] | str - Pattern(s) to parse - either a single string or set of strings + patterns : tuple[str, ...] | set[str] | str + Pattern(s) to parse - either a single string, a tuple of strings, or a set of strings Returns ------- @@ -307,7 +346,11 @@ def _parse_patterns(pattern: set[str] | str) -> set[str]: dash (-), underscore (_), dot (.), forward slash (/), plus (+), and asterisk (*) are allowed. """ - patterns = pattern if isinstance(pattern, set) else {pattern} + # Convert patterns to a set if it's not already a set + if isinstance(patterns, tuple): + patterns = set(patterns) + elif isinstance(patterns, str): + patterns = {patterns} parsed_patterns: set[str] = set() for p in patterns: diff --git a/src/gitingest/repository_clone.py b/src/gitingest/repository_clone.py index 1fa38641..fdcdf3e8 100644 --- a/src/gitingest/repository_clone.py +++ b/src/gitingest/repository_clone.py @@ -1,7 +1,10 @@ -""" This module contains functions for cloning a Git repository to a local path. """ +""" +Module for cloning repositories in the gitingest package. +""" import asyncio import os +import shutil from dataclasses import dataclass from pathlib import Path @@ -89,6 +92,11 @@ async def clone_repo(config: CloneConfig) -> tuple[bytes, bytes]: if not await _check_repo_exists(url): raise ValueError("Repository not found, make sure it is public") + # Remove the directory if it exists and is not empty + local_path_obj = Path(local_path) + if local_path_obj.exists() and any(local_path_obj.iterdir()): + shutil.rmtree(local_path_obj) + if commit: # Scenario 1: Clone and checkout a specific commit # Clone the repository without depth to ensure full history for checkout @@ -100,7 +108,6 @@ async def clone_repo(config: CloneConfig) -> tuple[bytes, bytes]: return await _run_git_command(*checkout_cmd) if branch and branch.lower() not in ("main", "master"): - # Scenario 2: Clone a specific branch with shallow depth clone_cmd = ["git", "clone", "--depth=1", "--single-branch", "--branch", branch, url, local_path] return await _run_git_command(*clone_cmd) diff --git a/src/gitingest/repository_ingest.py b/src/gitingest/repository_ingest.py index 57be89da..b9131ac7 100644 --- a/src/gitingest/repository_ingest.py +++ b/src/gitingest/repository_ingest.py @@ -1,6 +1,7 @@ -""" Main entry point for ingesting a source and processing its contents. """ +""" +Module for ingesting repositories in the gitingest package. +""" -import asyncio import inspect import shutil @@ -64,23 +65,21 @@ async def ingest( ) if parsed_query.url: - selected_branch = branch if branch else parsed_query.branch # prioritize branch argument - parsed_query.branch = selected_branch + # Override branch if specified + if branch is not None: + parsed_query.branch = branch # Extract relevant fields for CloneConfig clone_config = CloneConfig( url=parsed_query.url, local_path=str(parsed_query.local_path), commit=parsed_query.commit, - branch=selected_branch, + branch=parsed_query.branch, ) clone_result = clone_repo(clone_config) if inspect.iscoroutine(clone_result): - if asyncio.get_event_loop().is_running(): - await clone_result - else: - asyncio.run(clone_result) + await clone_result else: raise TypeError("clone_repo did not return a coroutine as expected.") @@ -94,5 +93,4 @@ async def ingest( finally: # Clean up the temporary directory if it was created if parsed_query.url: - # Clean up the temporary directory shutil.rmtree(TMP_BASE_PATH, ignore_errors=True) diff --git a/src/server/query_processor.py b/src/server/query_processor.py index 69fcfc58..27ea4bf7 100644 --- a/src/server/query_processor.py +++ b/src/server/query_processor.py @@ -1,5 +1,6 @@ """ Process a query by parsing input, cloning a repository, and generating a summary. """ +import os from functools import partial from fastapi import Request @@ -46,43 +47,23 @@ async def process_query( _TemplateResponse Rendered template response containing the processed results or an error message. - Raises - ------ - ValueError - If an invalid pattern type is provided. """ - if pattern_type == "include": - include_patterns = pattern - exclude_patterns = None - elif pattern_type == "exclude": - exclude_patterns = pattern - include_patterns = None - else: - raise ValueError(f"Invalid pattern type: {pattern_type}") - + include_patterns, exclude_patterns = validate_pattern_type(pattern_type, pattern) template = "index.jinja" if is_index else "git.jinja" template_response = partial(templates.TemplateResponse, name=template) max_file_size = log_slider_to_size(slider_position) - context = { - "request": request, - "repo_url": input_text, - "examples": EXAMPLE_REPOS if is_index else [], - "default_file_size": slider_position, - "pattern_type": pattern_type, - "pattern": pattern, - } + context = create_context(request, input_text, slider_position, pattern_type, pattern, is_index) try: - parsed_query: ParsedQuery = await parse_query( + parsed_query = await parse_query( source=input_text, max_file_size=max_file_size, from_web=True, include_patterns=include_patterns, ignore_patterns=exclude_patterns, ) - if not parsed_query.url: - raise ValueError("The 'url' parameter is required.") + validate_parsed_query(parsed_query) clone_config = CloneConfig( url=parsed_query.url, @@ -91,25 +72,18 @@ async def process_query( branch=parsed_query.branch, ) await clone_repo(clone_config) + + update_ignore_patterns(parsed_query, clone_config.local_path) + summary, tree, content = run_ingest_query(parsed_query) - with open(f"{clone_config.local_path}.txt", "w", encoding="utf-8") as f: - f.write(tree + "\n" + content) + save_ingest_result(clone_config.local_path, tree, content) + content = filter_ignored_files(parsed_query, content) + except Exception as e: - # hack to print error message when query is not defined - if "query" in locals() and parsed_query is not None and isinstance(parsed_query, dict): - _print_error(parsed_query["url"], e, max_file_size, pattern_type, pattern) - else: - print(f"{Colors.BROWN}WARN{Colors.END}: {Colors.RED}<- {Colors.END}", end="") - print(f"{Colors.RED}{e}{Colors.END}") - - context["error_message"] = f"Error: {e}" + handle_query_error(e, parsed_query, max_file_size, pattern_type, pattern, context) return template_response(context=context) - if len(content) > MAX_DISPLAY_SIZE: - content = ( - f"(Files content cropped to {int(MAX_DISPLAY_SIZE / 1_000)}k characters, " - "download full ingest to see more)\n" + content[:MAX_DISPLAY_SIZE] - ) + content = truncate_content(content) _print_success( url=parsed_query.url, @@ -132,10 +106,207 @@ async def process_query( return template_response(context=context) +def validate_pattern_type(pattern_type: str, pattern: str): + """ + Ensure valid pattern type and return the corresponding include/exclude patterns. + + Parameters + ---------- + pattern_type : str + Specifies the type of pattern, either "include" or "exclude". + pattern : str + The pattern string to be included or excluded. + + Returns + ------- + tuple + A tuple containing either the include or exclude pattern. + + Raises + ------ + ValueError + If an invalid pattern type is provided. + """ + if pattern_type == "include": + return pattern, None + if pattern_type == "exclude": + return None, pattern + raise ValueError(f"Invalid pattern type: {pattern_type}") + + +def create_context( + request: Request, input_text: str, slider_position: int, pattern_type: str, pattern: str, is_index: bool +) -> dict: + """ + Prepare the context dictionary for rendering templates. + + Parameters + ---------- + request : Request + The HTTP request object. + input_text : str + The user-provided input text (Git repository URL or slug). + slider_position : int + The position of the slider, representing the maximum file size in the query. + pattern_type : str + Type of pattern to use, either "include" or "exclude". + pattern : str + The pattern string to include or exclude. + is_index : bool + Boolean flag indicating if the request is for the index page. + + Returns + ------- + dict + A dictionary containing template context data. + """ + return { + "request": request, + "repo_url": input_text, + "examples": EXAMPLE_REPOS if is_index else [], + "default_file_size": slider_position, + "pattern_type": pattern_type, + "pattern": pattern, + } + + +def validate_parsed_query(parsed_query: ParsedQuery): + """ + Check if the parsed query contains a valid URL. + + Parameters + ---------- + parsed_query : ParsedQuery + The parsed query object containing repository information. + + Raises + ------ + ValueError + If the URL parameter is missing in the parsed query. + """ + if not parsed_query.url: + raise ValueError("The 'url' parameter is required.") + + +def update_ignore_patterns(parsed_query: ParsedQuery, local_path: str): + """ + Load ignore patterns from `.gitingestignore` file if present. + + Parameters + ---------- + parsed_query : ParsedQuery + The parsed query object containing repository details. + local_path : str + The local path where the repository is cloned. + """ + ignore_file_path = os.path.join(local_path, ".gitingestignore") + if os.path.exists(ignore_file_path): + with open(ignore_file_path, encoding="utf-8") as ignore_file: + additional_ignore_patterns = [ + line.strip() for line in ignore_file if line.strip() and not line.startswith("#") + ] + + if additional_ignore_patterns: + parsed_query.ignore_patterns = parsed_query.ignore_patterns or set() + parsed_query.ignore_patterns.update(additional_ignore_patterns) + + +def save_ingest_result(local_path: str, tree: str, content: str): + """ + Save the repository tree and file content to a text file. + + Parameters + ---------- + local_path : str + The local path where the repository is cloned. + tree : str + The repository tree structure. + content : str + The ingested file content. + """ + with open(f"{local_path}.txt", "w", encoding="utf-8") as f: + f.write(tree + "\n" + content) + + +def filter_ignored_files(parsed_query: ParsedQuery, content: str) -> str: + """ + Remove ignored file patterns from content. + + Parameters + ---------- + parsed_query : ParsedQuery + The parsed query object containing ignore patterns. + content : str + The content to be filtered. + + Returns + ------- + str + The filtered content without ignored patterns. + """ + if parsed_query.ignore_patterns: + content = "\n".join( + line + for line in content.splitlines() + if not any(ignored in line for ignored in parsed_query.ignore_patterns) + ) + return content + + +def handle_query_error( + e: Exception, parsed_query: ParsedQuery, max_file_size: int, pattern_type: str, pattern: str, context: dict +): + """ + Handle exceptions during query processing and log errors. + + Parameters + ---------- + e : Exception + The exception raised during processing. + parsed_query : ParsedQuery + The parsed query object. + max_file_size : int + The maximum file size allowed for the query, in bytes. + pattern_type : str + Specifies the type of pattern used. + pattern : str + The actual pattern string used. + context : dict + The template context dictionary. + """ + if "query" in locals() and parsed_query is not None and isinstance(parsed_query, dict): + _print_error(parsed_query["url"], e, max_file_size, pattern_type, pattern) + else: + print(f"{Colors.BROWN}WARN{Colors.END}: {Colors.RED}<- {Colors.END}{Colors.RED}{e}{Colors.END}") + + context["error_message"] = f"Error: {e}" + + +def truncate_content(content: str) -> str: + """ + Truncate content if it exceeds the maximum display size. + + Parameters + ---------- + content : str + The content to be truncated. + + Returns + ------- + str + The truncated content, if applicable. + """ + if len(content) > MAX_DISPLAY_SIZE: + content = ( + f"(Files content cropped to {int(MAX_DISPLAY_SIZE / 1_000)}k characters, " + "download full ingest to see more)\n" + content[:MAX_DISPLAY_SIZE] + ) + return content + + def _print_query(url: str, max_file_size: int, pattern_type: str, pattern: str) -> None: """ - Print a formatted summary of the query details, including the URL, file size, - and pattern information, for easier debugging or logging. + Print a formatted summary of the query details. Parameters ---------- @@ -151,16 +322,16 @@ def _print_query(url: str, max_file_size: int, pattern_type: str, pattern: str) print(f"{Colors.WHITE}{url:<20}{Colors.END}", end="") if int(max_file_size / 1024) != 50: print(f" | {Colors.YELLOW}Size: {int(max_file_size/1024)}kb{Colors.END}", end="") - if pattern_type == "include" and pattern != "": + if pattern_type == "include" and pattern: print(f" | {Colors.YELLOW}Include {pattern}{Colors.END}", end="") - elif pattern_type == "exclude" and pattern != "": + elif pattern_type == "exclude" and pattern: print(f" | {Colors.YELLOW}Exclude {pattern}{Colors.END}", end="") + print() def _print_error(url: str, e: Exception, max_file_size: int, pattern_type: str, pattern: str) -> None: """ - Print a formatted error message including the URL, file size, pattern details, and the exception encountered, - for debugging or logging purposes. + Print a formatted error message including details of the exception. Parameters ---------- @@ -182,8 +353,7 @@ def _print_error(url: str, e: Exception, max_file_size: int, pattern_type: str, def _print_success(url: str, max_file_size: int, pattern_type: str, pattern: str, summary: str) -> None: """ - Print a formatted success message, including the URL, file size, pattern details, and a summary with estimated - tokens, for debugging or logging purposes. + Print a formatted success message, including estimated tokens. Parameters ----------