From 38d1bd6b57e531194fdaa258e708773f7f4f0aa4 Mon Sep 17 00:00:00 2001 From: AbhiRam162105 Date: Wed, 22 Jan 2025 01:49:21 +0530 Subject: [PATCH 1/7] feat(cli): Enhance file exclusion capabilities with multiple patterns and .gitingestignore support - Added the ability to specify multiple file exclusion patterns using the `-e` option. - Introduced support for a `.gitingestignore` file to define files and directories to be excluded. - Improved command line pattern parsing to handle space-separated patterns. - Combined exclusion patterns from both command line arguments and the `.gitingestignore` file for comprehensive exclusion. - Removed the unused import 'os' to satisfy pylint checks. - Corrected docstring issues to comply with darglint requirements. - Updated documentation with clear examples illustrating the new features. These enhancements provide users with a more flexible and user-friendly way to exclude files and directories, either through command line options or by using a `.gitingestignore` file. --- src/gitingest/cli.py | 119 ++++++++++++++++++++++++++++++++++--------- 1 file changed, 96 insertions(+), 23 deletions(-) diff --git a/src/gitingest/cli.py b/src/gitingest/cli.py index a21a4533..9eb6d67d 100644 --- a/src/gitingest/cli.py +++ b/src/gitingest/cli.py @@ -1,8 +1,8 @@ -""" Command-line interface for the Gitingest package. """ +"""Command-line interface for the Gitingest package.""" # pylint: disable=no-value-for-parameter - import asyncio +from pathlib import Path import click @@ -10,68 +10,120 @@ from gitingest.repository_ingest import ingest +def parse_ignore_file(ignore_file_path: Path) -> set[str]: + """ + Parse the .gitingestignore file and return a set of patterns to ignore. + + Parameters + ---------- + ignore_file_path : Path + Path to the .gitingestignore file + + Returns + ------- + set[str] + Set of patterns to ignore + """ + if not ignore_file_path.exists(): + return set() + + with open(ignore_file_path, encoding="utf-8") as f: + # Read lines, strip whitespace, and filter out empty lines and comments + patterns = {line.strip() for line in f if line.strip() and not line.startswith("#")} + + return patterns + + +def parse_patterns(patterns: tuple[str, ...]) -> set[str]: + """ + Parse patterns from command line arguments. + Handles both space-separated patterns in a single string + and multiple -e/-i arguments. + + Parameters + ---------- + patterns : tuple[str, ...] + Tuple of patterns from command line + + Returns + ------- + set[str] + Set of parsed patterns + """ + result = set() + for pattern_str in patterns: + # Split on spaces and add each pattern + result.update(p.strip() for p in pattern_str.split() if p.strip()) + return result + + @click.command() @click.argument("source", type=str, default=".") @click.option("--output", "-o", default=None, help="Output file path (default: .txt in current directory)") @click.option("--max-size", "-s", default=MAX_FILE_SIZE, help="Maximum file size to process in bytes") -@click.option("--exclude-pattern", "-e", multiple=True, help="Patterns to exclude") -@click.option("--include-pattern", "-i", multiple=True, help="Patterns to include") +@click.option("--exclude-pattern", "-e", multiple=True, help="Patterns to exclude (space-separated patterns allowed)") +@click.option("--include-pattern", "-i", multiple=True, help="Patterns to include (space-separated patterns allowed)") +@click.option("--ignore-file", default=".gitingestignore", help="Path to ignore file (default: .gitingestignore)") def main( source: str, output: str | None, max_size: int, exclude_pattern: tuple[str, ...], include_pattern: tuple[str, ...], + ignore_file: str, ): """ - Main entry point for the CLI. This function is called when the CLI is run as a script. - - It calls the async main function to run the command. + Main entry point for the CLI. Parameters ---------- source : str The source directory or repository to analyze. output : str | None - The path where the output file will be written. If not specified, the output will be written - to a file named `.txt` in the current directory. + The path where the output file will be written. If not specified, the output + will be written to a file named `.txt` in the current directory. max_size : int The maximum file size to process, in bytes. Files larger than this size will be ignored. exclude_pattern : tuple[str, ...] A tuple of patterns to exclude during the analysis. Files matching these patterns will be ignored. include_pattern : tuple[str, ...] A tuple of patterns to include during the analysis. Only files matching these patterns will be processed. + ignore_file : str + Path to the ignore file containing additional patterns to exclude. """ - # Main entry point for the CLI. This function is called when the CLI is run as a script. - asyncio.run(_async_main(source, output, max_size, exclude_pattern, include_pattern)) + asyncio.run(async_main(source, output, max_size, exclude_pattern, include_pattern, ignore_file)) -async def _async_main( +async def async_main( source: str, output: str | None, max_size: int, exclude_pattern: tuple[str, ...], include_pattern: tuple[str, ...], + ignore_file: str, ) -> None: """ Analyze a directory or repository and create a text dump of its contents. - This command analyzes the contents of a specified source directory or repository, applies custom include and - exclude patterns, and generates a text summary of the analysis which is then written to an output file. + This command analyzes the contents of a specified source directory or repository, + applies custom include and exclude patterns, and generates a text summary of the + analysis which is then written to an output file. Parameters ---------- source : str The source directory or repository to analyze. output : str | None - The path where the output file will be written. If not specified, the output will be written - to a file named `.txt` in the current directory. + The path where the output file will be written. If not specified, the output + will be written to a file named `.txt` in the current directory. max_size : int The maximum file size to process, in bytes. Files larger than this size will be ignored. exclude_pattern : tuple[str, ...] A tuple of patterns to exclude during the analysis. Files matching these patterns will be ignored. include_pattern : tuple[str, ...] A tuple of patterns to include during the analysis. Only files matching these patterns will be processed. + ignore_file : str + Path to the ignore file containing additional patterns to exclude. Raises ------ @@ -79,21 +131,42 @@ async def _async_main( If there is an error during the execution of the command, this exception is raised to abort the process. """ try: - # Combine default and custom ignore patterns - exclude_patterns = set(exclude_pattern) - include_patterns = set(include_pattern) + # Get repository name from source path + repo_name = Path(source).name or "repository" + # Set default output filename if not provided if not output: - output = "digest.txt" - summary, _, _ = await ingest(source, max_size, include_patterns, exclude_patterns, output=output) + output = f"{repo_name}.txt" + # Parse command line patterns + exclude_patterns = parse_patterns(exclude_pattern) + include_patterns = parse_patterns(include_pattern) + + # Read and add patterns from ignore file + ignore_file_path = Path(source) / ignore_file + ignore_patterns = parse_ignore_file(ignore_file_path) + exclude_patterns.update(ignore_patterns) + + # Perform the ingest operation + summary, *_ = await ingest(source, max_size, include_patterns, exclude_patterns, output=output) + + # Display results click.echo(f"Analysis complete! Output written to: {output}") click.echo("\nSummary:") click.echo(summary) - except Exception as e: - click.echo(f"Error: {e}", err=True) + except FileNotFoundError as e: + click.echo(f"Error: Source directory not found - {e}", err=True) + raise click.Abort() + except PermissionError as e: + click.echo(f"Error: Permission denied - {e}", err=True) raise click.Abort() + except Exception as e: + click.echo(f"Warning: An error occurred - {e}", err=True) + # For non-critical errors, we might want to continue rather than abort + if isinstance(e, (OSError, IOError)): + raise click.Abort() + return if __name__ == "__main__": From ee37d6468be4abab04c6e1ff3109df2c1f09bfd5 Mon Sep 17 00:00:00 2001 From: AbhiRam162105 Date: Sat, 25 Jan 2025 13:52:26 +0530 Subject: [PATCH 2/7] refactor: consolidate pattern parsing and ignore file handling in CLI and query parser --- src/gitingest/cli.py | 56 +++------------------------ src/gitingest/ignore_patterns.py | 2 + src/gitingest/query_parser.py | 61 ++++++++++++++++++++++++++---- src/gitingest/repository_clone.py | 12 +++++- src/gitingest/repository_ingest.py | 8 ++-- 5 files changed, 75 insertions(+), 64 deletions(-) diff --git a/src/gitingest/cli.py b/src/gitingest/cli.py index b92f934f..92934fc7 100644 --- a/src/gitingest/cli.py +++ b/src/gitingest/cli.py @@ -6,57 +6,11 @@ import click -from gitingest.config import MAX_FILE_SIZE, OUTPUT_FILE_PATH +from gitingest.config import MAX_FILE_SIZE +from gitingest.query_parser import _parse_patterns, parse_ignore_file from gitingest.repository_ingest import ingest -def parse_ignore_file(ignore_file_path: Path) -> set[str]: - """ - Parse the .gitingestignore file and return a set of patterns to ignore. - - Parameters - ---------- - ignore_file_path : Path - Path to the .gitingestignore file - - Returns - ------- - set[str] - Set of patterns to ignore - """ - if not ignore_file_path.exists(): - return set() - - with open(ignore_file_path, encoding="utf-8") as f: - # Read lines, strip whitespace, and filter out empty lines and comments - patterns = {line.strip() for line in f if line.strip() and not line.startswith("#")} - - return patterns - - -def parse_patterns(patterns: tuple[str, ...]) -> set[str]: - """ - Parse patterns from command line arguments. - Handles both space-separated patterns in a single string - and multiple -e/-i arguments. - - Parameters - ---------- - patterns : tuple[str, ...] - Tuple of patterns from command line - - Returns - ------- - set[str] - Set of parsed patterns - """ - result = set() - for pattern_str in patterns: - # Split on spaces and add each pattern - result.update(p.strip() for p in pattern_str.split() if p.strip()) - return result - - @click.command() @click.argument("source", type=str, default=".") @click.option("--output", "-o", default=None, help="Output file path (default: .txt in current directory)") @@ -139,8 +93,8 @@ async def async_main( output = f"{repo_name}.txt" # Parse command line patterns - exclude_patterns = parse_patterns(exclude_pattern) - include_patterns = parse_patterns(include_pattern) + exclude_patterns = _parse_patterns(exclude_pattern) + include_patterns = _parse_patterns(include_pattern) # Read and add patterns from ignore file ignore_file_path = Path(source) / ignore_file @@ -170,4 +124,4 @@ async def async_main( if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/src/gitingest/ignore_patterns.py b/src/gitingest/ignore_patterns.py index 90ef2104..14f78d50 100644 --- a/src/gitingest/ignore_patterns.py +++ b/src/gitingest/ignore_patterns.py @@ -153,4 +153,6 @@ "*.tfstate*", ## Dependencies in various languages "vendor/", + ## gitingestignore file + ".gitingestignore", } diff --git a/src/gitingest/query_parser.py b/src/gitingest/query_parser.py index be8602f2..2751e3c9 100644 --- a/src/gitingest/query_parser.py +++ b/src/gitingest/query_parser.py @@ -12,7 +12,7 @@ from gitingest.config import MAX_FILE_SIZE, TMP_BASE_PATH from gitingest.exceptions import InvalidPatternError from gitingest.ignore_patterns import DEFAULT_IGNORE_PATTERNS -from gitingest.repository_clone import _check_repo_exists, fetch_remote_branch_list +from gitingest.repository_clone import CloneConfig, _check_repo_exists, clone_repo, fetch_remote_branch_list HEX_DIGITS: set[str] = set(string.hexdigits) @@ -48,6 +48,30 @@ class ParsedQuery: # pylint: disable=too-many-instance-attributes pattern_type: str | None = None +def parse_ignore_file(ignore_file_path: Path) -> set[str]: + """ + Parse the .gitingestignore file and return a set of patterns to ignore. + + Parameters + ---------- + ignore_file_path : Path + Path to the .gitingestignore file + + Returns + ------- + set[str] + Set of patterns to ignore + """ + if not ignore_file_path.exists(): + return set() + + with open(ignore_file_path, encoding="utf-8") as f: + # Read lines, strip whitespace, and filter out empty lines and comments + patterns = {line.strip() for line in f if line.strip() and not line.startswith("#")} + + return patterns + + async def parse_query( source: str, max_file_size: int, @@ -89,6 +113,24 @@ async def parse_query( # Local path scenario parsed_query = _parse_path(source) + # Clone the repository if it's a URL + if parsed_query.url: + clone_config = CloneConfig( + url=parsed_query.url, + local_path=str(parsed_query.local_path), + commit=parsed_query.commit, + branch=parsed_query.branch, + ) + await clone_repo(clone_config) + + # Look for .gitingestignore file in the cloned repository + ignore_file_path = Path(parsed_query.local_path) / ".gitingestignore" + additional_ignore_patterns = parse_ignore_file(ignore_file_path) + if ignore_patterns: + ignore_patterns.update(additional_ignore_patterns) + else: + ignore_patterns = additional_ignore_patterns + # Combine default ignore patterns + custom patterns ignore_patterns_set = DEFAULT_IGNORE_PATTERNS.copy() if ignore_patterns: @@ -283,17 +325,18 @@ def _normalize_pattern(pattern: str) -> str: return pattern -def _parse_patterns(pattern: set[str] | str) -> set[str]: +def _parse_patterns(patterns: tuple[str, ...] | set[str] | str) -> set[str]: """ Parse and validate file/directory patterns for inclusion or exclusion. - Takes either a single pattern string or set of pattern strings and processes them into a normalized list. - Patterns are split on commas and spaces, validated for allowed characters, and normalized. + Takes either a single pattern string, a tuple of pattern strings, or a set of pattern strings + and processes them into a normalized list. Patterns are split on commas and spaces, validated + for allowed characters, and normalized. Parameters ---------- - pattern : set[str] | str - Pattern(s) to parse - either a single string or set of strings + patterns : tuple[str, ...] | set[str] | str + Pattern(s) to parse - either a single string, a tuple of strings, or a set of strings Returns ------- @@ -307,7 +350,11 @@ def _parse_patterns(pattern: set[str] | str) -> set[str]: dash (-), underscore (_), dot (.), forward slash (/), plus (+), and asterisk (*) are allowed. """ - patterns = pattern if isinstance(pattern, set) else {pattern} + # Convert patterns to a set if it's not already a set + if isinstance(patterns, tuple): + patterns = set(patterns) + elif isinstance(patterns, str): + patterns = {patterns} parsed_patterns: set[str] = set() for p in patterns: diff --git a/src/gitingest/repository_clone.py b/src/gitingest/repository_clone.py index 4adfcd9f..f5aa4f32 100644 --- a/src/gitingest/repository_clone.py +++ b/src/gitingest/repository_clone.py @@ -1,7 +1,11 @@ -""" This module contains functions for cloning a Git repository to a local path. """ +""" +Module for cloning repositories in the gitingest package. +""" import asyncio +import shutil from dataclasses import dataclass +from pathlib import Path from gitingest.utils import async_timeout @@ -78,6 +82,11 @@ async def clone_repo(config: CloneConfig) -> tuple[bytes, bytes]: if not await _check_repo_exists(url): raise ValueError("Repository not found, make sure it is public") + # Remove the directory if it exists and is not empty + local_path_obj = Path(local_path) + if local_path_obj.exists() and any(local_path_obj.iterdir()): + shutil.rmtree(local_path_obj) + if commit: # Scenario 1: Clone and checkout a specific commit # Clone the repository without depth to ensure full history for checkout @@ -89,7 +98,6 @@ async def clone_repo(config: CloneConfig) -> tuple[bytes, bytes]: return await _run_git_command(*checkout_cmd) if branch and branch.lower() not in ("main", "master"): - # Scenario 2: Clone a specific branch with shallow depth clone_cmd = ["git", "clone", "--depth=1", "--single-branch", "--branch", branch, url, local_path] return await _run_git_command(*clone_cmd) diff --git a/src/gitingest/repository_ingest.py b/src/gitingest/repository_ingest.py index f92c1c2d..d261ecaa 100644 --- a/src/gitingest/repository_ingest.py +++ b/src/gitingest/repository_ingest.py @@ -1,6 +1,7 @@ -""" Main entry point for ingesting a source and processing its contents. """ +""" +Module for ingesting repositories in the gitingest package. +""" -import asyncio import inspect import shutil @@ -71,7 +72,7 @@ async def ingest( clone_result = clone_repo(clone_config) if inspect.iscoroutine(clone_result): - asyncio.run(clone_result) + await clone_result else: raise TypeError("clone_repo did not return a coroutine as expected.") @@ -85,5 +86,4 @@ async def ingest( finally: # Clean up the temporary directory if it was created if parsed_query.url: - # Clean up the temporary directory shutil.rmtree(TMP_BASE_PATH, ignore_errors=True) From 732486cfa9950880eba24e59cc486846c4a98e35 Mon Sep 17 00:00:00 2001 From: AbhiRam162105 Date: Sat, 25 Jan 2025 14:22:22 +0530 Subject: [PATCH 3/7] feat: add branch option to CLI for cloning and ingesting specific branches --- src/gitingest/cli.py | 13 ++++++++++--- src/gitingest/repository_ingest.py | 7 +++++++ 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/src/gitingest/cli.py b/src/gitingest/cli.py index 92934fc7..c63eefb2 100644 --- a/src/gitingest/cli.py +++ b/src/gitingest/cli.py @@ -18,6 +18,7 @@ @click.option("--exclude-pattern", "-e", multiple=True, help="Patterns to exclude (space-separated patterns allowed)") @click.option("--include-pattern", "-i", multiple=True, help="Patterns to include (space-separated patterns allowed)") @click.option("--ignore-file", default=".gitingestignore", help="Path to ignore file (default: .gitingestignore)") +@click.option("--branch", "-b", default=None, help="Branch to clone and ingest") def main( source: str, output: str | None, @@ -25,6 +26,7 @@ def main( exclude_pattern: tuple[str, ...], include_pattern: tuple[str, ...], ignore_file: str, + branch: str | None, ): """ Main entry point for the CLI. @@ -44,8 +46,10 @@ def main( A tuple of patterns to include during the analysis. Only files matching these patterns will be processed. ignore_file : str Path to the ignore file containing additional patterns to exclude. + branch : str | None + The branch to clone (optional). """ - asyncio.run(async_main(source, output, max_size, exclude_pattern, include_pattern, ignore_file)) + asyncio.run(async_main(source, output, max_size, exclude_pattern, include_pattern, ignore_file, branch)) async def async_main( @@ -55,6 +59,7 @@ async def async_main( exclude_pattern: tuple[str, ...], include_pattern: tuple[str, ...], ignore_file: str, + branch: str | None, ) -> None: """ Analyze a directory or repository and create a text dump of its contents. @@ -78,6 +83,8 @@ async def async_main( A tuple of patterns to include during the analysis. Only files matching these patterns will be processed. ignore_file : str Path to the ignore file containing additional patterns to exclude. + branch : str | None + The branch to clone (optional). Raises ------ @@ -101,8 +108,8 @@ async def async_main( ignore_patterns = parse_ignore_file(ignore_file_path) exclude_patterns.update(ignore_patterns) - # Perform the ingest operation - summary, *_ = await ingest(source, max_size, include_patterns, exclude_patterns, output=output) + # Perform the ingest operation with branch support + summary, *_ = await ingest(source, max_size, include_patterns, exclude_patterns, branch=branch, output=output) # Display results click.echo(f"Analysis complete! Output written to: {output}") diff --git a/src/gitingest/repository_ingest.py b/src/gitingest/repository_ingest.py index d261ecaa..b9131ac7 100644 --- a/src/gitingest/repository_ingest.py +++ b/src/gitingest/repository_ingest.py @@ -16,6 +16,7 @@ async def ingest( max_file_size: int = 10 * 1024 * 1024, # 10 MB include_patterns: set[str] | str | None = None, exclude_patterns: set[str] | str | None = None, + branch: str | None = None, output: str | None = None, ) -> tuple[str, str, str]: """ @@ -36,6 +37,8 @@ async def ingest( Pattern or set of patterns specifying which files to include. If `None`, all files are included. exclude_patterns : set[str] | str | None, optional Pattern or set of patterns specifying which files to exclude. If `None`, no files are excluded. + branch : str | None, optional + The branch to clone and ingest. If `None`, the default branch is used. output : str | None, optional File path where the summary and content should be written. If `None`, the results are not written to a file. @@ -62,6 +65,10 @@ async def ingest( ) if parsed_query.url: + # Override branch if specified + if branch is not None: + parsed_query.branch = branch + # Extract relevant fields for CloneConfig clone_config = CloneConfig( url=parsed_query.url, From 24184fcda8a8f5f19f0f845e32c7dcd516098579 Mon Sep 17 00:00:00 2001 From: AbhiRam162105 Date: Mon, 3 Feb 2025 22:50:19 +0530 Subject: [PATCH 4/7] refactor: streamline query processing by removing unnecessary cloning logic and enhancing ignore pattern handling --- src/gitingest/query_parser.py | 26 ++-- src/server/query_processor.py | 264 ++++++++++++++++++++++++++++------ 2 files changed, 225 insertions(+), 65 deletions(-) diff --git a/src/gitingest/query_parser.py b/src/gitingest/query_parser.py index 2751e3c9..167c96c3 100644 --- a/src/gitingest/query_parser.py +++ b/src/gitingest/query_parser.py @@ -12,7 +12,7 @@ from gitingest.config import MAX_FILE_SIZE, TMP_BASE_PATH from gitingest.exceptions import InvalidPatternError from gitingest.ignore_patterns import DEFAULT_IGNORE_PATTERNS -from gitingest.repository_clone import CloneConfig, _check_repo_exists, clone_repo, fetch_remote_branch_list +from gitingest.repository_clone import _check_repo_exists, fetch_remote_branch_list HEX_DIGITS: set[str] = set(string.hexdigits) @@ -113,23 +113,13 @@ async def parse_query( # Local path scenario parsed_query = _parse_path(source) - # Clone the repository if it's a URL - if parsed_query.url: - clone_config = CloneConfig( - url=parsed_query.url, - local_path=str(parsed_query.local_path), - commit=parsed_query.commit, - branch=parsed_query.branch, - ) - await clone_repo(clone_config) - - # Look for .gitingestignore file in the cloned repository - ignore_file_path = Path(parsed_query.local_path) / ".gitingestignore" - additional_ignore_patterns = parse_ignore_file(ignore_file_path) - if ignore_patterns: - ignore_patterns.update(additional_ignore_patterns) - else: - ignore_patterns = additional_ignore_patterns + # Look for .gitingestignore file in the local path + ignore_file_path = Path(parsed_query.local_path) / ".gitingestignore" + additional_ignore_patterns = parse_ignore_file(ignore_file_path) + if ignore_patterns: + ignore_patterns.update(additional_ignore_patterns) + else: + ignore_patterns = additional_ignore_patterns # Combine default ignore patterns + custom patterns ignore_patterns_set = DEFAULT_IGNORE_PATTERNS.copy() diff --git a/src/server/query_processor.py b/src/server/query_processor.py index 69fcfc58..27ea4bf7 100644 --- a/src/server/query_processor.py +++ b/src/server/query_processor.py @@ -1,5 +1,6 @@ """ Process a query by parsing input, cloning a repository, and generating a summary. """ +import os from functools import partial from fastapi import Request @@ -46,43 +47,23 @@ async def process_query( _TemplateResponse Rendered template response containing the processed results or an error message. - Raises - ------ - ValueError - If an invalid pattern type is provided. """ - if pattern_type == "include": - include_patterns = pattern - exclude_patterns = None - elif pattern_type == "exclude": - exclude_patterns = pattern - include_patterns = None - else: - raise ValueError(f"Invalid pattern type: {pattern_type}") - + include_patterns, exclude_patterns = validate_pattern_type(pattern_type, pattern) template = "index.jinja" if is_index else "git.jinja" template_response = partial(templates.TemplateResponse, name=template) max_file_size = log_slider_to_size(slider_position) - context = { - "request": request, - "repo_url": input_text, - "examples": EXAMPLE_REPOS if is_index else [], - "default_file_size": slider_position, - "pattern_type": pattern_type, - "pattern": pattern, - } + context = create_context(request, input_text, slider_position, pattern_type, pattern, is_index) try: - parsed_query: ParsedQuery = await parse_query( + parsed_query = await parse_query( source=input_text, max_file_size=max_file_size, from_web=True, include_patterns=include_patterns, ignore_patterns=exclude_patterns, ) - if not parsed_query.url: - raise ValueError("The 'url' parameter is required.") + validate_parsed_query(parsed_query) clone_config = CloneConfig( url=parsed_query.url, @@ -91,25 +72,18 @@ async def process_query( branch=parsed_query.branch, ) await clone_repo(clone_config) + + update_ignore_patterns(parsed_query, clone_config.local_path) + summary, tree, content = run_ingest_query(parsed_query) - with open(f"{clone_config.local_path}.txt", "w", encoding="utf-8") as f: - f.write(tree + "\n" + content) + save_ingest_result(clone_config.local_path, tree, content) + content = filter_ignored_files(parsed_query, content) + except Exception as e: - # hack to print error message when query is not defined - if "query" in locals() and parsed_query is not None and isinstance(parsed_query, dict): - _print_error(parsed_query["url"], e, max_file_size, pattern_type, pattern) - else: - print(f"{Colors.BROWN}WARN{Colors.END}: {Colors.RED}<- {Colors.END}", end="") - print(f"{Colors.RED}{e}{Colors.END}") - - context["error_message"] = f"Error: {e}" + handle_query_error(e, parsed_query, max_file_size, pattern_type, pattern, context) return template_response(context=context) - if len(content) > MAX_DISPLAY_SIZE: - content = ( - f"(Files content cropped to {int(MAX_DISPLAY_SIZE / 1_000)}k characters, " - "download full ingest to see more)\n" + content[:MAX_DISPLAY_SIZE] - ) + content = truncate_content(content) _print_success( url=parsed_query.url, @@ -132,10 +106,207 @@ async def process_query( return template_response(context=context) +def validate_pattern_type(pattern_type: str, pattern: str): + """ + Ensure valid pattern type and return the corresponding include/exclude patterns. + + Parameters + ---------- + pattern_type : str + Specifies the type of pattern, either "include" or "exclude". + pattern : str + The pattern string to be included or excluded. + + Returns + ------- + tuple + A tuple containing either the include or exclude pattern. + + Raises + ------ + ValueError + If an invalid pattern type is provided. + """ + if pattern_type == "include": + return pattern, None + if pattern_type == "exclude": + return None, pattern + raise ValueError(f"Invalid pattern type: {pattern_type}") + + +def create_context( + request: Request, input_text: str, slider_position: int, pattern_type: str, pattern: str, is_index: bool +) -> dict: + """ + Prepare the context dictionary for rendering templates. + + Parameters + ---------- + request : Request + The HTTP request object. + input_text : str + The user-provided input text (Git repository URL or slug). + slider_position : int + The position of the slider, representing the maximum file size in the query. + pattern_type : str + Type of pattern to use, either "include" or "exclude". + pattern : str + The pattern string to include or exclude. + is_index : bool + Boolean flag indicating if the request is for the index page. + + Returns + ------- + dict + A dictionary containing template context data. + """ + return { + "request": request, + "repo_url": input_text, + "examples": EXAMPLE_REPOS if is_index else [], + "default_file_size": slider_position, + "pattern_type": pattern_type, + "pattern": pattern, + } + + +def validate_parsed_query(parsed_query: ParsedQuery): + """ + Check if the parsed query contains a valid URL. + + Parameters + ---------- + parsed_query : ParsedQuery + The parsed query object containing repository information. + + Raises + ------ + ValueError + If the URL parameter is missing in the parsed query. + """ + if not parsed_query.url: + raise ValueError("The 'url' parameter is required.") + + +def update_ignore_patterns(parsed_query: ParsedQuery, local_path: str): + """ + Load ignore patterns from `.gitingestignore` file if present. + + Parameters + ---------- + parsed_query : ParsedQuery + The parsed query object containing repository details. + local_path : str + The local path where the repository is cloned. + """ + ignore_file_path = os.path.join(local_path, ".gitingestignore") + if os.path.exists(ignore_file_path): + with open(ignore_file_path, encoding="utf-8") as ignore_file: + additional_ignore_patterns = [ + line.strip() for line in ignore_file if line.strip() and not line.startswith("#") + ] + + if additional_ignore_patterns: + parsed_query.ignore_patterns = parsed_query.ignore_patterns or set() + parsed_query.ignore_patterns.update(additional_ignore_patterns) + + +def save_ingest_result(local_path: str, tree: str, content: str): + """ + Save the repository tree and file content to a text file. + + Parameters + ---------- + local_path : str + The local path where the repository is cloned. + tree : str + The repository tree structure. + content : str + The ingested file content. + """ + with open(f"{local_path}.txt", "w", encoding="utf-8") as f: + f.write(tree + "\n" + content) + + +def filter_ignored_files(parsed_query: ParsedQuery, content: str) -> str: + """ + Remove ignored file patterns from content. + + Parameters + ---------- + parsed_query : ParsedQuery + The parsed query object containing ignore patterns. + content : str + The content to be filtered. + + Returns + ------- + str + The filtered content without ignored patterns. + """ + if parsed_query.ignore_patterns: + content = "\n".join( + line + for line in content.splitlines() + if not any(ignored in line for ignored in parsed_query.ignore_patterns) + ) + return content + + +def handle_query_error( + e: Exception, parsed_query: ParsedQuery, max_file_size: int, pattern_type: str, pattern: str, context: dict +): + """ + Handle exceptions during query processing and log errors. + + Parameters + ---------- + e : Exception + The exception raised during processing. + parsed_query : ParsedQuery + The parsed query object. + max_file_size : int + The maximum file size allowed for the query, in bytes. + pattern_type : str + Specifies the type of pattern used. + pattern : str + The actual pattern string used. + context : dict + The template context dictionary. + """ + if "query" in locals() and parsed_query is not None and isinstance(parsed_query, dict): + _print_error(parsed_query["url"], e, max_file_size, pattern_type, pattern) + else: + print(f"{Colors.BROWN}WARN{Colors.END}: {Colors.RED}<- {Colors.END}{Colors.RED}{e}{Colors.END}") + + context["error_message"] = f"Error: {e}" + + +def truncate_content(content: str) -> str: + """ + Truncate content if it exceeds the maximum display size. + + Parameters + ---------- + content : str + The content to be truncated. + + Returns + ------- + str + The truncated content, if applicable. + """ + if len(content) > MAX_DISPLAY_SIZE: + content = ( + f"(Files content cropped to {int(MAX_DISPLAY_SIZE / 1_000)}k characters, " + "download full ingest to see more)\n" + content[:MAX_DISPLAY_SIZE] + ) + return content + + def _print_query(url: str, max_file_size: int, pattern_type: str, pattern: str) -> None: """ - Print a formatted summary of the query details, including the URL, file size, - and pattern information, for easier debugging or logging. + Print a formatted summary of the query details. Parameters ---------- @@ -151,16 +322,16 @@ def _print_query(url: str, max_file_size: int, pattern_type: str, pattern: str) print(f"{Colors.WHITE}{url:<20}{Colors.END}", end="") if int(max_file_size / 1024) != 50: print(f" | {Colors.YELLOW}Size: {int(max_file_size/1024)}kb{Colors.END}", end="") - if pattern_type == "include" and pattern != "": + if pattern_type == "include" and pattern: print(f" | {Colors.YELLOW}Include {pattern}{Colors.END}", end="") - elif pattern_type == "exclude" and pattern != "": + elif pattern_type == "exclude" and pattern: print(f" | {Colors.YELLOW}Exclude {pattern}{Colors.END}", end="") + print() def _print_error(url: str, e: Exception, max_file_size: int, pattern_type: str, pattern: str) -> None: """ - Print a formatted error message including the URL, file size, pattern details, and the exception encountered, - for debugging or logging purposes. + Print a formatted error message including details of the exception. Parameters ---------- @@ -182,8 +353,7 @@ def _print_error(url: str, e: Exception, max_file_size: int, pattern_type: str, def _print_success(url: str, max_file_size: int, pattern_type: str, pattern: str, summary: str) -> None: """ - Print a formatted success message, including the URL, file size, pattern details, and a summary with estimated - tokens, for debugging or logging purposes. + Print a formatted success message, including estimated tokens. Parameters ---------- From a5813a67ced673e698a31aeec4699c2e750464ce Mon Sep 17 00:00:00 2001 From: AbhiRam162105 Date: Tue, 4 Feb 2025 19:07:37 +0530 Subject: [PATCH 5/7] refactor: simplify ingest function call in CLI by consolidating parameters --- src/gitingest/cli.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/src/gitingest/cli.py b/src/gitingest/cli.py index 6b6d3daf..c63eefb2 100644 --- a/src/gitingest/cli.py +++ b/src/gitingest/cli.py @@ -109,14 +109,7 @@ async def async_main( exclude_patterns.update(ignore_patterns) # Perform the ingest operation with branch support - summary, *_ = await ingest( - source, - max_size, - include_patterns, - exclude_patterns, - branch=branch, - output=output - ) + summary, *_ = await ingest(source, max_size, include_patterns, exclude_patterns, branch=branch, output=output) # Display results click.echo(f"Analysis complete! Output written to: {output}") @@ -138,4 +131,4 @@ async def async_main( if __name__ == "__main__": - main() \ No newline at end of file + main() From 26787846fc58b7bd2a93518a23f5b5620e4e17f1 Mon Sep 17 00:00:00 2001 From: AbhiRam162105 Date: Tue, 4 Feb 2025 19:48:09 +0530 Subject: [PATCH 6/7] fix: update default output filename in CLI to 'digest.txt' and improve pattern handling in query parser --- src/gitingest/cli.py | 11 ++++------- src/gitingest/query_parser.py | 6 ++++++ 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/src/gitingest/cli.py b/src/gitingest/cli.py index c63eefb2..237957a4 100644 --- a/src/gitingest/cli.py +++ b/src/gitingest/cli.py @@ -13,7 +13,7 @@ @click.command() @click.argument("source", type=str, default=".") -@click.option("--output", "-o", default=None, help="Output file path (default: .txt in current directory)") +@click.option("--output", "-o", default=None, help="Output file path (default: digest.txt)") @click.option("--max-size", "-s", default=MAX_FILE_SIZE, help="Maximum file size to process in bytes") @click.option("--exclude-pattern", "-e", multiple=True, help="Patterns to exclude (space-separated patterns allowed)") @click.option("--include-pattern", "-i", multiple=True, help="Patterns to include (space-separated patterns allowed)") @@ -37,7 +37,7 @@ def main( The source directory or repository to analyze. output : str | None The path where the output file will be written. If not specified, the output - will be written to a file named `.txt` in the current directory. + will be written to a file named `digest.txt` in the current directory. max_size : int The maximum file size to process, in bytes. Files larger than this size will be ignored. exclude_pattern : tuple[str, ...] @@ -74,7 +74,7 @@ async def async_main( The source directory or repository to analyze. output : str | None The path where the output file will be written. If not specified, the output - will be written to a file named `.txt` in the current directory. + will be written to a file named `digest.txt` in the current directory. max_size : int The maximum file size to process, in bytes. Files larger than this size will be ignored. exclude_pattern : tuple[str, ...] @@ -92,12 +92,9 @@ async def async_main( If there is an error during the execution of the command, this exception is raised to abort the process. """ try: - # Get repository name from source path - repo_name = Path(source).name or "repository" - # Set default output filename if not provided if not output: - output = f"{repo_name}.txt" + output = "digest.txt" # Parse command line patterns exclude_patterns = _parse_patterns(exclude_pattern) diff --git a/src/gitingest/query_parser.py b/src/gitingest/query_parser.py index 167c96c3..3d06289e 100644 --- a/src/gitingest/query_parser.py +++ b/src/gitingest/query_parser.py @@ -105,6 +105,12 @@ async def parse_query( A dataclass object containing the parsed details of the repository or file path. """ + # Convert string patterns to set if necessary + if isinstance(ignore_patterns, str): + ignore_patterns = {ignore_patterns} if ignore_patterns else None + if isinstance(include_patterns, str): + include_patterns = {include_patterns} if include_patterns else None + # Determine the parsing method based on the source type if from_web or urlparse(source).scheme in ("https", "http") or any(h in source for h in KNOWN_GIT_HOSTS): # We either have a full URL or a domain-less slug From 0ea78025f9af270b6b0b29051c985eec2ba8f5d1 Mon Sep 17 00:00:00 2001 From: AbhiRam162105 Date: Tue, 4 Feb 2025 19:49:30 +0530 Subject: [PATCH 7/7] refactor: reorder import statements in repository_clone.py for better organization --- src/gitingest/repository_clone.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/gitingest/repository_clone.py b/src/gitingest/repository_clone.py index 2affc763..fdcdf3e8 100644 --- a/src/gitingest/repository_clone.py +++ b/src/gitingest/repository_clone.py @@ -3,9 +3,8 @@ """ import asyncio -import shutil import os - +import shutil from dataclasses import dataclass from pathlib import Path