Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 50 additions & 26 deletions src/gitingest/cli.py
Original file line number Diff line number Diff line change
@@ -1,82 +1,88 @@
""" Command-line interface for the Gitingest package. """
"""Command-line interface for the Gitingest package."""

# pylint: disable=no-value-for-parameter

import asyncio
from pathlib import Path

import click

from gitingest.config import MAX_FILE_SIZE, OUTPUT_FILE_PATH
from gitingest.config import MAX_FILE_SIZE
from gitingest.query_parser import _parse_patterns, parse_ignore_file
from gitingest.repository_ingest import ingest


@click.command()
@click.argument("source", type=str, default=".")
@click.option("--output", "-o", default=None, help="Output file path (default: <repo_name>.txt in current directory)")
@click.option("--output", "-o", default=None, help="Output file path (default: digest.txt)")
@click.option("--max-size", "-s", default=MAX_FILE_SIZE, help="Maximum file size to process in bytes")
@click.option("--exclude-pattern", "-e", multiple=True, help="Patterns to exclude")
@click.option("--include-pattern", "-i", multiple=True, help="Patterns to include")
@click.option("--exclude-pattern", "-e", multiple=True, help="Patterns to exclude (space-separated patterns allowed)")
@click.option("--include-pattern", "-i", multiple=True, help="Patterns to include (space-separated patterns allowed)")
@click.option("--ignore-file", default=".gitingestignore", help="Path to ignore file (default: .gitingestignore)")
@click.option("--branch", "-b", default=None, help="Branch to clone and ingest")
def main(
source: str,
output: str | None,
max_size: int,
exclude_pattern: tuple[str, ...],
include_pattern: tuple[str, ...],
ignore_file: str,
branch: str | None,
):
"""
Main entry point for the CLI. This function is called when the CLI is run as a script.

It calls the async main function to run the command.
Main entry point for the CLI.

Parameters
----------
source : str
The source directory or repository to analyze.
output : str | None
The path where the output file will be written. If not specified, the output will be written
to a file named `<repo_name>.txt` in the current directory.
The path where the output file will be written. If not specified, the output
will be written to a file named `digest.txt` in the current directory.
max_size : int
The maximum file size to process, in bytes. Files larger than this size will be ignored.
exclude_pattern : tuple[str, ...]
A tuple of patterns to exclude during the analysis. Files matching these patterns will be ignored.
include_pattern : tuple[str, ...]
A tuple of patterns to include during the analysis. Only files matching these patterns will be processed.
ignore_file : str
Path to the ignore file containing additional patterns to exclude.
branch : str | None
The branch to clone (optional).
"""
# Main entry point for the CLI. This function is called when the CLI is run as a script.
asyncio.run(_async_main(source, output, max_size, exclude_pattern, include_pattern, branch))
asyncio.run(async_main(source, output, max_size, exclude_pattern, include_pattern, ignore_file, branch))


async def _async_main(
async def async_main(
source: str,
output: str | None,
max_size: int,
exclude_pattern: tuple[str, ...],
include_pattern: tuple[str, ...],
ignore_file: str,
branch: str | None,
) -> None:
"""
Analyze a directory or repository and create a text dump of its contents.

This command analyzes the contents of a specified source directory or repository, applies custom include and
exclude patterns, and generates a text summary of the analysis which is then written to an output file.
This command analyzes the contents of a specified source directory or repository,
applies custom include and exclude patterns, and generates a text summary of the
analysis which is then written to an output file.

Parameters
----------
source : str
The source directory or repository to analyze.
output : str | None
The path where the output file will be written. If not specified, the output will be written
to a file named `<repo_name>.txt` in the current directory.
The path where the output file will be written. If not specified, the output
will be written to a file named `digest.txt` in the current directory.
max_size : int
The maximum file size to process, in bytes. Files larger than this size will be ignored.
exclude_pattern : tuple[str, ...]
A tuple of patterns to exclude during the analysis. Files matching these patterns will be ignored.
include_pattern : tuple[str, ...]
A tuple of patterns to include during the analysis. Only files matching these patterns will be processed.
ignore_file : str
Path to the ignore file containing additional patterns to exclude.
branch : str | None
The branch to clone (optional).

Expand All @@ -86,21 +92,39 @@ async def _async_main(
If there is an error during the execution of the command, this exception is raised to abort the process.
"""
try:
# Combine default and custom ignore patterns
exclude_patterns = set(exclude_pattern)
include_patterns = set(include_pattern)

# Set default output filename if not provided
if not output:
output = OUTPUT_FILE_PATH
summary, _, _ = await ingest(source, max_size, include_patterns, exclude_patterns, branch, output=output)
output = "digest.txt"

# Parse command line patterns
exclude_patterns = _parse_patterns(exclude_pattern)
include_patterns = _parse_patterns(include_pattern)

# Read and add patterns from ignore file
ignore_file_path = Path(source) / ignore_file
ignore_patterns = parse_ignore_file(ignore_file_path)
exclude_patterns.update(ignore_patterns)

# Perform the ingest operation with branch support
summary, *_ = await ingest(source, max_size, include_patterns, exclude_patterns, branch=branch, output=output)

# Display results
click.echo(f"Analysis complete! Output written to: {output}")
click.echo("\nSummary:")
click.echo(summary)

except Exception as e:
click.echo(f"Error: {e}", err=True)
except FileNotFoundError as e:
click.echo(f"Error: Source directory not found - {e}", err=True)
raise click.Abort()
except PermissionError as e:
click.echo(f"Error: Permission denied - {e}", err=True)
raise click.Abort()
except Exception as e:
click.echo(f"Warning: An error occurred - {e}", err=True)
# For non-critical errors, we might want to continue rather than abort
if isinstance(e, (OSError, IOError)):
raise click.Abort()
return


if __name__ == "__main__":
Expand Down
2 changes: 2 additions & 0 deletions src/gitingest/ignore_patterns.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,4 +155,6 @@
"*.tfstate*",
## Dependencies in various languages
"vendor/",
## gitingestignore file
".gitingestignore",
}
55 changes: 49 additions & 6 deletions src/gitingest/query_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,30 @@ class ParsedQuery: # pylint: disable=too-many-instance-attributes
pattern_type: str | None = None


def parse_ignore_file(ignore_file_path: Path) -> set[str]:
    """
    Parse the .gitingestignore file and return a set of patterns to ignore.

    Each line is stripped of surrounding whitespace. Blank lines are skipped, and so are
    comment lines, i.e. lines whose first non-whitespace character is ``#``.

    Parameters
    ----------
    ignore_file_path : Path
        Path to the .gitingestignore file

    Returns
    -------
    set[str]
        Set of patterns to ignore. Empty when the path does not point to a regular file.
    """
    # `is_file()` rather than `exists()`: a directory at this path would make `open()` raise.
    if not ignore_file_path.is_file():
        return set()

    with open(ignore_file_path, encoding="utf-8") as f:
        # Strip first, then test for the comment marker, so that indented comments
        # ("   # note") are not mistaken for patterns.
        stripped_lines = (line.strip() for line in f)
        return {entry for entry in stripped_lines if entry and not entry.startswith("#")}


async def parse_query(
source: str,
max_file_size: int,
Expand Down Expand Up @@ -81,6 +105,12 @@ async def parse_query(
A dataclass object containing the parsed details of the repository or file path.
"""

# Convert string patterns to set if necessary
if isinstance(ignore_patterns, str):
ignore_patterns = {ignore_patterns} if ignore_patterns else None
if isinstance(include_patterns, str):
include_patterns = {include_patterns} if include_patterns else None

# Determine the parsing method based on the source type
if from_web or urlparse(source).scheme in ("https", "http") or any(h in source for h in KNOWN_GIT_HOSTS):
# We either have a full URL or a domain-less slug
Expand All @@ -89,6 +119,14 @@ async def parse_query(
# Local path scenario
parsed_query = _parse_path(source)

# Look for .gitingestignore file in the local path
ignore_file_path = Path(parsed_query.local_path) / ".gitingestignore"
additional_ignore_patterns = parse_ignore_file(ignore_file_path)
if ignore_patterns:
ignore_patterns.update(additional_ignore_patterns)
else:
ignore_patterns = additional_ignore_patterns

# Combine default ignore patterns + custom patterns
ignore_patterns_set = DEFAULT_IGNORE_PATTERNS.copy()
if ignore_patterns:
Expand Down Expand Up @@ -283,17 +321,18 @@ def _normalize_pattern(pattern: str) -> str:
return pattern


def _parse_patterns(pattern: set[str] | str) -> set[str]:
def _parse_patterns(patterns: tuple[str, ...] | set[str] | str) -> set[str]:
"""
Parse and validate file/directory patterns for inclusion or exclusion.

Takes either a single pattern string or set of pattern strings and processes them into a normalized list.
Patterns are split on commas and spaces, validated for allowed characters, and normalized.
Takes either a single pattern string, a tuple of pattern strings, or a set of pattern strings
and processes them into a normalized list. Patterns are split on commas and spaces, validated
for allowed characters, and normalized.

Parameters
----------
pattern : set[str] | str
Pattern(s) to parse - either a single string or set of strings
patterns : tuple[str, ...] | set[str] | str
Pattern(s) to parse - either a single string, a tuple of strings, or a set of strings

Returns
-------
Expand All @@ -307,7 +346,11 @@ def _parse_patterns(pattern: set[str] | str) -> set[str]:
dash (-), underscore (_), dot (.), forward slash (/), plus (+), and
asterisk (*) are allowed.
"""
patterns = pattern if isinstance(pattern, set) else {pattern}
# Convert patterns to a set if it's not already a set
if isinstance(patterns, tuple):
patterns = set(patterns)
elif isinstance(patterns, str):
patterns = {patterns}

parsed_patterns: set[str] = set()
for p in patterns:
Expand Down
11 changes: 9 additions & 2 deletions src/gitingest/repository_clone.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
""" This module contains functions for cloning a Git repository to a local path. """
"""
Module for cloning repositories in the gitingest package.
"""

import asyncio
import os
import shutil
from dataclasses import dataclass
from pathlib import Path

Expand Down Expand Up @@ -89,6 +92,11 @@ async def clone_repo(config: CloneConfig) -> tuple[bytes, bytes]:
if not await _check_repo_exists(url):
raise ValueError("Repository not found, make sure it is public")

# Remove the directory if it exists and is not empty
local_path_obj = Path(local_path)
if local_path_obj.exists() and any(local_path_obj.iterdir()):
shutil.rmtree(local_path_obj)

if commit:
# Scenario 1: Clone and checkout a specific commit
# Clone the repository without depth to ensure full history for checkout
Expand All @@ -100,7 +108,6 @@ async def clone_repo(config: CloneConfig) -> tuple[bytes, bytes]:
return await _run_git_command(*checkout_cmd)

if branch and branch.lower() not in ("main", "master"):

# Scenario 2: Clone a specific branch with shallow depth
clone_cmd = ["git", "clone", "--depth=1", "--single-branch", "--branch", branch, url, local_path]
return await _run_git_command(*clone_cmd)
Expand Down
18 changes: 8 additions & 10 deletions src/gitingest/repository_ingest.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
""" Main entry point for ingesting a source and processing its contents. """
"""
Module for ingesting repositories in the gitingest package.
"""

import asyncio
import inspect
import shutil

Expand Down Expand Up @@ -64,23 +65,21 @@ async def ingest(
)

if parsed_query.url:
selected_branch = branch if branch else parsed_query.branch # prioritize branch argument
parsed_query.branch = selected_branch
# Override branch if specified
if branch is not None:
parsed_query.branch = branch

# Extract relevant fields for CloneConfig
clone_config = CloneConfig(
url=parsed_query.url,
local_path=str(parsed_query.local_path),
commit=parsed_query.commit,
branch=selected_branch,
branch=parsed_query.branch,
)
clone_result = clone_repo(clone_config)

if inspect.iscoroutine(clone_result):
if asyncio.get_event_loop().is_running():
await clone_result
else:
asyncio.run(clone_result)
await clone_result
else:
raise TypeError("clone_repo did not return a coroutine as expected.")

Expand All @@ -94,5 +93,4 @@ async def ingest(
finally:
# Clean up the temporary directory if it was created
if parsed_query.url:
# Clean up the temporary directory
shutil.rmtree(TMP_BASE_PATH, ignore_errors=True)
Loading