diff --git a/README.md b/README.md index 84b80e31..aab92204 100644 --- a/README.md +++ b/README.md @@ -122,6 +122,9 @@ gitingest https://github.com/username/private-repo --token github_pat_... # Or set it as an environment variable export GITHUB_TOKEN=github_pat_... gitingest https://github.com/username/private-repo + +# Include repository submodules +gitingest https://github.com/username/repo-with-submodules --include-submodules ``` By default, files listed in `.gitignore` are skipped. Use `--include-gitignored` if you @@ -163,6 +166,9 @@ summary, tree, content = ingest("https://github.com/username/private-repo", toke import os os.environ["GITHUB_TOKEN"] = "github_pat_..." summary, tree, content = ingest("https://github.com/username/private-repo") + +# Include repository submodules +summary, tree, content = ingest("https://github.com/username/repo-with-submodules", include_submodules=True) ``` By default, this won't write a file but can be enabled with the `output` argument. diff --git a/src/gitingest/cli.py b/src/gitingest/cli.py index 64ef463c..e14ed681 100644 --- a/src/gitingest/cli.py +++ b/src/gitingest/cli.py @@ -20,6 +20,7 @@ class _CLIArgs(TypedDict): include_pattern: tuple[str, ...] 
branch: str | None include_gitignored: bool + include_submodules: bool token: str | None output: str | None @@ -47,6 +48,12 @@ class _CLIArgs(TypedDict): default=False, help="Include files matched by .gitignore and .gitingestignore", ) +@click.option( + "--include-submodules", + is_flag=True, + help="Include repository's submodules in the analysis", + default=False, +) @click.option( "--token", "-t", @@ -94,6 +101,9 @@ def main(**cli_kwargs: Unpack[_CLIArgs]) -> None: $ gitingest https://github.com/user/private-repo -t ghp_token $ GITHUB_TOKEN=ghp_token gitingest https://github.com/user/private-repo + Include submodules: + $ gitingest https://github.com/user/repo --include-submodules + """ asyncio.run(_async_main(**cli_kwargs)) @@ -106,6 +116,7 @@ async def _async_main( include_pattern: tuple[str, ...] | None = None, branch: str | None = None, include_gitignored: bool = False, + include_submodules: bool = False, token: str | None = None, output: str | None = None, ) -> None: @@ -129,6 +140,8 @@ async def _async_main( Git branch to ingest. If ``None``, the repository's default branch is used. include_gitignored : bool If ``True``, also ingest files matched by ``.gitignore`` or ``.gitingestignore`` (default: ``False``). + include_submodules : bool + If ``True``, recursively include all Git submodules within the repository (default: ``False``). token : str | None GitHub personal access token (PAT) for accessing private repositories. Can also be set via the ``GITHUB_TOKEN`` environment variable. 
@@ -155,14 +168,15 @@ async def _async_main( click.echo(f"Analyzing source, output will be written to '{output_target}'...", err=True) summary, _, _ = await ingest_async( - source=source, + source, max_file_size=max_size, include_patterns=include_patterns, exclude_patterns=exclude_patterns, branch=branch, - output=output_target, include_gitignored=include_gitignored, + include_submodules=include_submodules, token=token, + output=output_target, ) except Exception as exc: # Convert any exception into Click.Abort so that exit status is non-zero diff --git a/src/gitingest/clone.py b/src/gitingest/clone.py index a65c3046..1f091486 100644 --- a/src/gitingest/clone.py +++ b/src/gitingest/clone.py @@ -63,7 +63,9 @@ async def clone_repo(config: CloneConfig, *, token: str | None = None) -> None: clone_cmd += ["-c", create_git_auth_header(token, url=url)] clone_cmd += ["clone", "--single-branch"] - # TODO: Re-enable --recurse-submodules when submodule support is needed + + if config.include_submodules: + clone_cmd += ["--recurse-submodules"] if partial_clone: clone_cmd += ["--filter=blob:none", "--sparse"] @@ -86,15 +88,28 @@ async def clone_repo(config: CloneConfig, *, token: str | None = None) -> None: # Checkout the subpath if it is a partial clone if partial_clone: - subpath = config.subpath.lstrip("/") - if config.blob: - # When ingesting from a file url (blob/branch/path/file.txt), we need to remove the file name. - subpath = str(Path(subpath).parent.as_posix()) - - checkout_cmd = create_git_command(["git"], local_path, url, token) - await run_command(*checkout_cmd, "sparse-checkout", "set", subpath) + await _checkout_partial_clone(config, token) # Checkout the commit if it is provided if commit: checkout_cmd = create_git_command(["git"], local_path, url, token) await run_command(*checkout_cmd, "checkout", commit) + + +async def _checkout_partial_clone(config: CloneConfig, token: str | None) -> None: + """Configure sparse-checkout for a partially cloned repository. 
+ + Parameters + ---------- + config : CloneConfig + The configuration for cloning the repository, including subpath and blob flag. + token : str | None + GitHub personal access token (PAT) for accessing private repositories. + + """ + subpath = config.subpath.lstrip("/") + if config.blob: + # Remove the file name from the subpath when ingesting from a file url (e.g. blob/branch/path/file.txt) + subpath = str(Path(subpath).parent.as_posix()) + checkout_cmd = create_git_command(["git"], config.local_path, config.url, token) + await run_command(*checkout_cmd, "sparse-checkout", "set", subpath) diff --git a/src/gitingest/entrypoint.py b/src/gitingest/entrypoint.py index 9c04d65b..f64dec08 100644 --- a/src/gitingest/entrypoint.py +++ b/src/gitingest/entrypoint.py @@ -27,6 +27,7 @@ async def ingest_async( branch: str | None = None, tag: str | None = None, include_gitignored: bool = False, + include_submodules: bool = False, token: str | None = None, output: str | None = None, ) -> tuple[str, str, str]: @@ -52,6 +53,8 @@ async def ingest_async( The tag to clone and ingest. If ``None``, no tag is used. include_gitignored : bool If ``True``, include files ignored by ``.gitignore`` and ``.gitingestignore`` (default: ``False``). + include_submodules : bool + If ``True``, recursively include all Git submodules within the repository (default: ``False``). token : str | None GitHub personal access token (PAT) for accessing private repositories. Can also be set via the ``GITHUB_TOKEN`` environment variable. 
@@ -86,6 +89,8 @@ async def ingest_async( if query.url: _override_branch_and_tag(query, branch=branch, tag=tag) + query.include_submodules = include_submodules + async with _clone_repo_if_remote(query, token=token): summary, tree, content = ingest_query(query) await _write_output(tree, content=content, target=output) @@ -101,6 +106,7 @@ def ingest( branch: str | None = None, tag: str | None = None, include_gitignored: bool = False, + include_submodules: bool = False, token: str | None = None, output: str | None = None, ) -> tuple[str, str, str]: @@ -126,6 +132,8 @@ def ingest( The tag to clone and ingest. If ``None``, no tag is used. include_gitignored : bool If ``True``, include files ignored by ``.gitignore`` and ``.gitingestignore`` (default: ``False``). + include_submodules : bool + If ``True``, recursively include all Git submodules within the repository (default: ``False``). token : str | None GitHub personal access token (PAT) for accessing private repositories. Can also be set via the ``GITHUB_TOKEN`` environment variable. @@ -156,6 +164,7 @@ def ingest( branch=branch, tag=tag, include_gitignored=include_gitignored, + include_submodules=include_submodules, token=token, output=output, ), diff --git a/src/gitingest/schemas/ingestion.py b/src/gitingest/schemas/ingestion.py index 3e1c5e81..c40e11d6 100644 --- a/src/gitingest/schemas/ingestion.py +++ b/src/gitingest/schemas/ingestion.py @@ -11,7 +11,7 @@ @dataclass -class CloneConfig: +class CloneConfig: # pylint: disable=too-many-instance-attributes """Configuration for cloning a Git repository. This class holds the necessary parameters for cloning a repository to a local path, including @@ -33,6 +33,8 @@ class CloneConfig: The subpath to clone from the repository (default: ``"/"``). blob: bool Whether the repository is a blob (default: ``False``). + include_submodules: bool + Whether to clone submodules (default: ``False``).
""" @@ -43,6 +45,7 @@ class CloneConfig: tag: str | None = None subpath: str = "/" blob: bool = False + include_submodules: bool = False class IngestionQuery(BaseModel): # pylint: disable=too-many-instance-attributes @@ -78,6 +81,8 @@ class IngestionQuery(BaseModel): # pylint: disable=too-many-instance-attributes The patterns to ignore (default: ``set()``). include_patterns : set[str] | None The patterns to include. + include_submodules : bool + Whether to include all Git submodules within the repository (default: ``False``). """ @@ -95,6 +100,7 @@ class IngestionQuery(BaseModel): # pylint: disable=too-many-instance-attributes max_file_size: int = Field(default=MAX_FILE_SIZE) ignore_patterns: set[str] = set() # TODO: ignore_patterns and include_patterns have the same type include_patterns: set[str] | None = None + include_submodules: bool = False def extract_clone_config(self) -> CloneConfig: """Extract the relevant fields for the CloneConfig object. @@ -122,6 +128,7 @@ def extract_clone_config(self) -> CloneConfig: tag=self.tag, subpath=self.subpath, blob=self.type == "blob", + include_submodules=self.include_submodules, ) def ensure_url(self) -> None: diff --git a/tests/query_parser/test_git_host_agnostic.py b/tests/query_parser/test_git_host_agnostic.py index 710330d7..d3d2542a 100644 --- a/tests/query_parser/test_git_host_agnostic.py +++ b/tests/query_parser/test_git_host_agnostic.py @@ -68,6 +68,7 @@ async def test_parse_query_without_host( "commit": None, "max_file_size": 50, "include_patterns": None, + "include_submodules": False, } assert actual == expected diff --git a/tests/test_cli.py b/tests/test_cli.py index f001b84a..f9bbde85 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -27,6 +27,7 @@ "tests/", "--include-pattern", "src/", + "--include-submodules", ], True, id="custom-options", diff --git a/tests/test_clone.py b/tests/test_clone.py index fed456b9..9ffaa376 100644 --- a/tests/test_clone.py +++ b/tests/test_clone.py @@ -181,10 +181,10 @@
async def test_clone_default_shallow_clone(run_command_mock: AsyncMock) -> None: @pytest.mark.asyncio -async def test_clone_commit_without_branch(run_command_mock: AsyncMock) -> None: - """Test cloning when a commit hash is provided but no branch is specified. +async def test_clone_commit(run_command_mock: AsyncMock) -> None: + """Test cloning when a commit hash is provided. - Given a valid URL and a commit hash (but no branch): + Given a valid URL and a commit hash: When ``clone_repo`` is called, Then the repository should be cloned and checked out at that commit. """ @@ -414,3 +414,28 @@ async def test_clone_with_commit_and_subpath(run_command_mock: AsyncMock) -> Non ) assert run_command_mock.call_count == expected_call_count + + +@pytest.mark.asyncio +async def test_clone_with_include_submodules(run_command_mock: AsyncMock) -> None: + """Test cloning a repository with submodules included. + + Given a valid URL and ``include_submodules=True``: + When ``clone_repo`` is called, + Then the repository should be cloned with ``--recurse-submodules`` in the git command. + """ + expected_call_count = 1 # No commit and no partial clone + clone_config = CloneConfig(url=DEMO_URL, local_path=LOCAL_REPO_PATH, branch="main", include_submodules=True) + + await clone_repo(clone_config) + + assert run_command_mock.call_count == expected_call_count + run_command_mock.assert_called_once_with( + "git", + "clone", + "--single-branch", + "--recurse-submodules", + "--depth=1", + clone_config.url, + clone_config.local_path, + )