From 234e1acbe6b2e7dc4ba36caafd0053e38ba2fea6 Mon Sep 17 00:00:00 2001 From: Gowtham Kishore Date: Mon, 13 Jan 2025 22:38:05 -0800 Subject: [PATCH 1/4] Fixing the branch name with nested / --- src/gitingest/query_parser.py | 43 ++++++++++++--- src/gitingest/repository_clone.py | 30 ++++++++++- tests/query_parser/test_query_parser.py | 72 ++++++++++++++++++++----- 3 files changed, 123 insertions(+), 22 deletions(-) diff --git a/src/gitingest/query_parser.py b/src/gitingest/query_parser.py index 78dd6cff..ce888f1d 100644 --- a/src/gitingest/query_parser.py +++ b/src/gitingest/query_parser.py @@ -11,7 +11,7 @@ from config import TMP_BASE_PATH from gitingest.exceptions import InvalidPatternError from gitingest.ignore_patterns import DEFAULT_IGNORE_PATTERNS -from gitingest.repository_clone import _check_repo_exists +from gitingest.repository_clone import _check_repo_exists, fetch_remote_branch_list HEX_DIGITS: set[str] = set(string.hexdigits) @@ -168,18 +168,47 @@ async def _parse_repo_source(source: str) -> dict[str, Any]: parsed["type"] = possible_type # Commit or branch - commit_or_branch = remaining_parts.pop(0) + commit_or_branch = remaining_parts[0] if _is_valid_git_commit_hash(commit_or_branch): parsed["commit"] = commit_or_branch - else: - parsed["branch"] = commit_or_branch + parsed["subpath"] += "/".join(remaining_parts[1:]) - # Subpath if anything left - if remaining_parts: + else: + parsed["branch"] = await _configure_branch_and_subpath(remaining_parts, url) parsed["subpath"] += "/".join(remaining_parts) - return parsed +async def _configure_branch_and_subpath(remaining_parts: list[str],url: str) -> str | None: + """ + Find the branch name from the remaining parts of the URL path. + Parameters + ---------- + remaining_parts : list[str] + List of path parts extracted from the URL. + url : str + The repository URL to determine branches. + + Returns + ------- + str(branch name) or None + + """ + try: + # Fetch the list of branches from the remote repository + branches: list[str] = await fetch_remote_branch_list(url) + except Exception as e: + print(f"Warning: Failed to fetch branch list: {str(e)}") + return remaining_parts.pop(0) + + branch = [] + + while remaining_parts: + branch.append(remaining_parts.pop(0)) + branch_name = "/".join(branch) + if branch_name in branches: + return branch_name + + return None def _is_valid_git_commit_hash(commit: str) -> bool: """ diff --git a/src/gitingest/repository_clone.py b/src/gitingest/repository_clone.py index d251a6f1..490249d6 100644 --- a/src/gitingest/repository_clone.py +++ b/src/gitingest/repository_clone.py @@ -5,7 +5,7 @@ from gitingest.utils import async_timeout -CLONE_TIMEOUT: int = 20 +TIMEOUT: int = 20 @dataclass @@ -34,7 +34,7 @@ class CloneConfig: branch: str | None = None -@async_timeout(CLONE_TIMEOUT) +@async_timeout(TIMEOUT) async def clone_repo(config: CloneConfig) -> tuple[bytes, bytes]: """ Clone a repository to a local path based on the provided configuration. @@ -141,6 +141,32 @@ async def _check_repo_exists(url: str) -> bool: raise RuntimeError(f"Unexpected status code: {status_code}") +@async_timeout(TIMEOUT) +async def fetch_remote_branch_list(url: str) -> list[str]: + """ + Get the list of branches from the remote repo. + + Parameters + ---------- + url : str + The URL of the repository. + + Returns + ------- + list[str] + list of the branches in the remote repository + """ + fetch_branches_command = ["git", "ls-remote", "--heads", url] + stdout, stderr = await _run_git_command(*fetch_branches_command) + stdout_decoded = stdout.decode() + + return [ + line.split('refs/heads/', 1)[1] + for line in stdout_decoded.splitlines() + if line.strip() and 'refs/heads/' in line + ] + + async def _run_git_command(*args: str) -> tuple[bytes, bytes]: """ Execute a Git command asynchronously and captures its output. diff --git a/tests/query_parser/test_query_parser.py b/tests/query_parser/test_query_parser.py index 0db65d3b..fee44920 100644 --- a/tests/query_parser/test_query_parser.py +++ b/tests/query_parser/test_query_parser.py @@ -3,6 +3,8 @@ from pathlib import Path import pytest +from unittest.mock import patch, AsyncMock +from gitingest.repository_clone import _check_repo_exists, fetch_remote_branch_list from gitingest.ignore_patterns import DEFAULT_IGNORE_PATTERNS from gitingest.query_parser import _parse_patterns, _parse_repo_source, parse_query @@ -96,18 +98,21 @@ async def test_parse_query_invalid_pattern() -> None: with pytest.raises(ValueError, match="Pattern.*contains invalid characters"): await parse_query(url, max_file_size=50, from_web=True, include_patterns="*.py;rm -rf") - async def test_parse_url_with_subpaths() -> None: """ Test `_parse_repo_source` with a URL containing a branch and subpath. Verifies that user name, repository name, branch, and subpath are correctly extracted. """ url = "https://github.com/user/repo/tree/main/subdir/file" - result = await _parse_repo_source(url) - assert result["user_name"] == "user" - assert result["repo_name"] == "repo" - assert result["branch"] == "main" - assert result["subpath"] == "/subdir/file" + with patch('gitingest.repository_clone._run_git_command', new_callable=AsyncMock) as mock_run_git_command: + mock_run_git_command.return_value = (b"refs/heads/main\nrefs/heads/dev\nrefs/heads/feature-branch\n", b"") + with patch('gitingest.repository_clone.fetch_remote_branch_list', new_callable=AsyncMock) as mock_fetch_branches: + mock_fetch_branches.return_value = ["main", "dev", "feature-branch"] + result = await _parse_repo_source(url) + assert result["user_name"] == "user" + assert result["repo_name"] == "repo" + assert result["branch"] == "main" + assert result["subpath"] == "/subdir/file" async def test_parse_url_invalid_repo_structure() -> None: @@ -222,15 +227,18 @@ async def test_parse_url_branch_and_commit_distinction() -> None: url_branch = "https://github.com/user/repo/tree/main" url_commit = "https://github.com/user/repo/tree/abcd1234abcd1234abcd1234abcd1234abcd1234" - result_branch = await _parse_repo_source(url_branch) - result_commit = await _parse_repo_source(url_commit) - - assert result_branch["branch"] == "main" - assert result_branch["commit"] is None + with patch('gitingest.repository_clone._run_git_command', new_callable=AsyncMock) as mock_run_git_command: + mock_run_git_command.return_value = (b"refs/heads/main\nrefs/heads/dev\nrefs/heads/feature-branch\n", b"") + with patch('gitingest.repository_clone.fetch_remote_branch_list', new_callable=AsyncMock) as mock_fetch_branches: + mock_fetch_branches.return_value = ["main", "dev", "feature-branch"] - assert result_commit["branch"] is None - assert result_commit["commit"] == "abcd1234abcd1234abcd1234abcd1234abcd1234" + result_branch = await _parse_repo_source(url_branch) + result_commit = await _parse_repo_source(url_commit) + assert result_branch["branch"] == "main" + assert result_branch["commit"] is None + assert result_commit["branch"] is None + assert result_commit["commit"] == "abcd1234abcd1234abcd1234abcd1234abcd1234" async def test_parse_query_uuid_uniqueness() -> None: """ @@ -274,3 +282,41 @@ async def test_parse_query_with_branch() -> None: assert result["branch"] == "2.2.x" assert result["commit"] is None assert result["type"] == "blob" + +@pytest.mark.asyncio +@pytest.mark.parametrize("url, expected_branch, expected_subpath", [ + ("https://github.com/user/repo/tree/main/src", "main", "/src"), + ("https://github.com/user/repo/tree/fix1", "fix1", "/"), + ("https://github.com/user/repo/tree/nonexistent-branch/src", "nonexistent-branch", "/src"), +]) +async def test_parse_repo_source_with_failed_git_command(url, expected_branch, expected_subpath): + """ + Test `_parse_repo_source` when git command fails. + Verifies that the function returns the first path component as the branch. + """ + with patch('gitingest.repository_clone.fetch_remote_branch_list', new_callable=AsyncMock) as mock_fetch_branches: + mock_fetch_branches.side_effect = Exception("Failed to fetch branch list") + + result = await _parse_repo_source(url) + + assert result["branch"] == expected_branch + assert result["subpath"] == expected_subpath + +@pytest.mark.asyncio +@pytest.mark.parametrize("url, expected_branch, expected_subpath", [ + ("https://github.com/user/repo/tree/feature/fix1/src", "feature/fix1", "/src"), + ("https://github.com/user/repo/tree/main/src", "main", "/src"), + ("https://github.com/user/repo", None, "/"), # No + ("https://github.com/user/repo/tree/nonexistent-branch/src", None, "/"), # Non-existent branch + ("https://github.com/user/repo/tree/fix", "fix", "/"), +]) +async def test_parse_repo_source_with_various_url_patterns(url, expected_branch, expected_subpath): + with patch('gitingest.repository_clone._run_git_command', new_callable=AsyncMock) as mock_run_git_command, \ + patch('gitingest.repository_clone.fetch_remote_branch_list', new_callable=AsyncMock) as mock_fetch_branches: + + mock_run_git_command.return_value = (b"refs/heads/feature/fix1\nrefs/heads/main\nrefs/heads/feature-branch\nrefs/heads/fix\n", b"") + mock_fetch_branches.return_value = ["feature/fix1", "main", "feature-branch"] + + result = await _parse_repo_source(url) + assert result["branch"] == expected_branch + assert result["subpath"] == expected_subpath From 61da0311b2bb10472bf8f4a62eaeb9f5270e7a70 Mon Sep 17 00:00:00 2001 From: Gowtham Kishore Date: Mon, 13 Jan 2025 23:20:01 -0800 Subject: [PATCH 2/4] Adding fallback logic if git command fails' --- src/gitingest/query_parser.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gitingest/query_parser.py b/src/gitingest/query_parser.py index ce888f1d..bff25125 100644 --- a/src/gitingest/query_parser.py +++ b/src/gitingest/query_parser.py @@ -190,7 +190,7 @@ async def _configure_branch_and_subpath(remaining_parts: list[str],url: str) -> Returns ------- - str(branch name) or None + str (branch name) or None """ try: @@ -198,7 +198,7 @@ async def _configure_branch_and_subpath(remaining_parts: list[str],url: str) -> branches: list[str] = await fetch_remote_branch_list(url) except Exception as e: print(f"Warning: Failed to fetch branch list: {str(e)}") - return remaining_parts.pop(0) + return remaining_parts.pop(0) if remaining_parts else None branch = [] From 877fd193b2ec1e53b1020fe88e03d015870176f2 Mon Sep 17 00:00:00 2001 From: Gowtham Kishore Date: Tue, 14 Jan 2025 00:14:47 -0800 Subject: [PATCH 3/4] Fixing lint suggestions --- src/gitingest/query_parser.py | 21 ++++---- src/gitingest/repository_clone.py | 18 +++---- tests/query_parser/test_query_parser.py | 69 ++++++++++++++++--------- 3 files changed, 63 insertions(+), 45 deletions(-) diff --git a/src/gitingest/query_parser.py b/src/gitingest/query_parser.py index bff25125..17db62e9 100644 --- a/src/gitingest/query_parser.py +++ b/src/gitingest/query_parser.py @@ -4,6 +4,7 @@ import re import string import uuid +import warnings from pathlib import Path from typing import Any from urllib.parse import unquote, urlparse @@ -172,36 +173,35 @@ async def _parse_repo_source(source: str) -> dict[str, Any]: if _is_valid_git_commit_hash(commit_or_branch): parsed["commit"] = commit_or_branch parsed["subpath"] += "/".join(remaining_parts[1:]) - else: parsed["branch"] = await _configure_branch_and_subpath(remaining_parts, url) parsed["subpath"] += "/".join(remaining_parts) return parsed -async def _configure_branch_and_subpath(remaining_parts: list[str],url: str) -> str | None: + +async def _configure_branch_and_subpath(remaining_parts: list[str], url: str) -> str | None: """ - Find the branch name from the remaining parts of the URL path. + Configure the branch and subpath based on the remaining parts of the URL. Parameters ---------- remaining_parts : list[str] - List of path parts extracted from the URL. + The remaining parts of the URL path. url : str - The repository URL to determine branches. - + The URL of the repository. Returns ------- - str (branch name) or None + str | None + The branch name if found, otherwise None. """ try: # Fetch the list of branches from the remote repository branches: list[str] = await fetch_remote_branch_list(url) except Exception as e: - print(f"Warning: Failed to fetch branch list: {str(e)}") - return remaining_parts.pop(0) if remaining_parts else None + warnings.warn(f"Warning: Failed to fetch branch list: {str(e)}") + return remaining_parts.pop(0) branch = [] - while remaining_parts: branch.append(remaining_parts.pop(0)) branch_name = "/".join(branch) @@ -210,6 +210,7 @@ async def _configure_branch_and_subpath(remaining_parts: list[str],url: str) -> return None + def _is_valid_git_commit_hash(commit: str) -> bool: """ Validate if the provided string is a valid Git commit hash. diff --git a/src/gitingest/repository_clone.py b/src/gitingest/repository_clone.py index 490249d6..4adfcd9f 100644 --- a/src/gitingest/repository_clone.py +++ b/src/gitingest/repository_clone.py @@ -144,27 +144,25 @@ async def _check_repo_exists(url: str) -> bool: @async_timeout(TIMEOUT) async def fetch_remote_branch_list(url: str) -> list[str]: """ - Get the list of branches from the remote repo. - + Fetch the list of branches from a remote Git repository. Parameters ---------- url : str - The URL of the repository. - + The URL of the Git repository to fetch branches from. Returns ------- list[str] - list of the branches in the remote repository + A list of branch names available in the remote repository. """ fetch_branches_command = ["git", "ls-remote", "--heads", url] - stdout, stderr = await _run_git_command(*fetch_branches_command) + stdout, _ = await _run_git_command(*fetch_branches_command) stdout_decoded = stdout.decode() return [ - line.split('refs/heads/', 1)[1] - for line in stdout_decoded.splitlines() - if line.strip() and 'refs/heads/' in line - ] + line.split("refs/heads/", 1)[1] + for line in stdout_decoded.splitlines() + if line.strip() and "refs/heads/" in line + ] async def _run_git_command(*args: str) -> tuple[bytes, bytes]: diff --git a/tests/query_parser/test_query_parser.py b/tests/query_parser/test_query_parser.py index fee44920..ee4f8bbc 100644 --- a/tests/query_parser/test_query_parser.py +++ b/tests/query_parser/test_query_parser.py @@ -1,10 +1,9 @@ """ Tests for the query_parser module. """ from pathlib import Path +from unittest.mock import AsyncMock, patch import pytest -from unittest.mock import patch, AsyncMock -from gitingest.repository_clone import _check_repo_exists, fetch_remote_branch_list from gitingest.ignore_patterns import DEFAULT_IGNORE_PATTERNS from gitingest.query_parser import _parse_patterns, _parse_repo_source, parse_query @@ -98,15 +97,18 @@ async def test_parse_query_invalid_pattern() -> None: with pytest.raises(ValueError, match="Pattern.*contains invalid characters"): await parse_query(url, max_file_size=50, from_web=True, include_patterns="*.py;rm -rf") + async def test_parse_url_with_subpaths() -> None: """ Test `_parse_repo_source` with a URL containing a branch and subpath. Verifies that user name, repository name, branch, and subpath are correctly extracted. """ url = "https://github.com/user/repo/tree/main/subdir/file" - with patch('gitingest.repository_clone._run_git_command', new_callable=AsyncMock) as mock_run_git_command: + with patch("gitingest.repository_clone._run_git_command", new_callable=AsyncMock) as mock_run_git_command: mock_run_git_command.return_value = (b"refs/heads/main\nrefs/heads/dev\nrefs/heads/feature-branch\n", b"") - with patch('gitingest.repository_clone.fetch_remote_branch_list', new_callable=AsyncMock) as mock_fetch_branches: + with patch( + "gitingest.repository_clone.fetch_remote_branch_list", new_callable=AsyncMock + ) as mock_fetch_branches: mock_fetch_branches.return_value = ["main", "dev", "feature-branch"] result = await _parse_repo_source(url) assert result["user_name"] == "user" @@ -227,9 +229,11 @@ async def test_parse_url_branch_and_commit_distinction() -> None: url_branch = "https://github.com/user/repo/tree/main" url_commit = "https://github.com/user/repo/tree/abcd1234abcd1234abcd1234abcd1234abcd1234" - with patch('gitingest.repository_clone._run_git_command', new_callable=AsyncMock) as mock_run_git_command: + with patch("gitingest.repository_clone._run_git_command", new_callable=AsyncMock) as mock_run_git_command: mock_run_git_command.return_value = (b"refs/heads/main\nrefs/heads/dev\nrefs/heads/feature-branch\n", b"") - with patch('gitingest.repository_clone.fetch_remote_branch_list', new_callable=AsyncMock) as mock_fetch_branches: + with patch( + "gitingest.repository_clone.fetch_remote_branch_list", new_callable=AsyncMock + ) as mock_fetch_branches: mock_fetch_branches.return_value = ["main", "dev", "feature-branch"] result_branch = await _parse_repo_source(url_branch) @@ -240,6 +244,7 @@ async def test_parse_url_branch_and_commit_distinction() -> None: assert result_commit["branch"] is None assert result_commit["commit"] == "abcd1234abcd1234abcd1234abcd1234abcd1234" + async def test_parse_query_uuid_uniqueness() -> None: """ Test `parse_query` to ensure that each call generates a unique UUID for the query result. @@ -283,38 +288,52 @@ async def test_parse_query_with_branch() -> None: assert result["commit"] is None assert result["type"] == "blob" + @pytest.mark.asyncio -@pytest.mark.parametrize("url, expected_branch, expected_subpath", [ - ("https://github.com/user/repo/tree/main/src", "main", "/src"), - ("https://github.com/user/repo/tree/fix1", "fix1", "/"), - ("https://github.com/user/repo/tree/nonexistent-branch/src", "nonexistent-branch", "/src"), -]) +@pytest.mark.parametrize( + "url, expected_branch, expected_subpath", + [ + ("https://github.com/user/repo/tree/main/src", "main", "/src"), + ("https://github.com/user/repo/tree/fix1", "fix1", "/"), + ("https://github.com/user/repo/tree/nonexistent-branch/src", "nonexistent-branch", "/src"), + ], +) async def test_parse_repo_source_with_failed_git_command(url, expected_branch, expected_subpath): """ Test `_parse_repo_source` when git command fails. Verifies that the function returns the first path component as the branch. """ - with patch('gitingest.repository_clone.fetch_remote_branch_list', new_callable=AsyncMock) as mock_fetch_branches: + with patch("gitingest.repository_clone.fetch_remote_branch_list", new_callable=AsyncMock) as mock_fetch_branches: mock_fetch_branches.side_effect = Exception("Failed to fetch branch list") - + result = await _parse_repo_source(url) - + assert result["branch"] == expected_branch assert result["subpath"] == expected_subpath + @pytest.mark.asyncio -@pytest.mark.parametrize("url, expected_branch, expected_subpath", [ - ("https://github.com/user/repo/tree/feature/fix1/src", "feature/fix1", "/src"), - ("https://github.com/user/repo/tree/main/src", "main", "/src"), - ("https://github.com/user/repo", None, "/"), # No - ("https://github.com/user/repo/tree/nonexistent-branch/src", None, "/"), # Non-existent branch - ("https://github.com/user/repo/tree/fix", "fix", "/"), -]) +@pytest.mark.parametrize( + "url, expected_branch, expected_subpath", + [ + ("https://github.com/user/repo/tree/feature/fix1/src", "feature/fix1", "/src"), + ("https://github.com/user/repo/tree/main/src", "main", "/src"), + ("https://github.com/user/repo", None, "/"), # No + ("https://github.com/user/repo/tree/nonexistent-branch/src", None, "/"), # Non-existent branch + ("https://github.com/user/repo/tree/fix", "fix", "/"), + ("https://github.com/user/repo/blob/fix/page.html", "fix", "/page.html"), + ], +) async def test_parse_repo_source_with_various_url_patterns(url, expected_branch, expected_subpath): - with patch('gitingest.repository_clone._run_git_command', new_callable=AsyncMock) as mock_run_git_command, \ - patch('gitingest.repository_clone.fetch_remote_branch_list', new_callable=AsyncMock) as mock_fetch_branches: - - mock_run_git_command.return_value = (b"refs/heads/feature/fix1\nrefs/heads/main\nrefs/heads/feature-branch\nrefs/heads/fix\n", b"") + with ( + patch("gitingest.repository_clone._run_git_command", new_callable=AsyncMock) as mock_run_git_command, + patch("gitingest.repository_clone.fetch_remote_branch_list", new_callable=AsyncMock) as mock_fetch_branches, + ): + + mock_run_git_command.return_value = ( + b"refs/heads/feature/fix1\nrefs/heads/main\nrefs/heads/feature-branch\nrefs/heads/fix\n", + b"", + ) mock_fetch_branches.return_value = ["feature/fix1", "main", "feature-branch"] result = await _parse_repo_source(url) From e94cc450f6b7b5ae0534437ae75fece6bab050e5 Mon Sep 17 00:00:00 2001 From: Gowtham Kishore Date: Tue, 14 Jan 2025 00:36:56 -0800 Subject: [PATCH 4/4] Resolving it as runtime error --- src/gitingest/query_parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gitingest/query_parser.py b/src/gitingest/query_parser.py index 17db62e9..58a9869e 100644 --- a/src/gitingest/query_parser.py +++ b/src/gitingest/query_parser.py @@ -197,7 +197,7 @@ async def _configure_branch_and_subpath(remaining_parts: list[str], url: str) -> try: # Fetch the list of branches from the remote repository branches: list[str] = await fetch_remote_branch_list(url) - except Exception as e: + except RuntimeError as e: warnings.warn(f"Warning: Failed to fetch branch list: {str(e)}") return remaining_parts.pop(0)