Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(ingest): use branch info when cloning git repos #6937

Merged
merged 1 commit into from
Jan 5, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 13 additions & 1 deletion metadata-ingestion/src/datahub/configuration/github.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ class GitHubReference(ConfigModel):
)
branch: str = Field(
"main",
description="Branch on which your files live by default. Typically main or master.",
description="Branch on which your files live by default. Typically main or master. This can also be a commit hash.",
)
base_url: str = Field(
"https://github.com",
Expand Down Expand Up @@ -73,3 +73,15 @@ def auto_infer_from_repo(cls, v: Optional[str], values: Dict[str, Any]) -> str:
if v is None:
return f"git@github.com:{values.get('repo')}"
return v

@property
def branch_for_clone(self) -> Optional[str]:
# If branch was manually set, we should use it. Otherwise return None.
# We do this because we want to use the default branch unless they override it.
# While our default for branch is "main", they could be using "master" or something else.
# It's ok if the URLs we generate are slightly incorrect, but changing branch to be
# required would be a breaking change.

if "branch" in self.__fields_set__:
return self.branch
return None
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,9 @@ def __init__(self, tmp_dir: str, skip_known_host_verification: bool = True):
self.skip_known_host_verification = skip_known_host_verification
self.last_repo_cloned: Optional[git.Repo] = None

def clone(self, ssh_key: Optional[SecretStr], repo_url: str) -> Path:
def clone(
self, ssh_key: Optional[SecretStr], repo_url: str, branch: Optional[str] = None
) -> Path:
unique_dir = str(uuid4())
keys_dir = f"{self.tmp_dir}/{unique_dir}/keys"
checkout_dir = f"{self.tmp_dir}/{unique_dir}/checkout"
Expand Down Expand Up @@ -58,6 +60,11 @@ def clone(self, ssh_key: Optional[SecretStr], repo_url: str) -> Path:
env=dict(GIT_SSH_COMMAND=git_ssh_cmd),
)
logger.info("✅ Cloning complete!")

if branch is not None:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you

logger.info(f"Checking out branch {branch}")
self.last_repo_cloned.git.checkout(branch)

return pathlib.Path(checkout_dir)

def get_last_repo_cloned(self) -> Optional[git.Repo]:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1434,6 +1434,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
checkout_dir = git_clone.clone(
ssh_key=self.source_config.github_info.deploy_key,
repo_url=self.source_config.github_info.repo_ssh_locator,
branch=self.source_config.github_info.branch_for_clone,
)
self.reporter.git_clone_latency = datetime.now() - start_time
self.source_config.base_folder = checkout_dir.resolve()
Expand Down Expand Up @@ -1463,6 +1464,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
)
),
repo_url=p_ref.repo_ssh_locator,
branch=p_ref.branch_for_clone,
)

p_ref = p_checkout_dir.resolve()
Expand Down
65 changes: 42 additions & 23 deletions metadata-ingestion/tests/integration/git/test_git_clone.py
Original file line number Diff line number Diff line change
@@ -1,31 +1,50 @@
import os

import pytest
from pydantic import SecretStr

from datahub.configuration.github import GitHubInfo
from datahub.ingestion.source.git.git_import import GitClone

LOOKML_TEST_SSH_KEY = os.environ.get("DATAHUB_LOOKML_GIT_TEST_SSH_KEY")

def test_git_clone(pytestconfig, tmp_path):

@pytest.mark.skipif(
LOOKML_TEST_SSH_KEY is None,
reason="DATAHUB_LOOKML_GIT_TEST_SSH_KEY env variable is not configured",
)
def test_git_clone(tmp_path):
git_clone = GitClone(str(tmp_path))
secret_env_variable = "DATAHUB_LOOKML_GIT_TEST_SSH_KEY"
if os.environ.get(secret_env_variable) is not None:
secret_key = SecretStr(os.environ.get(secret_env_variable)) # type: ignore
checkout_dir = git_clone.clone(
ssh_key=secret_key,
repo_url="git@github.com:acryldata/long-tail-companions-looker",
)
assert os.path.exists(checkout_dir)
assert set(os.listdir(checkout_dir)) == set(
[
".datahub",
"models",
"README.md",
".github",
".git",
"views",
]
)
else:
print(
"Skipping test as env variable DATAHUB_LOOKML_GIT_TEST_SSH_KEY is not configured"
)
secret_key = SecretStr(LOOKML_TEST_SSH_KEY) if LOOKML_TEST_SSH_KEY else None

checkout_dir = git_clone.clone(
ssh_key=secret_key,
repo_url="git@github.com:acryldata/long-tail-companions-looker",
branch="d380a2b777ec6f4653626f39c68dba85893faa74",
)
assert os.path.exists(checkout_dir)
assert set(os.listdir(checkout_dir)) == set(
[
".datahub",
"models",
"README.md",
".github",
".git",
"views",
"manifest_lock.lkml",
"manifest.lkml",
]
)


def test_github_branch():
config = GitHubInfo(
repo="owner/repo",
)
assert config.branch_for_clone is None

config = GitHubInfo(
repo="owner/repo",
branch="main",
)
assert config.branch_for_clone == "main"