Skip to content

Commit

Permalink
feat(ingest/lookml): shallow clone repos (datahub-project#10888)
Browse files Browse the repository at this point in the history
  • Loading branch information
hsheth2 authored and aviv-julienjehannet committed Jul 25, 2024
1 parent 846375d commit d4caa1c
Showing 1 changed file with 26 additions and 11 deletions.
37 changes: 26 additions & 11 deletions metadata-ingestion/src/datahub/ingestion/source/git/git_import.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ def __init__(self, tmp_dir: str, skip_known_host_verification: bool = True):
def clone(
self, ssh_key: Optional[SecretStr], repo_url: str, branch: Optional[str] = None
) -> Path:
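# Note: with no branch this does a shallow clone (depth=1); with a branch it does a blobless clone (filter=blob:none) followed by a checkout.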

unique_dir = str(uuid4())
keys_dir = f"{self.tmp_dir}/{unique_dir}/keys"
checkout_dir = f"{self.tmp_dir}/{unique_dir}/checkout"
Expand Down Expand Up @@ -55,20 +57,33 @@ def clone(
)
logger.debug(f"ssh_command={git_ssh_cmd}")

logger.info(
f"⏳ Cloning repo '{self.sanitize_repo_url(repo_url)}', this can take some time..."
)
self.last_repo_cloned = git.Repo.clone_from(
repo_url,
checkout_dir,
env=dict(GIT_SSH_COMMAND=git_ssh_cmd),
)
logger.info("✅ Cloning complete!")

if branch is not None:
if branch is None:
logger.info(
f"⏳ Cloning repo '{self.sanitize_repo_url(repo_url)}' (default branch), this can take some time..."
)
self.last_repo_cloned = git.Repo.clone_from(
repo_url,
checkout_dir,
env=dict(GIT_SSH_COMMAND=git_ssh_cmd),
depth=1,
)
else:
# Because we accept branch names, tags, and commit hashes in the branch parameter,
# we can't just use the --branch flag of git clone (it does not accept commit hashes).
# Doing a blobless clone allows us to quickly check out the right commit.
logger.info(
f"⏳ Cloning repo '{self.sanitize_repo_url(repo_url)}' (branch: {branch}), this can take some time..."
)
self.last_repo_cloned = git.Repo.clone_from(
repo_url,
checkout_dir,
env=dict(GIT_SSH_COMMAND=git_ssh_cmd),
filter="blob:none",
)
logger.info(f"Checking out branch {branch}")
self.last_repo_cloned.git.checkout(branch)

logger.info("✅ Cloning complete!")
return pathlib.Path(checkout_dir)

def get_last_repo_cloned(self) -> Optional[git.Repo]:
Expand Down

0 comments on commit d4caa1c

Please sign in to comment.