Skip to content

Commit

Permalink
fix: addition of some commits are ridiculous huge (#7719)
Browse files Browse the repository at this point in the history
  • Loading branch information
klesh authored Jul 11, 2024
1 parent 87ca3d0 commit 5889a08
Show file tree
Hide file tree
Showing 3 changed files with 49 additions and 30 deletions.
50 changes: 24 additions & 26 deletions backend/plugins/gitextractor/parser/clone_gitcli.go
Original file line number Diff line number Diff line change
Expand Up @@ -89,25 +89,19 @@ func (g *GitcliCloner) CloneRepo(ctx plugin.SubTaskContext, localDir string) err

}

cmd, err := g.buildCloneCommand(ctx, localDir, since)
if err != nil {
return err
}
err = g.execCloneCommand(cmd)
err := g.execGitCloneCommand(ctx, localDir, since)
if err != nil {
return err
}
// deepen the commits by 1 more step to avoid https://github.com/apache/incubator-devlake/issues/7426
if since != nil {
// fixes error described on https://stackoverflow.com/questions/63878612/git-fatal-error-in-object-unshallow-sha-1
// It might be casued by the commit which being deepen has mulitple parent(e.g. a merge commit), not sure.
repackCmd := exec.CommandContext(ctx.GetContext(), "git", "-C", localDir, "repack", "-d")
if err := repackCmd.Run(); err != nil {
if err := g.execGitCommand(ctx, "-C", localDir, "repack", "-d"); err != nil {
return errors.Default.Wrap(err, "failed to repack the repo")
}
deepenCmd := exec.CommandContext(ctx.GetContext(), "git", "-C", localDir, "fetch", "--deepen=1")
// deepen would fail on a EMPTY repo, ignore the error
if err := deepenCmd.Run(); err != nil {
if err := g.execGitCommand(ctx, "-C", localDir, "fetch", "--deepen=1"); err != nil {
g.logger.Error(err, "failed to deepen the cloned repo")
}
}
Expand All @@ -119,9 +113,22 @@ func (g *GitcliCloner) CloneRepo(ctx plugin.SubTaskContext, localDir string) err
return nil
}

func (g *GitcliCloner) buildCloneCommand(ctx plugin.SubTaskContext, localDir string, since *time.Time) (*exec.Cmd, errors.Error) {
func (g *GitcliCloner) execGitCloneCommand(ctx plugin.SubTaskContext, localDir string, since *time.Time) errors.Error {
taskData := ctx.GetData().(*GitExtractorTaskData)
args := []string{"clone", taskData.Options.Url, localDir, "--bare", "--progress"}
if since != nil {
args = append(args, fmt.Sprintf("--shallow-since=%s", since.Format(time.RFC3339)))
}
// support time after and diff sync
// support skipping blobs collection
if *taskData.Options.SkipCommitStat {
args = append(args, "--filter=blob:none")
}
return g.execGitCommand(ctx, args...)
}

func (g *GitcliCloner) execGitCommand(ctx plugin.SubTaskContext, args ...string) errors.Error {
taskData := ctx.GetData().(*GitExtractorTaskData)
env := []string{}
// support proxy
if taskData.ParsedURL.Scheme == "http" || taskData.ParsedURL.Scheme == "https" {
Expand All @@ -136,7 +143,7 @@ func (g *GitcliCloner) buildCloneCommand(ctx plugin.SubTaskContext, localDir str
if taskData.Options.Proxy != "" {
parsedProxyURL, e := url.Parse(taskData.Options.Proxy)
if e != nil {
return nil, errors.BadInput.Wrap(e, "failed to parse the proxy URL")
return errors.BadInput.Wrap(e, "failed to parse the proxy URL")
}
proxyCommand := "corkscrew"
sshCmdArgs = append(sshCmdArgs, "-o", fmt.Sprintf(`ProxyCommand="%s %s %s %%h %%p"`, proxyCommand, parsedProxyURL.Hostname(), parsedProxyURL.Port()))
Expand All @@ -146,16 +153,16 @@ func (g *GitcliCloner) buildCloneCommand(ctx plugin.SubTaskContext, localDir str
pkFile, err := os.CreateTemp("", "gitext-pk")
if err != nil {
g.logger.Error(err, "create temp private key file error")
return nil, errors.Default.New("failed to handle the private key")
return errors.Default.New("failed to handle the private key")
}
if _, e := pkFile.WriteString(taskData.Options.PrivateKey + "\n"); e != nil {
g.logger.Error(err, "write private key file error")
return nil, errors.Default.New("failed to write the private key")
return errors.Default.New("failed to write the private key")
}
pkFile.Close()
if e := os.Chmod(pkFile.Name(), 0600); e != nil {
g.logger.Error(err, "chmod private key file error")
return nil, errors.Default.New("failed to modify the private key")
return errors.Default.New("failed to modify the private key")
}

if taskData.Options.Passphrase != "" {
Expand All @@ -169,7 +176,7 @@ func (g *GitcliCloner) buildCloneCommand(ctx plugin.SubTaskContext, localDir str
if ppout, pperr := pp.CombinedOutput(); pperr != nil {
g.logger.Error(pperr, "change private key passphrase error")
g.logger.Info(string(ppout))
return nil, errors.Default.New("failed to decrypt the private key")
return errors.Default.New("failed to decrypt the private key")
}
}
defer os.Remove(pkFile.Name())
Expand All @@ -179,22 +186,13 @@ func (g *GitcliCloner) buildCloneCommand(ctx plugin.SubTaskContext, localDir str
env = append(env, fmt.Sprintf("GIT_SSH_COMMAND=ssh %s", strings.Join(sshCmdArgs, " ")))
}
}
// support time after and diff sync
if since != nil {
args = append(args, fmt.Sprintf("--shallow-since=%s", since.Format(time.RFC3339)))
}
// support skipping blobs collection
if *taskData.Options.SkipCommitStat {
args = append(args, "--filter=blob:none")
}
// fmt.Printf("args: %v\n", args)
g.logger.Debug("git %v", args)
cmd := exec.CommandContext(ctx.GetContext(), "git", args...)
cmd.Env = env
return cmd, nil
return g.execCommand(cmd)
}

func (g *GitcliCloner) execCloneCommand(cmd *exec.Cmd) errors.Error {
func (g *GitcliCloner) execCommand(cmd *exec.Cmd) errors.Error {
stdout, err := cmd.StdoutPipe()
if err != nil {
g.logger.Error(err, "stdout pipe error")
Expand Down
14 changes: 14 additions & 0 deletions backend/plugins/gitextractor/parser/repo_gogit.go
Original file line number Diff line number Diff line change
Expand Up @@ -301,6 +301,20 @@ func (r *GogitRepoCollector) CollectCommits(subtaskCtx plugin.SubTaskContext) (e
default:
}
commitSha := commit.Hash.String()

if commit.NumParents() != 0 {
_, err := commit.Parents().Next()
if err != nil {
if err == plumbing.ErrObjectNotFound {
// Skip calculating commit statistics when there are parent commits, but the first one cannot be fetched from the ODB.
// This usually happens during a shallow clone for incremental collection. Otherwise, we might end up overwriting
// the correct addition/deletion data in the database with an absurdly large addition number.
r.logger.Info("skip commit %s because it has no parent commit", commitSha)
return nil
}
return err
}
}
codeCommit := &code.Commit{
Sha: commitSha,
Message: commit.Message,
Expand Down
15 changes: 11 additions & 4 deletions backend/plugins/gitextractor/parser/repo_libgit2.go
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,17 @@ func (r *Libgit2RepoCollector) CollectCommits(subtaskCtx plugin.SubTaskContext)
if commit == nil {
return nil
}
var parent *git.Commit
if commit.ParentCount() > 0 {
parent = commit.Parent(0)
// Skip calculating commit statistics when there are parent commits, but the first one cannot be fetched from the ODB.
// This usually happens during a shallow clone for incremental collection. Otherwise, we might end up overwriting
// the correct addition/deletion data in the database with an absurdly large addition number.
if parent == nil {
r.logger.Info("skip commit %s because it has no parent commit", commit.Id().String())
return nil
}
}
commitSha := commit.Id().String()
r.logger.Debug("process commit: %s", commitSha)
c := &code.Commit{
Expand All @@ -303,10 +314,6 @@ func (r *Libgit2RepoCollector) CollectCommits(subtaskCtx plugin.SubTaskContext)
if err != nil {
return err
}
var parent *git.Commit
if commit.ParentCount() > 0 {
parent = commit.Parent(0)
}

if !*taskOpts.SkipCommitStat {
var stats *git.DiffStats
Expand Down

0 comments on commit 5889a08

Please sign in to comment.