From 3628e2b813d5a6157e5cec9777ca6ecee950b211 Mon Sep 17 00:00:00 2001 From: Andrew Thornton Date: Thu, 26 Jan 2023 21:31:15 +0000 Subject: [PATCH 1/8] Improve checkIfPRContentChanged The code for checking if a commit has caused a change in a PR is extremely inefficient and affects the head repository instead of using a temporary repository. This PR therefore makes several significant improvements: * A temporary repo like that used in merging. * The diff code is then significant improved to use a three-way diff instead of comparing diffs (possibly binary) line-by-line - in memory... Ref #22578 Signed-off-by: Andrew Thornton --- services/pull/pull.go | 84 +++++++++++++++---------------------------- 1 file changed, 28 insertions(+), 56 deletions(-) diff --git a/services/pull/pull.go b/services/pull/pull.go index 7f81def6d66cc..9b4252416d5c8 100644 --- a/services/pull/pull.go +++ b/services/pull/pull.go @@ -4,14 +4,11 @@ package pull import ( - "bufio" - "bytes" "context" "fmt" "io" "regexp" "strings" - "time" "code.gitea.io/gitea/models" "code.gitea.io/gitea/models/db" @@ -351,72 +348,47 @@ func AddTestPullRequestTask(doer *user_model.User, repoID int64, branch string, // checkIfPRContentChanged checks if diff to target branch has changed by push // A commit can be considered to leave the PR untouched if the patch/diff with its merge base is unchanged func checkIfPRContentChanged(ctx context.Context, pr *issues_model.PullRequest, oldCommitID, newCommitID string) (hasChanged bool, err error) { - if err = pr.LoadHeadRepo(ctx); err != nil { - return false, fmt.Errorf("LoadHeadRepo: %w", err) - } else if pr.HeadRepo == nil { - // corrupt data assumed changed - return true, nil - } - - if err = pr.LoadBaseRepo(ctx); err != nil { - return false, fmt.Errorf("LoadBaseRepo: %w", err) - } - - headGitRepo, err := git.OpenRepository(ctx, pr.HeadRepo.RepoPath()) + tmpBasePath, err := createTemporaryRepo(ctx, pr) if err != nil { - return false, fmt.Errorf("OpenRepository: %w", err) - } - defer headGitRepo.Close() - - // Add a temporary remote. - tmpRemote := "checkIfPRContentChanged-" + fmt.Sprint(time.Now().UnixNano()) - if err = headGitRepo.AddRemote(tmpRemote, pr.BaseRepo.RepoPath(), true); err != nil { - return false, fmt.Errorf("AddRemote: %s/%s-%s: %w", pr.HeadRepo.OwnerName, pr.HeadRepo.Name, tmpRemote, err) + log.Error("CreateTemporaryPath: %v", err) + return false, err } defer func() { - if err := headGitRepo.RemoveRemote(tmpRemote); err != nil { - log.Error("checkIfPRContentChanged: RemoveRemote: %s/%s-%s: %v", pr.HeadRepo.OwnerName, pr.HeadRepo.Name, tmpRemote, err) + if err := repo_module.RemoveTemporaryPath(tmpBasePath); err != nil { + log.Error("checkIfPRContentChanged: RemoveTemporaryPath: %s", err) } }() - // To synchronize repo and get a base ref - _, base, err := headGitRepo.GetMergeBase(tmpRemote, pr.BaseBranch, pr.HeadBranch) + + tmpRepo, err := git.OpenRepository(ctx, tmpBasePath) if err != nil { - return false, fmt.Errorf("GetMergeBase: %w", err) + return false, fmt.Errorf("OpenRepository: %w", err) } + defer tmpRepo.Close() - diffBefore := &bytes.Buffer{} - diffAfter := &bytes.Buffer{} - if err := headGitRepo.GetDiffFromMergeBase(base, oldCommitID, diffBefore); err != nil { - // If old commit not found, assume changed. - log.Debug("GetDiffFromMergeBase: %v", err) - return true, nil - } - if err := headGitRepo.GetDiffFromMergeBase(base, newCommitID, diffAfter); err != nil { - // New commit should be found - return false, fmt.Errorf("GetDiffFromMergeBase: %w", err) + // Find the merge-base + _, base, err := tmpRepo.GetMergeBase("", "base", "tracking") + if err != nil { + return false, fmt.Errorf("GetMergeBase: %w", err) } - diffBeforeLines := bufio.NewScanner(diffBefore) - diffAfterLines := bufio.NewScanner(diffAfter) - - for diffBeforeLines.Scan() && diffAfterLines.Scan() { - if strings.HasPrefix(diffBeforeLines.Text(), "index") && strings.HasPrefix(diffAfterLines.Text(), "index") { - // file hashes can change without the diff changing - continue - } else if strings.HasPrefix(diffBeforeLines.Text(), "@@") && strings.HasPrefix(diffAfterLines.Text(), "@@") { - // the location of the difference may change - continue - } else if !bytes.Equal(diffBeforeLines.Bytes(), diffAfterLines.Bytes()) { - return true, nil - } - } + cmd := git.NewCommand(ctx, "diff", "--name-only", "-1").AddDynamicArguments(newCommitID, oldCommitID, base) + stdout, stderr, err := cmd.RunStdString(&git.RunOpts{ + Dir: tmpBasePath, + }) + if err != nil { + log.Error("Unable to run diff on %s %s %s in tempRepo for PR[%d]%s/%s...%s/%s: stdout %s stderr %s Error: %v", + newCommitID, oldCommitID, base, + pr.ID, pr.BaseRepo.FullName(), pr.BaseBranch, pr.HeadRepo.FullName(), pr.HeadBranch, + stdout, stderr, err) - if diffBeforeLines.Scan() || diffAfterLines.Scan() { - // Diffs not of equal length - return true, nil + // New commit should be found + return false, fmt.Errorf("Unable to run diff on %s %s %s in tempRepo for PR[%d]%s/%s...%s/%s: %w", + newCommitID, oldCommitID, base, + pr.ID, pr.BaseRepo.FullName(), pr.BaseBranch, pr.HeadRepo.FullName(), pr.HeadBranch, + err) } - return false, nil + return strings.TrimSpace(stdout) == "", nil } // PushToBaseRepo pushes commits from branches of head repository to From 6754c2783e79e2c840a421cb8f0d98829dfd5de5 Mon Sep 17 00:00:00 2001 From: Andrew Thornton Date: Thu, 26 Jan 2023 21:55:09 +0000 Subject: [PATCH 2/8] not-equals Signed-off-by: Andrew Thornton --- services/pull/pull.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/pull/pull.go b/services/pull/pull.go index 9b4252416d5c8..d9ed8c2387d0f 100644 --- a/services/pull/pull.go +++ b/services/pull/pull.go @@ -388,7 +388,7 @@ func checkIfPRContentChanged(ctx context.Context, pr *issues_model.PullRequest, err) } - return strings.TrimSpace(stdout) == "", nil + return strings.TrimSpace(stdout) != "", nil } // PushToBaseRepo pushes commits from branches of head repository to From 5d2dd5e9d4ae7c5c6dd0fb493848dd9cfa938d8d Mon Sep 17 00:00:00 2001 From: Andrew Thornton Date: Thu, 26 Jan 2023 22:32:00 +0000 Subject: [PATCH 3/8] use pipeline to properly prevent reading in too many files names Signed-off-by: Andrew Thornton --- services/pull/pull.go | 47 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 40 insertions(+), 7 deletions(-) diff --git a/services/pull/pull.go b/services/pull/pull.go index d9ed8c2387d0f..9fadb273afce2 100644 --- a/services/pull/pull.go +++ b/services/pull/pull.go @@ -4,9 +4,11 @@ package pull import ( + "bytes" "context" "fmt" "io" + "os" "regexp" "strings" @@ -371,15 +373,46 @@ func checkIfPRContentChanged(ctx context.Context, pr *issues_model.PullRequest, return false, fmt.Errorf("GetMergeBase: %w", err) } - cmd := git.NewCommand(ctx, "diff", "--name-only", "-1").AddDynamicArguments(newCommitID, oldCommitID, base) - stdout, stderr, err := cmd.RunStdString(&git.RunOpts{ - Dir: tmpBasePath, - }) + cmd := git.NewCommand(ctx, "diff", "--name-only").AddDynamicArguments(newCommitID, oldCommitID, base) + stdoutReader, stdoutWriter, err := os.Pipe() if err != nil { - log.Error("Unable to run diff on %s %s %s in tempRepo for PR[%d]%s/%s...%s/%s: stdout %s stderr %s Error: %v", + return false, fmt.Errorf("unable to open pipe for to run diff: %w", err) + } + + changedErr := fmt.Errorf("changed files") + + if err := cmd.Run(&git.RunOpts{ + Dir: tmpBasePath, + Stdout: stdoutWriter, + PipelineFunc: func(ctx context.Context, cancel context.CancelFunc) error { + _ = stdoutWriter.Close() + defer func() { + _ = stdoutReader.Close() + }() + buf := make([]byte, 1024) + for { + n, err := stdoutReader.Read(buf) + if err != nil { + if err == io.EOF { + return nil + } + return err + } + + ts := bytes.TrimSpace(buf[:n]) + if len(ts) > 0 { + return changedErr + } + } + }, + }); err != nil { + if err == changedErr { + return true, nil + } + log.Error("Unable to run diff on %s %s %s in tempRepo for PR[%d]%s/%s...%s/%s: Error: %v", newCommitID, oldCommitID, base, pr.ID, pr.BaseRepo.FullName(), pr.BaseBranch, pr.HeadRepo.FullName(), pr.HeadBranch, - stdout, stderr, err) + err) // New commit should be found return false, fmt.Errorf("Unable to run diff on %s %s %s in tempRepo for PR[%d]%s/%s...%s/%s: %w", @@ -388,7 +421,7 @@ func checkIfPRContentChanged(ctx context.Context, pr *issues_model.PullRequest, err) } - return strings.TrimSpace(stdout) != "", nil + return false, nil } // PushToBaseRepo pushes commits from branches of head repository to From 0b21de075b75b625e481ff5979a4cfcc2bff67fe Mon Sep 17 00:00:00 2001 From: zeripath Date: Fri, 27 Jan 2023 07:08:30 +0000 Subject: [PATCH 4/8] Update services/pull/pull.go Co-authored-by: delvh --- services/pull/pull.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/pull/pull.go b/services/pull/pull.go index 9fadb273afce2..9fabaec244863 100644 --- a/services/pull/pull.go +++ b/services/pull/pull.go @@ -352,7 +352,7 @@ func AddTestPullRequestTask(doer *user_model.User, repoID int64, branch string, func checkIfPRContentChanged(ctx context.Context, pr *issues_model.PullRequest, oldCommitID, newCommitID string) (hasChanged bool, err error) { tmpBasePath, err := createTemporaryRepo(ctx, pr) if err != nil { - log.Error("CreateTemporaryPath: %v", err) + log.Error("CreateTemporaryRepo: %v", err) return false, err } defer func() { From c2d786e44c8ac3cf891bd797f4644367e98c4067 Mon Sep 17 00:00:00 2001 From: Andrew Thornton Date: Fri, 27 Jan 2023 19:46:51 +0000 Subject: [PATCH 5/8] as per delvh Signed-off-by: Andrew Thornton --- modules/util/io.go | 22 ++++++++++++++++++++++ services/pull/pull.go | 18 ++---------------- 2 files changed, 24 insertions(+), 16 deletions(-) diff --git a/modules/util/io.go b/modules/util/io.go index e5d7561beff63..cff8c49697798 100644 --- a/modules/util/io.go +++ b/modules/util/io.go @@ -4,6 +4,7 @@ package util import ( + "errors" "io" ) @@ -17,3 +18,24 @@ func ReadAtMost(r io.Reader, buf []byte) (n int, err error) { } return n, err } + +// ErrNotEmpty is an error reported when there is a non-empty reader +var ErrNotEmpty = errors.New("not-empty") + +// EmptyReader reads a reader and ensures it is empty +func EmptyReader(r io.Reader) (err error) { + var buf [1024]byte + + for { + n, err := r.Read(buf[:]) + if err != nil { + if err == io.EOF { + return nil + } + return err + } + if n > 0 { + return ErrNotEmpty + } + } +} diff --git a/services/pull/pull.go b/services/pull/pull.go index 9fabaec244863..4ac2a81f1399e 100644 --- a/services/pull/pull.go +++ b/services/pull/pull.go @@ -4,7 +4,6 @@ package pull import ( - "bytes" "context" "fmt" "io" @@ -28,6 +27,7 @@ import ( repo_module "code.gitea.io/gitea/modules/repository" "code.gitea.io/gitea/modules/setting" "code.gitea.io/gitea/modules/sync" + "code.gitea.io/gitea/modules/util" issue_service "code.gitea.io/gitea/services/issue" ) @@ -389,21 +389,7 @@ func checkIfPRContentChanged(ctx context.Context, pr *issues_model.PullRequest, defer func() { _ = stdoutReader.Close() }() - buf := make([]byte, 1024) - for { - n, err := stdoutReader.Read(buf) - if err != nil { - if err == io.EOF { - return nil - } - return err - } - - ts := bytes.TrimSpace(buf[:n]) - if len(ts) > 0 { - return changedErr - } - } + return util.EmptyReader(stdoutReader) }, }); err != nil { if err == changedErr { From 98ac32506d1545d36f0faa34fe1002235a03d8e7 Mon Sep 17 00:00:00 2001 From: Andrew Thornton Date: Fri, 27 Jan 2023 19:51:47 +0000 Subject: [PATCH 6/8] more Signed-off-by: Andrew Thornton --- modules/util/io.go | 4 ++-- services/pull/pull.go | 10 ++++------ 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/modules/util/io.go b/modules/util/io.go index cff8c49697798..89bef0d42e0a0 100644 --- a/modules/util/io.go +++ b/modules/util/io.go @@ -22,8 +22,8 @@ func ReadAtMost(r io.Reader, buf []byte) (n int, err error) { // ErrNotEmpty is an error reported when there is a non-empty reader var ErrNotEmpty = errors.New("not-empty") -// EmptyReader reads a reader and ensures it is empty -func EmptyReader(r io.Reader) (err error) { +// IsEmptyReader reads a reader and ensures it is empty +func IsEmptyReader(r io.Reader) (err error) { var buf [1024]byte for { diff --git a/services/pull/pull.go b/services/pull/pull.go index 4ac2a81f1399e..19d9c5a596019 100644 --- a/services/pull/pull.go +++ b/services/pull/pull.go @@ -373,14 +373,12 @@ func checkIfPRContentChanged(ctx context.Context, pr *issues_model.PullRequest, return false, fmt.Errorf("GetMergeBase: %w", err) } - cmd := git.NewCommand(ctx, "diff", "--name-only").AddDynamicArguments(newCommitID, oldCommitID, base) + cmd := git.NewCommand(ctx, "diff", "--name-only", "-z").AddDynamicArguments(newCommitID, oldCommitID, base) stdoutReader, stdoutWriter, err := os.Pipe() if err != nil { return false, fmt.Errorf("unable to open pipe for to run diff: %w", err) } - changedErr := fmt.Errorf("changed files") - if err := cmd.Run(&git.RunOpts{ Dir: tmpBasePath, Stdout: stdoutWriter, @@ -389,18 +387,18 @@ func checkIfPRContentChanged(ctx context.Context, pr *issues_model.PullRequest, defer func() { _ = stdoutReader.Close() }() - return util.EmptyReader(stdoutReader) + return util.IsEmptyReader(stdoutReader) }, }); err != nil { - if err == changedErr { + if err == util.ErrNotEmpty { return true, nil } + log.Error("Unable to run diff on %s %s %s in tempRepo for PR[%d]%s/%s...%s/%s: Error: %v", newCommitID, oldCommitID, base, pr.ID, pr.BaseRepo.FullName(), pr.BaseBranch, pr.HeadRepo.FullName(), pr.HeadBranch, err) - // New commit should be found return false, fmt.Errorf("Unable to run diff on %s %s %s in tempRepo for PR[%d]%s/%s...%s/%s: %w", newCommitID, oldCommitID, base, pr.ID, pr.BaseRepo.FullName(), pr.BaseBranch, pr.HeadRepo.FullName(), pr.HeadBranch, From 54410ce9e9aa35eb06e649c7a603b4441e100b5f Mon Sep 17 00:00:00 2001 From: Andrew Thornton Date: Fri, 27 Jan 2023 20:05:51 +0000 Subject: [PATCH 7/8] remove duplication in error report Signed-off-by: Andrew Thornton --- services/pull/pull.go | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/services/pull/pull.go b/services/pull/pull.go index 19d9c5a596019..c983c4f3e78ec 100644 --- a/services/pull/pull.go +++ b/services/pull/pull.go @@ -399,10 +399,7 @@ func checkIfPRContentChanged(ctx context.Context, pr *issues_model.PullRequest, pr.ID, pr.BaseRepo.FullName(), pr.BaseBranch, pr.HeadRepo.FullName(), pr.HeadBranch, err) - return false, fmt.Errorf("Unable to run diff on %s %s %s in tempRepo for PR[%d]%s/%s...%s/%s: %w", - newCommitID, oldCommitID, base, - pr.ID, pr.BaseRepo.FullName(), pr.BaseBranch, pr.HeadRepo.FullName(), pr.HeadBranch, - err) + return false, fmt.Errorf("Unable to run git diff --name-only -z %s %s %s: %w", newCommitID, oldCommitID, base, err) } return false, nil From 2dc44c6f514523588892beb6aaf5423ab1b9ac36 Mon Sep 17 00:00:00 2001 From: zeripath Date: Sat, 28 Jan 2023 10:47:17 +0000 Subject: [PATCH 8/8] Update modules/util/io.go --- modules/util/io.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/util/io.go b/modules/util/io.go index 89bef0d42e0a0..69b1d63145a49 100644 --- a/modules/util/io.go +++ b/modules/util/io.go @@ -24,7 +24,7 @@ var ErrNotEmpty = errors.New("not-empty") // IsEmptyReader reads a reader and ensures it is empty func IsEmptyReader(r io.Reader) (err error) { - var buf [1024]byte + var buf [1]byte for { n, err := r.Read(buf[:])