diff --git a/pkg/sources/github/github.go b/pkg/sources/github/github.go index a6eba956e7c7..e201bb5bcc6b 100644 --- a/pkg/sources/github/github.go +++ b/pkg/sources/github/github.go @@ -16,6 +16,7 @@ import ( "github.com/go-logr/logr" "github.com/gobwas/glob" "github.com/google/go-github/v63/github" + "github.com/trufflesecurity/trufflehog/v3/pkg/handlers" "golang.org/x/exp/rand" "golang.org/x/sync/errgroup" "google.golang.org/protobuf/proto" @@ -1392,35 +1393,34 @@ func (s *Source) scanTarget(ctx context.Context, target sources.ChunkingTarget, return fmt.Errorf("invalid GitHub URL") } - qry := commitQuery{ - repo: segments[2], - owner: segments[1], - sha: meta.GetCommit(), - filename: meta.GetFile(), + readCloser, resp, err := s.connector.APIClient().Repositories.DownloadContents( + ctx, + segments[1], + segments[2], + meta.GetFile(), + &github.RepositoryContentGetOptions{Ref: meta.GetCommit()}) + // As of this writing, if the returned readCloser is not nil, it's just the Body of the returned github.Response, so + // there's no need to independently close it. + if resp != nil && resp.Body != nil { + defer resp.Body.Close() } - res, err := s.getDiffForFileInCommit(ctx, qry) if err != nil { - return err + return fmt.Errorf("could not download file for scan: %w", err) + } + if resp.StatusCode != http.StatusOK { + return fmt.Errorf("unexpected HTTP response status when trying to download file for scan: %v", resp.Status) } - chunk := &sources.Chunk{ + + reporter := sources.ChanReporter{Ch: chunksChan} + chunkSkel := sources.Chunk{ SourceType: s.Type(), SourceName: s.name, SourceID: s.SourceID(), JobID: s.JobID(), SecretID: target.SecretID, - Data: []byte(stripLeadingPlusMinus(res)), SourceMetadata: &source_metadatapb.MetaData{ Data: &source_metadatapb.MetaData_Github{Github: meta}, }, - Verify: s.verify, - } - - return common.CancellableWrite(ctx, chunksChan, chunk) -} - -// stripLeadingPlusMinus removes leading + and - characters from lines in a diff string. These characters exist in the -// diffs returned when performing a targeted scan and need to be removed so that detectors are operating on the correct -// text. -func stripLeadingPlusMinus(diff string) string { - return strings.ReplaceAll(strings.ReplaceAll(diff, "\n+", "\n"), "\n-", "\n") + Verify: s.verify} + return handlers.HandleFile(ctx, readCloser, &chunkSkel, reporter) } diff --git a/pkg/sources/github/github_integration_test.go b/pkg/sources/github/github_integration_test.go index cfc6fe01703b..9fff037c321b 100644 --- a/pkg/sources/github/github_integration_test.go +++ b/pkg/sources/github/github_integration_test.go @@ -758,6 +758,24 @@ func TestSource_Chunks_TargetedScan(t *testing.T) { }, wantChunks: 1, }, + { + name: "targeted scan, binary file", + init: init{ + name: "test source", + connection: &sourcespb.GitHub{Credential: &sourcespb.GitHub_Token{Token: githubToken}}, + queryCriteria: &source_metadatapb.MetaData{ + Data: &source_metadatapb.MetaData_Github{ + Github: &source_metadatapb.Github{ + Repository: "https://github.com/truffle-sandbox/test-secrets.git", + Link: "https://github.com/truffle-sandbox/test-secrets/blob/70bef8590f87257c0992eecc7db529827a12b801/null_text_w_ptp.ipynb", + Commit: "70bef8590f87257c0992eecc7db529827a12b801", + File: "null_text_w_ptp.ipynb", + }, + }, + }, + }, + wantChunks: 607, + }, { name: "no file in commit", init: init{ diff --git a/pkg/sources/github/repo.go b/pkg/sources/github/repo.go index 67e7728816c7..79be8653ffcb 100644 --- a/pkg/sources/github/repo.go +++ b/pkg/sources/github/repo.go @@ -281,60 +281,6 @@ func (s *Source) wikiIsReachable(ctx context.Context, repoURL string) bool { return wikiURL == res.Request.URL.String() } -// commitQuery represents the details required to fetch a commit. -type commitQuery struct { - repo string - owner string - sha string - filename string -} - -// getDiffForFileInCommit retrieves the diff for a specified file in a commit. -// If the file or its diff is not found, it returns an error. -func (s *Source) getDiffForFileInCommit(ctx context.Context, query commitQuery) (string, error) { - var ( - commit *github.RepositoryCommit - err error - ) - for { - commit, _, err = s.connector.APIClient().Repositories.GetCommit(ctx, query.owner, query.repo, query.sha, nil) - if s.handleRateLimit(err) { - continue - } - if err != nil { - return "", fmt.Errorf("error fetching commit %s: %w", query.sha, err) - } - break - } - - if len(commit.Files) == 0 { - return "", fmt.Errorf("commit %s does not contain any files", query.sha) - } - - res := new(strings.Builder) - // Only return the diff if the file is in the commit. - for _, file := range commit.Files { - if *file.Filename != query.filename { - continue - } - - if file.Patch == nil { - return "", fmt.Errorf("commit %s file %s does not have a diff", query.sha, query.filename) - } - - if _, err := res.WriteString(*file.Patch); err != nil { - return "", fmt.Errorf("buffer write error for commit %s file %s: %w", query.sha, query.filename, err) - } - res.WriteString("\n") - } - - if res.Len() == 0 { - return "", fmt.Errorf("commit %s does not contain patch for file %s", query.sha, query.filename) - } - - return res.String(), nil -} - func (s *Source) normalizeRepo(repo string) (string, error) { // If there's a '/', assume it's a URL and try to normalize it. if strings.ContainsRune(repo, '/') {