Skip to content

Commit

Permalink
feat(python): parse licenses from dist-info folder (#4724)
Browse files Browse the repository at this point in the history
Signed-off-by: knqyf263 <knqyf263@gmail.com>
Co-authored-by: knqyf263 <knqyf263@gmail.com>
  • Loading branch information
nikpivkin and knqyf263 authored Jan 4, 2024
1 parent fa2e883 commit df3e90a
Show file tree
Hide file tree
Showing 10 changed files with 636 additions and 68 deletions.
172 changes: 139 additions & 33 deletions pkg/fanal/analyzer/language/python/packaging/packaging.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,27 +4,43 @@ import (
"archive/zip"
"bytes"
"context"
"errors"
"io"
"io/fs"
"os"
"path"
"path/filepath"
"strings"

"github.com/samber/lo"
"golang.org/x/xerrors"

dio "github.com/aquasecurity/go-dep-parser/pkg/io"
"github.com/aquasecurity/go-dep-parser/pkg/python/packaging"
godeptypes "github.com/aquasecurity/go-dep-parser/pkg/types"
"github.com/aquasecurity/trivy/pkg/fanal/analyzer"
"github.com/aquasecurity/trivy/pkg/fanal/analyzer/language"
"github.com/aquasecurity/trivy/pkg/fanal/types"
"github.com/aquasecurity/trivy/pkg/licensing"
"github.com/aquasecurity/trivy/pkg/log"
"github.com/aquasecurity/trivy/pkg/utils/fsutils"
)

// init registers the packaging analyzer as a post-analyzer for
// analyzer.TypePythonPkg. Instances are built by newPackagingAnalyzer so that
// the parser and the license classifier confidence level are always set.
func init() {
	analyzer.RegisterPostAnalyzer(analyzer.TypePythonPkg, newPackagingAnalyzer)
}

// version is this analyzer's version number.
// NOTE(review): its consumer (presumably the analyzer's Version method) is
// outside this excerpt — confirm before relying on this description.
const version = 1

// newPackagingAnalyzer builds a packagingAnalyzer with a fresh packaging
// parser and the license classifier confidence level taken from
// opt.LicenseScannerOption. It never returns an error.
func newPackagingAnalyzer(opt analyzer.AnalyzerOptions) (analyzer.PostAnalyzer, error) {
	var a packagingAnalyzer
	a.pkgParser = packaging.NewParser()
	a.licenseClassifierConfidenceLevel = opt.LicenseScannerOption.ClassifierConfidenceLevel
	return &a, nil
}

var (
requiredFiles = []string{
eggFiles = []string{
// .egg format
// https://setuptools.readthedocs.io/en/latest/deprecated/python_eggs.html#eggs-and-their-formats
".egg", // zip format
Expand All @@ -34,35 +50,125 @@ var (
// https://setuptools.readthedocs.io/en/latest/deprecated/python_eggs.html#eggs-and-their-formats
".egg-info",
".egg-info/PKG-INFO",

// wheel
".dist-info/METADATA",
}
)

type packagingAnalyzer struct{}
type packagingAnalyzer struct {
pkgParser godeptypes.Parser
licenseClassifierConfidenceLevel float64
}

// PostAnalyze analyzes egg and wheel files.
func (a packagingAnalyzer) PostAnalyze(_ context.Context, input analyzer.PostAnalysisInput) (*analyzer.AnalysisResult, error) {

// Analyze analyzes egg and wheel files.
func (a packagingAnalyzer) Analyze(_ context.Context, input analyzer.AnalysisInput) (*analyzer.AnalysisResult, error) {
r := input.Content
var apps []types.Application

required := func(path string, _ fs.DirEntry) bool {
return filepath.Base(path) == "METADATA" || isEggFile(path)
}

err := fsutils.WalkDir(input.FS, ".", required, func(path string, d fs.DirEntry, r io.Reader) error {
rsa, ok := r.(dio.ReadSeekerAt)
if !ok {
return xerrors.New("invalid reader")
}

// .egg file is zip format and PKG-INFO needs to be extracted from the zip file.
if strings.HasSuffix(input.FilePath, ".egg") {
pkginfoInZip, err := a.analyzeEggZip(input.Content, input.Info.Size())
// .egg file is zip format and PKG-INFO needs to be extracted from the zip file.
if strings.HasSuffix(path, ".egg") {
info, err := d.Info()
if err != nil {
return xerrors.Errorf("egg file error: %w", err)
}
pkginfoInZip, err := a.analyzeEggZip(rsa, info.Size())
if err != nil {
return xerrors.Errorf("egg analysis error: %w", err)
}

// Egg archive may not contain required files, then we will get nil. Skip this archives
if pkginfoInZip == nil {
return nil
}
rsa = pkginfoInZip
}

app, err := a.parse(path, rsa, input.Options.FileChecksum)
if err != nil {
return nil, xerrors.Errorf("egg analysis error: %w", err)
return xerrors.Errorf("parse error: %w", err)
} else if app == nil {
return nil
}

if err := a.fillAdditionalData(input.FS, app); err != nil {
log.Logger.Warnf("Unable to collect additional info: %s", err)
}

// Egg archive may not contain required files, then we will get nil. Skip this archives
if pkginfoInZip == nil {
return nil, nil
apps = append(apps, *app)
return nil
})

if err != nil {
return nil, xerrors.Errorf("python package walk error: %w", err)
}
return &analyzer.AnalysisResult{
Applications: apps,
}, nil
}

// fillAdditionalData rewrites the Licenses of every library in app.
//
// Entries without a "file://" prefix are kept as license names. Entries with
// the prefix point to a license file: the file's base name is looked up next
// to app.FilePath in fsys and classified, and the names of the resulting
// findings replace the entry. Entries whose file yields no findings are
// dropped.
//
// The first classification error is returned immediately; libraries after the
// failing one are left untouched.
func (a packagingAnalyzer) fillAdditionalData(fsys fs.FS, app *types.Application) error {
	for i, lib := range app.Libraries {
		var licenses []string
		for _, lic := range lib.Licenses {
			// Parser adds `file://` prefix to filepath from `License-File` field
			// We need to read this file to find licenses
			// Otherwise, this is the name of the license
			if !strings.HasPrefix(lic, "file://") {
				licenses = append(licenses, lic)
				continue
			}
			licenseFilePath := path.Base(strings.TrimPrefix(lic, "file://"))

			findings, err := classifyLicense(app.FilePath, licenseFilePath, a.licenseClassifierConfidenceLevel, fsys)
			if err != nil {
				return err
			}
			if len(findings) == 0 {
				continue
			}

			// License found
			foundLicenses := lo.Map(findings, func(finding types.LicenseFinding, _ int) string {
				return finding.Name
			})
			licenses = append(licenses, foundLicenses...)
		}
		app.Libraries[i].Licenses = licenses
	}

	return nil
}

// classifyLicense opens the file licPath located in the parent directory of
// dir inside fsys and runs the license classifier on it with the given
// confidence level.
//
// It returns nil findings and a nil error when the file does not exist or
// when the classifier returns no result. Any other open or classification
// failure is returned as a wrapped error.
func classifyLicense(dir, licPath string, classifierConfidenceLevel float64, fsys fs.FS) (types.LicenseFindings, error) {
	// Note that fs.FS is always slashed regardless of the platform,
	// and path.Join should be used rather than filepath.Join.
	f, err := fsys.Open(path.Join(path.Dir(dir), licPath))
	if errors.Is(err, fs.ErrNotExist) {
		return nil, nil
	}
	if err != nil {
		return nil, xerrors.Errorf("file open error: %w", err)
	}
	defer f.Close()

	l, err := licensing.Classify(licPath, f, classifierConfidenceLevel)
	if err != nil {
		return nil, xerrors.Errorf("license classify error: %w", err)
	}
	if l == nil {
		return nil, nil
	}

	return l.Findings, nil
}

// parse hands r to language.ParsePackage as a types.PythonPkg located at
// filePath, using the analyzer's own parser. checksum is passed through
// unchanged.
func (a packagingAnalyzer) parse(filePath string, r dio.ReadSeekerAt, checksum bool) (*types.Application, error) {
	parser := a.pkgParser
	return language.ParsePackage(types.PythonPkg, filePath, r, parser, checksum)
}

func (a packagingAnalyzer) analyzeEggZip(r io.ReaderAt, size int64) (dio.ReadSeekerAt, error) {
Expand All @@ -71,17 +177,16 @@ func (a packagingAnalyzer) analyzeEggZip(r io.ReaderAt, size int64) (dio.ReadSee
return nil, xerrors.Errorf("zip reader error: %w", err)
}

for _, file := range zr.File {
if !a.Required(file.Name, nil) {
continue
}

return a.open(file)
found, ok := lo.Find(zr.File, func(f *zip.File) bool {
return isEggFile(f.Name)
})
if !ok {
return nil, nil
}

return nil, nil
return a.open(found)
}

// open reads the file content in the zip archive to make it seekable.
func (a packagingAnalyzer) open(file *zip.File) (dio.ReadSeekerAt, error) {
f, err := file.Open()
if err != nil {
Expand All @@ -98,12 +203,13 @@ func (a packagingAnalyzer) open(file *zip.File) (dio.ReadSeekerAt, error) {
}

// Required reports whether filePath is of interest to this analyzer: either
// the path contains ".dist-info" or it matches one of the egg suffixes
// checked by isEggFile. The file info argument is ignored.
func (a packagingAnalyzer) Required(filePath string, _ os.FileInfo) bool {
	return strings.Contains(filePath, ".dist-info") || isEggFile(filePath)
}

func isEggFile(filePath string) bool {
return lo.SomeBy(eggFiles, func(fileName string) bool {
return strings.HasSuffix(filePath, fileName)
})
}

func (a packagingAnalyzer) Type() analyzer.Type {
Expand Down
Loading

0 comments on commit df3e90a

Please sign in to comment.