Skip to content

Commit

Permalink
Add batch querying for clearly defined to reduce ingestion time (#2088)
Browse files Browse the repository at this point in the history
* change CD certifier to batch query

Signed-off-by: pxp928 <parth.psu@gmail.com>

* rebase ingestor scanner to use certifier

Signed-off-by: pxp928 <parth.psu@gmail.com>

* update unit tests and change ent to not index on attribution for clearlyDefined

Signed-off-by: pxp928 <parth.psu@gmail.com>

* add query rate limit per service for certifier

Signed-off-by: pxp928 <parth.psu@gmail.com>

* add query limit for license scanner on ingestion

Signed-off-by: pxp928 <parth.psu@gmail.com>

* fix and order unit test for certifiers

Signed-off-by: pxp928 <parth.psu@gmail.com>

* add comments to functions

Signed-off-by: pxp928 <parth.psu@gmail.com>

* fix unit tests

Signed-off-by: pxp928 <parth.psu@gmail.com>

* add retry for ClearlyDefined

Signed-off-by: pxp928 <parth.psu@gmail.com>

* add constant query size for CD and OSV

Signed-off-by: pxp928 <parth.psu@gmail.com>

---------

Signed-off-by: pxp928 <parth.psu@gmail.com>
  • Loading branch information
pxp928 authored Aug 30, 2024
1 parent 7fe8848 commit 0f694a3
Show file tree
Hide file tree
Showing 28 changed files with 947 additions and 15,804 deletions.
14 changes: 13 additions & 1 deletion cmd/guaccollect/cmd/license.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,17 @@ import (
"github.com/guacsec/guac/pkg/certifier"
"github.com/guacsec/guac/pkg/certifier/certify"
"github.com/guacsec/guac/pkg/certifier/clearlydefined"
"github.com/guacsec/guac/pkg/certifier/components/root_package"
"github.com/guacsec/guac/pkg/cli"
"github.com/guacsec/guac/pkg/logging"
"github.com/spf13/cobra"
"github.com/spf13/viper"
)

const (
// cdQuerySize is the fixed query size passed to root_package.NewPackageQuery
// when building the package query for the ClearlyDefined certifier.
// NOTE(review): presumably the number of packages requested per
// ClearlyDefined batch call; the reason for 248 is not visible here — confirm.
cdQuerySize = 248
)

type cdOptions struct {
graphqlEndpoint string
headerFile string
Expand Down Expand Up @@ -98,7 +103,7 @@ you have access to read and write to the respective blob store.`,
httpClient := http.Client{Transport: transport}
gqlclient := graphql.NewClient(opts.graphqlEndpoint, &httpClient)

packageQueryFunc, err := getPackageQuery(gqlclient, opts.batchSize, opts.addedLatency)
packageQueryFunc, err := getCDPackageQuery(gqlclient, opts.batchSize, opts.addedLatency)
if err != nil {
logger.Errorf("error: %v", err)
os.Exit(1)
Expand All @@ -108,6 +113,13 @@ you have access to read and write to the respective blob store.`,
},
}

// getCDPackageQuery returns a factory that builds a fresh root-package query
// component each time it is called. Every query is created with the supplied
// GraphQL client, batch size and added latency, plus the package-level
// cdQuerySize. The returned error is always nil.
func getCDPackageQuery(client graphql.Client, batchSize int, addedLatency *time.Duration) (func() certifier.QueryComponents, error) {
	newQuery := func() certifier.QueryComponents {
		return root_package.NewPackageQuery(client, batchSize, cdQuerySize, addedLatency)
	}
	return newQuery, nil
}

func validateCDFlags(
graphqlEndpoint,
headerFile,
Expand Down
10 changes: 7 additions & 3 deletions cmd/guaccollect/cmd/osv.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,10 @@ import (
"github.com/spf13/viper"
)

const (
// osvQuerySize is the fixed query size passed to root_package.NewPackageQuery
// when building the package query for the OSV certifier.
// NOTE(review): presumably the number of packages requested per OSV batch
// call; the reason for 999 is not visible here — confirm.
osvQuerySize = 999
)

type osvOptions struct {
graphqlEndpoint string
headerFile string
Expand Down Expand Up @@ -107,7 +111,7 @@ you have access to read and write to the respective blob store.`,
httpClient := http.Client{Transport: transport}
gqlclient := graphql.NewClient(opts.graphqlEndpoint, &httpClient)

packageQueryFunc, err := getPackageQuery(gqlclient, opts.batchSize, opts.addedLatency)
packageQueryFunc, err := getOSVPackageQuery(gqlclient, opts.batchSize, opts.addedLatency)
if err != nil {
logger.Errorf("error: %v", err)
os.Exit(1)
Expand Down Expand Up @@ -164,9 +168,9 @@ func getCertifierPublish(ctx context.Context, blobStore *blob.BlobStore, pubsub
}, nil
}

func getPackageQuery(client graphql.Client, batchSize int, addedLatency *time.Duration) (func() certifier.QueryComponents, error) {
func getOSVPackageQuery(client graphql.Client, batchSize int, addedLatency *time.Duration) (func() certifier.QueryComponents, error) {
return func() certifier.QueryComponents {
packageQuery := root_package.NewPackageQuery(client, batchSize, addedLatency)
packageQuery := root_package.NewPackageQuery(client, batchSize, osvQuerySize, addedLatency)
return packageQuery
}, nil
}
Expand Down
14 changes: 8 additions & 6 deletions cmd/guacone/cmd/license.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,10 @@ import (
"github.com/spf13/viper"
)

const (
// cdQuerySize is the fixed query size passed to root_package.NewPackageQuery
// by the ClearlyDefined command in this file.
// NOTE(review): presumably the number of packages requested per
// ClearlyDefined batch call; the reason for 248 is not visible here — confirm.
cdQuerySize = 248
)

type cdOptions struct {
graphqlEndpoint string
headerFile string
Expand Down Expand Up @@ -96,7 +100,7 @@ var cdCmd = &cobra.Command{

httpClient := http.Client{Transport: transport}
gqlclient := graphql.NewClient(opts.graphqlEndpoint, &httpClient)
packageQuery := root_package.NewPackageQuery(gqlclient, opts.batchSize, opts.addedLatency)
packageQuery := root_package.NewPackageQuery(gqlclient, opts.batchSize, cdQuerySize, opts.addedLatency)

totalNum := 0
docChan := make(chan *processor.Document)
Expand Down Expand Up @@ -176,12 +180,10 @@ var cdCmd = &cobra.Command{

// Collect
errHandler := func(err error) bool {
if err == nil {
logger.Info("certifier ended gracefully")
return true
if err != nil {
logger.Errorf("certifier ended with error: %v", err)
atomic.StoreInt32(&gotErr, 1)
}
logger.Errorf("certifier ended with error: %v", err)
atomic.StoreInt32(&gotErr, 1)
// process documents already captured
return true
}
Expand Down
6 changes: 5 additions & 1 deletion cmd/guacone/cmd/osv.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,10 @@ import (
"github.com/spf13/viper"
)

const (
// osvQuerySize is the fixed query size passed to root_package.NewPackageQuery
// by the OSV command in this file.
// NOTE(review): presumably the number of packages requested per OSV batch
// call; the reason for 999 is not visible here — confirm.
osvQuerySize = 999
)

type osvOptions struct {
graphqlEndpoint string
headerFile string
Expand Down Expand Up @@ -96,7 +100,7 @@ var osvCmd = &cobra.Command{

httpClient := http.Client{Transport: transport}
gqlclient := graphql.NewClient(opts.graphqlEndpoint, &httpClient)
packageQuery := root_package.NewPackageQuery(gqlclient, opts.batchSize, opts.addedLatency)
packageQuery := root_package.NewPackageQuery(gqlclient, opts.batchSize, osvQuerySize, opts.addedLatency)

totalNum := 0
docChan := make(chan *processor.Document)
Expand Down
2 changes: 1 addition & 1 deletion internal/testing/cmd/pubsub_test/cmd/osv.go
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ func getCertifierPublish(ctx context.Context, blobStore *blob.BlobStore, pubsub

func getPackageQuery(client graphql.Client) (func() certifier.QueryComponents, error) {
return func() certifier.QueryComponents {
packageQuery := root_package.NewPackageQuery(client, 60000, nil)
packageQuery := root_package.NewPackageQuery(client, 60000, 999, nil)
return packageQuery
}, nil
}
Expand Down
18 changes: 18 additions & 0 deletions internal/testing/dochelper/dochelper.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
package dochelper

import (
"errors"
"fmt"
"reflect"
"time"
Expand Down Expand Up @@ -141,6 +142,23 @@ func DocNode(v *processor.Document, children ...*processor.DocumentNode) *proces
}
}

// minimalDocument mirrors only the part of a JSON document that ExtractURI
// needs: the top-level "subject" array and each entry's "uri" field. All other
// fields in the input are ignored during decoding.
type minimalDocument struct {
	Subject []struct {
		URI string `json:"uri"`
	} `json:"subject"`
}

// ExtractURI decodes blob as JSON and returns the "uri" of the first entry in
// its top-level "subject" array.
//
// It returns an error when blob cannot be decoded, or when the "subject" array
// is missing or empty. The decode error is wrapped with %w so the message says
// which step failed while the underlying cause stays inspectable through
// errors.Is / errors.As.
func ExtractURI(blob []byte) (string, error) {
	var doc minimalDocument
	if err := json.Unmarshal(blob, &doc); err != nil {
		return "", fmt.Errorf("extracting subject URI: %w", err)
	}
	if len(doc.Subject) == 0 {
		return "", errors.New("no subject found in document")
	}
	// Only the first subject is consulted; any further entries are ignored.
	return doc.Subject[0].URI, nil
}

func DocEqualWithTimestamp(gotDoc, wantDoc *processor.Document) (bool, error) {
var testTime = time.Unix(1597826280, 0)

Expand Down
Loading

0 comments on commit 0f694a3

Please sign in to comment.