Skip to content

Commit

Permalink
Implement verification cache (trufflesecurity#3801)
Browse files Browse the repository at this point in the history
This PR introduces a cache that allows the scanner to avoid emitting multiple requests to verify the same credential. In practice, it doesn't seem to reduce scan time at all, but it does seem to reduce the number of calls to FromData rather drastically.

The cache is implemented as an opt-out feature that can be disabled with a new CLI flag. If we don't like this, we can change it.

The metrics collection hopefully isn't too architecture-astronauty; I wanted to create something useful here that could also accommodate future Prometheus configuration without making the implementation all stupid.
  • Loading branch information
rosecodym authored Dec 20, 2024
1 parent eeb5c04 commit ddc015e
Show file tree
Hide file tree
Showing 10 changed files with 546 additions and 36 deletions.
53 changes: 40 additions & 13 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@ import (
"github.com/go-logr/logr"
"github.com/jpillora/overseer"
"github.com/mattn/go-isatty"
"github.com/trufflesecurity/trufflehog/v3/pkg/cache/simple"
"github.com/trufflesecurity/trufflehog/v3/pkg/detectors"
"github.com/trufflesecurity/trufflehog/v3/pkg/verificationcache"
"go.uber.org/automaxprocs/maxprocs"

"github.com/trufflesecurity/trufflehog/v3/pkg/analyzer"
Expand Down Expand Up @@ -76,6 +79,8 @@ var (
excludeDetectors = cli.Flag("exclude-detectors", "Comma separated list of detector types to exclude. Protobuf name or IDs may be used, as well as ranges. IDs defined here take precedence over the include list.").String()
jobReportFile = cli.Flag("output-report", "Write a scan report to the provided path.").Hidden().OpenFile(os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0666)

noVerificationCache = cli.Flag("no-verification-cache", "Disable verification caching").Bool()

// Add feature flags
forceSkipBinaries = cli.Flag("force-skip-binaries", "Force skipping binaries.").Bool()
forceSkipArchives = cli.Flag("force-skip-archives", "Force skipping archives.").Bool()
Expand Down Expand Up @@ -480,25 +485,32 @@ func run(state overseer.State) {
logFatal(err, "failed to configure results flag")
}

verificationCacheMetrics := verificationcache.InMemoryMetrics{}

engConf := engine.Config{
Concurrency: *concurrency,
// The engine must always be configured with the list of
// default detectors, which can be further filtered by the
// user. The filters are applied by the engine and are only
// subtractive.
Detectors: append(defaults.DefaultDetectors(), conf.Detectors...),
Verify: !*noVerification,
IncludeDetectors: *includeDetectors,
ExcludeDetectors: *excludeDetectors,
CustomVerifiersOnly: *customVerifiersOnly,
VerifierEndpoints: *verifiers,
Dispatcher: engine.NewPrinterDispatcher(printer),
FilterUnverified: *filterUnverified,
FilterEntropy: *filterEntropy,
VerificationOverlap: *allowVerificationOverlap,
Results: parsedResults,
PrintAvgDetectorTime: *printAvgDetectorTime,
ShouldScanEntireChunk: *scanEntireChunk,
Detectors: append(defaults.DefaultDetectors(), conf.Detectors...),
Verify: !*noVerification,
IncludeDetectors: *includeDetectors,
ExcludeDetectors: *excludeDetectors,
CustomVerifiersOnly: *customVerifiersOnly,
VerifierEndpoints: *verifiers,
Dispatcher: engine.NewPrinterDispatcher(printer),
FilterUnverified: *filterUnverified,
FilterEntropy: *filterEntropy,
VerificationOverlap: *allowVerificationOverlap,
Results: parsedResults,
PrintAvgDetectorTime: *printAvgDetectorTime,
ShouldScanEntireChunk: *scanEntireChunk,
VerificationCacheMetrics: &verificationCacheMetrics,
}

if !*noVerificationCache {
engConf.VerificationResultCache = simple.NewCache[detectors.Result]()
}

if *compareDetectionStrategies {
Expand All @@ -518,6 +530,20 @@ func run(state overseer.State) {
logFatal(err, "error running scan")
}

verificationCacheMetrics := struct {
Hits int32
Misses int32
HitsWasted int32
AttemptsSaved int32
VerificationTimeSpentMS int64
}{
Hits: verificationCacheMetrics.ResultCacheHits.Load(),
Misses: verificationCacheMetrics.ResultCacheMisses.Load(),
HitsWasted: verificationCacheMetrics.ResultCacheHitsWasted.Load(),
AttemptsSaved: verificationCacheMetrics.CredentialVerificationsSaved.Load(),
VerificationTimeSpentMS: verificationCacheMetrics.FromDataVerifyTimeSpentMS.Load(),
}

// Print results.
logger.Info("finished scanning",
"chunks", metrics.ChunksScanned,
Expand All @@ -526,6 +552,7 @@ func run(state overseer.State) {
"unverified_secrets", metrics.UnverifiedSecretsFound,
"scan_duration", metrics.ScanDuration.String(),
"trufflehog_version", version.BuildVersion,
"verification_caching", verificationCacheMetrics,
)

if metrics.hasFoundResults && *fail {
Expand Down
13 changes: 12 additions & 1 deletion pkg/detectors/detectors.go
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,9 @@ type Result struct {
// DetectorName is the name of the Detector. Used for custom detectors.
DetectorName string
Verified bool
// VerificationFromCache indicates whether this result's verification result came from the verification cache rather
// than an actual remote request.
VerificationFromCache bool
// Raw contains the raw secret identifier data. Prefer IDs over secrets since it is used for deduping after hashing.
Raw []byte
// RawV2 contains the raw secret identifier that is a combination of both the ID and the secret.
Expand All @@ -111,7 +114,15 @@ type Result struct {
AnalysisInfo map[string]string
}

// SetVerificationError is the only way to set a verification error. Any sensitive values should be passed-in as secrets to be redacted.
// CopyVerificationInfo overwrites the receiver's verification state with the state held by from. Two fields are
// copied: the exported Verified flag and the unexported verificationError. No other field of the receiver is
// touched. The verification cache uses this when it applies a cached verification outcome to a result. It has to
// be a method because verificationError is deliberately unexported (to prevent the accidental storage of
// sensitive information in it), so code outside this package cannot copy it directly.
func (r *Result) CopyVerificationInfo(from *Result) {
	r.Verified, r.verificationError = from.Verified, from.verificationError
}

// SetVerificationError is the only way to set a new verification error. Any sensitive values should be passed-in as secrets to be redacted.
func (r *Result) SetVerificationError(err error, secrets ...string) {
if err != nil {
r.verificationError = redactSecrets(err, secrets...)
Expand Down
21 changes: 17 additions & 4 deletions pkg/engine/engine.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import (
"github.com/adrg/strutil"
"github.com/adrg/strutil/metrics"
lru "github.com/hashicorp/golang-lru/v2"
"github.com/trufflesecurity/trufflehog/v3/pkg/verificationcache"
"google.golang.org/protobuf/proto"

"github.com/trufflesecurity/trufflehog/v3/pkg/common"
Expand Down Expand Up @@ -145,6 +146,9 @@ type Config struct {

// VerificationOverlapWorkerMultiplier is used to determine the number of verification overlap workers to spawn.
VerificationOverlapWorkerMultiplier int

VerificationResultCache verificationcache.ResultCache
VerificationCacheMetrics verificationcache.MetricsReporter
}

// Engine represents the core scanning engine responsible for detecting secrets in input data.
Expand All @@ -153,9 +157,10 @@ type Config struct {
// customization through various options and configurations.
type Engine struct {
// CLI flags.
concurrency int
decoders []decoders.Decoder
detectors []detectors.Detector
concurrency int
decoders []decoders.Decoder
detectors []detectors.Detector
verificationCache *verificationcache.VerificationCache
// Any detectors configured to override sources' verification flags
detectorVerificationOverrides map[config.DetectorID]bool

Expand Down Expand Up @@ -216,10 +221,13 @@ type Engine struct {

// NewEngine creates a new Engine instance with the provided configuration.
func NewEngine(ctx context.Context, cfg *Config) (*Engine, error) {
verificationCache := verificationcache.New(cfg.VerificationResultCache, cfg.VerificationCacheMetrics)

engine := &Engine{
concurrency: cfg.Concurrency,
decoders: cfg.Decoders,
detectors: cfg.Detectors,
verificationCache: verificationCache,
dispatcher: cfg.Dispatcher,
verify: cfg.Verify,
filterUnverified: cfg.FilterUnverified,
Expand Down Expand Up @@ -1056,7 +1064,12 @@ func (e *Engine) detectChunk(ctx context.Context, data detectableChunk) {
t := time.AfterFunc(detectionTimeout+1*time.Second, func() {
ctx.Logger().Error(nil, "a detector ignored the context timeout")
})
results, err := data.detector.Detector.FromData(ctx, data.chunk.Verify, matchBytes)
results, err := e.verificationCache.FromData(
ctx,
data.detector.Detector,
data.chunk.Verify,
data.chunk.SecretID != 0,
matchBytes)
t.Stop()
cancel()
if err != nil {
Expand Down
38 changes: 20 additions & 18 deletions pkg/output/json.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,10 @@ func (p *JSONPrinter) Print(_ context.Context, r *detectors.ResultWithMetadata)
// DetectorDescription is the description of the Detector.
DetectorDescription string
// DecoderName is the string name of the DecoderType.
DecoderName string
Verified bool
VerificationError string `json:",omitempty"`
DecoderName string
Verified bool
VerificationError string `json:",omitempty"`
VerificationFromCache bool
// Raw contains the raw secret data.
Raw string
// RawV2 contains the raw secret identifier that is a combination of both the ID and the secret.
Expand All @@ -54,21 +55,22 @@ func (p *JSONPrinter) Print(_ context.Context, r *detectors.ResultWithMetadata)
ExtraData map[string]string
StructuredData *detectorspb.StructuredData
}{
SourceMetadata: r.SourceMetadata,
SourceID: r.SourceID,
SourceType: r.SourceType,
SourceName: r.SourceName,
DetectorType: r.DetectorType,
DetectorName: r.DetectorType.String(),
DetectorDescription: r.DetectorDescription,
DecoderName: r.DecoderType.String(),
Verified: r.Verified,
VerificationError: verificationErr,
Raw: string(r.Raw),
RawV2: string(r.RawV2),
Redacted: r.Redacted,
ExtraData: r.ExtraData,
StructuredData: r.StructuredData,
SourceMetadata: r.SourceMetadata,
SourceID: r.SourceID,
SourceType: r.SourceType,
SourceName: r.SourceName,
DetectorType: r.DetectorType,
DetectorName: r.DetectorType.String(),
DetectorDescription: r.DetectorDescription,
DecoderName: r.DecoderType.String(),
Verified: r.Verified,
VerificationError: verificationErr,
VerificationFromCache: r.VerificationFromCache,
Raw: string(r.Raw),
RawV2: string(r.RawV2),
Redacted: r.Redacted,
ExtraData: r.ExtraData,
StructuredData: r.StructuredData,
}
out, err := json.Marshal(v)
if err != nil {
Expand Down
4 changes: 4 additions & 0 deletions pkg/output/plain.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ var (
boldGreenPrinter = color.New(color.Bold, color.FgHiGreen)
whitePrinter = color.New(color.FgWhite)
boldWhitePrinter = color.New(color.Bold, color.FgWhite)
cyanPrinter = color.New(color.FgCyan)
)

// PlainPrinter is a printer that prints results in plain text format.
Expand Down Expand Up @@ -56,6 +57,9 @@ func (p *PlainPrinter) Print(_ context.Context, r *detectors.ResultWithMetadata)
yellowPrinter.Printf("Verification issue: %s\n", out.VerificationError)
}
}
if r.VerificationFromCache {
cyanPrinter.Print("(Verification info cached)\n")
}
printer.Printf("Detector Type: %s\n", out.DetectorType)
printer.Printf("Decoder Type: %s\n", out.DecoderType)
printer.Printf("Raw result: %s\n", whitePrinter.Sprint(out.Raw))
Expand Down
37 changes: 37 additions & 0 deletions pkg/verificationcache/in_memory_metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
package verificationcache

import (
"sync/atomic"
"time"
)

// InMemoryMetrics is a MetricsReporter that stores reported metrics in memory for retrieval at the end of a scan.
// Each field is an atomic counter that is incremented by the Add* method with the matching name and can be read
// with Load. The methods use pointer receivers, so share a *InMemoryMetrics rather than copying the struct.
type InMemoryMetrics struct {
// CredentialVerificationsSaved is the running total reported via AddCredentialVerificationsSaved.
CredentialVerificationsSaved atomic.Int32
// FromDataVerifyTimeSpentMS is the running total, in whole milliseconds, reported via
// AddFromDataVerifyTimeSpent. Each reported duration is converted with Duration.Milliseconds before it is
// added, so any sub-millisecond remainder of an individual report is not counted.
FromDataVerifyTimeSpentMS atomic.Int64
// ResultCacheHits is the running total reported via AddResultCacheHits.
ResultCacheHits atomic.Int32
// ResultCacheHitsWasted is the running total reported via AddResultCacheHitsWasted.
ResultCacheHitsWasted atomic.Int32
// ResultCacheMisses is the running total reported via AddResultCacheMisses.
ResultCacheMisses atomic.Int32
}

// Compile-time assertion that *InMemoryMetrics implements MetricsReporter.
var _ MetricsReporter = (*InMemoryMetrics)(nil)

// AddCredentialVerificationsSaved atomically adds count to the CredentialVerificationsSaved counter. The counter
// is 32 bits wide, so count is narrowed to int32 before it is added.
func (m *InMemoryMetrics) AddCredentialVerificationsSaved(count int) {
	delta := int32(count)
	m.CredentialVerificationsSaved.Add(delta)
}

// AddFromDataVerifyTimeSpent atomically adds wallTime, expressed as a whole number of milliseconds, to the
// FromDataVerifyTimeSpentMS counter.
func (m *InMemoryMetrics) AddFromDataVerifyTimeSpent(wallTime time.Duration) {
	elapsedMS := wallTime.Milliseconds()
	m.FromDataVerifyTimeSpentMS.Add(elapsedMS)
}

// AddResultCacheHits atomically adds count to the ResultCacheHits counter. The counter is 32 bits wide, so count
// is narrowed to int32 before it is added.
func (m *InMemoryMetrics) AddResultCacheHits(count int) {
	delta := int32(count)
	m.ResultCacheHits.Add(delta)
}

// AddResultCacheMisses atomically adds count to the ResultCacheMisses counter. The counter is 32 bits wide, so
// count is narrowed to int32 before it is added.
func (m *InMemoryMetrics) AddResultCacheMisses(count int) {
	delta := int32(count)
	m.ResultCacheMisses.Add(delta)
}

// AddResultCacheHitsWasted atomically adds count to the ResultCacheHitsWasted counter. The counter is 32 bits
// wide, so count is narrowed to int32 before it is added.
func (m *InMemoryMetrics) AddResultCacheHitsWasted(count int) {
	delta := int32(count)
	m.ResultCacheHitsWasted.Add(delta)
}
28 changes: 28 additions & 0 deletions pkg/verificationcache/metrics_reporter.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
package verificationcache

import "time"

// MetricsReporter is the interface a verification cache uses to report metrics about its own operation. Every
// method adds to a running total; the interface itself offers no way to read the totals back.
// Implementations must be safe for concurrent use by multiple goroutines.
type MetricsReporter interface {
// AddCredentialVerificationsSaved records "saved" verification attempts: cases where a credential's verification
// status is loaded from the cache instead of being retrieved from a remote verification endpoint. This number
// might be smaller than the cache hit count due to cache hit "wasting"; see AddResultCacheHitsWasted for more
// information.
AddCredentialVerificationsSaved(count int)

// AddFromDataVerifyTimeSpent records wall time spent in calls to detector.FromData with verify=true.
AddFromDataVerifyTimeSpent(wallTime time.Duration)

// AddResultCacheHits records result cache hits. Not every cache hit elides a remote verification request,
// because of cache hit "wasting"; see AddResultCacheHitsWasted for more information.
AddResultCacheHits(count int)

// AddResultCacheMisses records result cache misses.
AddResultCacheMisses(count int)

// AddResultCacheHitsWasted records "wasted" result cache hits. A "wasted" result cache hit is a result cache hit
// that does not elide a remote verification request because there are other secret findings in the relevant chunk
// that are not cached. When this happens, the detector's FromData method must be called anyway, so the cache hit
// doesn't save any remote requests.
AddResultCacheHitsWasted(count int)
}
9 changes: 9 additions & 0 deletions pkg/verificationcache/result_cache.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
package verificationcache

import (
"github.com/trufflesecurity/trufflehog/v3/pkg/cache"
"github.com/trufflesecurity/trufflehog/v3/pkg/detectors"
)

// ResultCache is a cache that holds individual detector results. It serves as a component of a VerificationCache.
// It is declared as a named type over the generic cache.Cache instantiated with detectors.Result values.
type ResultCache cache.Cache[detectors.Result]
Loading

0 comments on commit ddc015e

Please sign in to comment.