Skip to content

Commit

Permalink
Backport of Telemetry Metrics Configuration. (#18186) into release/1.…
Browse files Browse the repository at this point in the history
…12.x (#21092)

* Telemetry Metrics Configuration. (#18186)

* Telemetry Metrics Configuration.

* Err Shadowing Fix (woah, semgrep is cool).

* Fix TestBackend_RevokePlusTidy_Intermediate

* Add Changelog.

* Fix memory leak.  Code cleanup as suggested by Steve.

* Turn off metrics by default, breaking-change.

* Show on tidy-status before start-up.

* Fix tests

* make fmt

* Add emit metrics to periodicFunc

* Test not delivering unavailable metrics + fix.

* Better error message.

* Fixing the false-error bug.

* make fmt.

* Try to fix race issue, remove confusing comments.

* Switch metric counter variables to an atomic.Uint32

 - Switch the metric counter variables to an atomic variable type
   so that we are forced to properly load/store values to it

* Fix race-issue better by trying until the metric is sunk.

* make fmt.

* empty commit to retrigger non-race tests that all pass locally

---------

Co-authored-by: Steve Clark <steven.clark@hashicorp.com>

* Fix duplicate code due to merge conflicts doubling count.

* Remove tests for features that aren't in 1.12

---------

Co-authored-by: Steve Clark <steven.clark@hashicorp.com>
  • Loading branch information
kitography and stevendpclark authored Jun 13, 2023
1 parent e7f8a20 commit 0f06076
Show file tree
Hide file tree
Showing 11 changed files with 622 additions and 92 deletions.
155 changes: 115 additions & 40 deletions builtin/logical/pki/backend.go
Original file line number Diff line number Diff line change
Expand Up @@ -208,9 +208,12 @@ func Backend(conf *logical.BackendConfig) *backend {
b.lastTidy = time.Now()

// Metrics initialization for count of certificates in storage
b.certCountEnabled = atomic2.NewBool(false)
b.publishCertCountMetrics = atomic2.NewBool(false)
b.certsCounted = atomic2.NewBool(false)
b.certCount = new(uint32)
b.revokedCertCount = new(uint32)
b.certCountError = "Initialize Not Yet Run, Cert Counts Unavailable"
b.certCount = &atomic.Uint32{}
b.revokedCertCount = &atomic.Uint32{}
b.possibleDoubleCountedSerials = make([]string, 0, 250)
b.possibleDoubleCountedRevokedSerials = make([]string, 0, 250)

Expand All @@ -230,9 +233,12 @@ type backend struct {
tidyStatus *tidyStatus
lastTidy time.Time

certCount *uint32
revokedCertCount *uint32
certCountEnabled *atomic2.Bool
publishCertCountMetrics *atomic2.Bool
certCount *atomic.Uint32
revokedCertCount *atomic.Uint32
certsCounted *atomic2.Bool
certCountError string
possibleDoubleCountedSerials []string
possibleDoubleCountedRevokedSerials []string

Expand Down Expand Up @@ -355,9 +361,10 @@ func (b *backend) initialize(ctx context.Context, _ *logical.InitializationReque
// Initialize also needs to populate our certificate and revoked certificate count
err = b.initializeStoredCertificateCounts(ctx)
if err != nil {
return err
// Don't block/err initialize/startup for metrics. Context on this call can time out due to number of certificates.
b.Logger().Error("Could not initialize stored certificate counts", err)
b.certCountError = err.Error()
}

return nil
}

Expand Down Expand Up @@ -547,6 +554,13 @@ func (b *backend) periodicFunc(ctx context.Context, request *logical.Request) er
return fmt.Errorf("Error building CRLs:\n - %v\n\nError running auto-tidy:\n - %v\n", crlErr, tidyErr)
}

// Periodically re-emit gauges so that they don't disappear/go stale
tidyConfig, err := sc.getAutoTidyConfig()
if err != nil {
return err
}
b.emitCertStoreMetrics(tidyConfig)

if crlErr != nil {
return fmt.Errorf("Error building CRLs:\n - %v\n", crlErr)
}
Expand All @@ -566,24 +580,51 @@ func (b *backend) periodicFunc(ctx context.Context, request *logical.Request) er
}

func (b *backend) initializeStoredCertificateCounts(ctx context.Context) error {
b.tidyStatusLock.RLock()
defer b.tidyStatusLock.RUnlock()
// For performance reasons, we can't lock on issuance/storage of certs until a list operation completes,
// but we want to limit possible miscounts / double-counts to over-counting, so we take the tidy lock which
// prevents (most) deletions - in particular we take a read lock (sufficient to block the write lock in
// tidyStatusStart while allowing tidy to still acquire a read lock to report via its endpoint)
b.tidyStatusLock.RLock()
defer b.tidyStatusLock.RUnlock()
sc := b.makeStorageContext(ctx, b.storage)
config, err := sc.getAutoTidyConfig()
if err != nil {
return err
}

b.certCountEnabled.Store(config.MaintainCount)
b.publishCertCountMetrics.Store(config.PublishMetrics)

if config.MaintainCount == false {
b.possibleDoubleCountedRevokedSerials = nil
b.possibleDoubleCountedSerials = nil
b.certsCounted.Store(true)
b.certCount.Store(0)
b.revokedCertCount.Store(0)
b.certCountError = "Cert Count is Disabled: enable via Tidy Config maintain_stored_certificate_counts"
return nil
}

// Ideally these three things would be set in one transaction, since that isn't possible, set the counts to "0",
// first, so count will over-count (and miss putting things in deduplicate queue), rather than under-count.
b.certCount.Store(0)
b.revokedCertCount.Store(0)
b.possibleDoubleCountedRevokedSerials = nil
b.possibleDoubleCountedSerials = nil
// A cert issued or revoked here will be double-counted. That's okay, this is "best effort" metrics.
b.certsCounted.Store(false)

entries, err := b.storage.List(ctx, "certs/")
if err != nil {
return err
}
atomic.AddUint32(b.certCount, uint32(len(entries)))
b.certCount.Add(uint32(len(entries)))

revokedEntries, err := b.storage.List(ctx, "revoked/")
if err != nil {
return err
}
atomic.AddUint32(b.revokedCertCount, uint32(len(revokedEntries)))
b.revokedCertCount.Add(uint32(len(revokedEntries)))

b.certsCounted.Store(true)
// Now that the metrics are set, we can switch from appending newly-stored certificates to the possible double-count
Expand Down Expand Up @@ -664,64 +705,98 @@ func (b *backend) initializeStoredCertificateCounts(ctx context.Context) error {
b.possibleDoubleCountedRevokedSerials = nil
b.possibleDoubleCountedSerials = nil

certCount := atomic.LoadUint32(b.certCount)
metrics.SetGauge([]string{"secrets", "pki", b.backendUUID, "total_certificates_stored"}, float32(certCount))
revokedCertCount := atomic.LoadUint32(b.revokedCertCount)
metrics.SetGauge([]string{"secrets", "pki", b.backendUUID, "total_revoked_certificates_stored"}, float32(revokedCertCount))
b.emitCertStoreMetrics(config)

b.certCountError = ""

return nil
}

func (b *backend) emitCertStoreMetrics(config *tidyConfig) {
if config.PublishMetrics == true {
certCount := b.certCount.Load()
b.emitTotalCertCountMetric(certCount)
revokedCertCount := b.revokedCertCount.Load()
b.emitTotalRevokedCountMetric(revokedCertCount)
}
}

// The "certsCounted" boolean here should be loaded from the backend certsCounted before the corresponding storage call:
// eg. certsCounted := b.certsCounted.Load()
func (b *backend) incrementTotalCertificatesCount(certsCounted bool, newSerial string) {
certCount := atomic.AddUint32(b.certCount, 1)
switch {
case !certsCounted:
// This is unsafe, but a good best-attempt
if strings.HasPrefix(newSerial, "certs/") {
newSerial = newSerial[6:]
func (b *backend) ifCountEnabledIncrementTotalCertificatesCount(certsCounted bool, newSerial string) {
if b.certCountEnabled.Load() {
certCount := b.certCount.Add(1)
switch {
case !certsCounted:
// This is unsafe, but a good best-attempt
if strings.HasPrefix(newSerial, "certs/") {
newSerial = newSerial[6:]
}
b.possibleDoubleCountedSerials = append(b.possibleDoubleCountedSerials, newSerial)
default:
if b.publishCertCountMetrics.Load() {
b.emitTotalCertCountMetric(certCount)
}
}
}
}

func (b *backend) ifCountEnabledDecrementTotalCertificatesCountReport() {
if b.certCountEnabled.Load() {
certCount := b.decrementTotalCertificatesCountNoReport()
if b.publishCertCountMetrics.Load() {
b.emitTotalCertCountMetric(certCount)
}
b.possibleDoubleCountedSerials = append(b.possibleDoubleCountedSerials, newSerial)
default:
metrics.SetGauge([]string{"secrets", "pki", b.backendUUID, "total_certificates_stored"}, float32(certCount))
}
}

func (b *backend) decrementTotalCertificatesCountReport() {
certCount := b.decrementTotalCertificatesCountNoReport()
func (b *backend) emitTotalCertCountMetric(certCount uint32) {
metrics.SetGauge([]string{"secrets", "pki", b.backendUUID, "total_certificates_stored"}, float32(certCount))
}

// Called directly only by the initialize function to deduplicate the count, when we don't have a full count yet
// Does not respect whether-we-are-counting backend information.
func (b *backend) decrementTotalCertificatesCountNoReport() uint32 {
newCount := atomic.AddUint32(b.certCount, ^uint32(0))
newCount := b.certCount.Add(^uint32(0))
return newCount
}

// The "certsCounted" boolean here should be loaded from the backend certsCounted before the corresponding storage call:
// eg. certsCounted := b.certsCounted.Load()
func (b *backend) incrementTotalRevokedCertificatesCount(certsCounted bool, newSerial string) {
newRevokedCertCount := atomic.AddUint32(b.revokedCertCount, 1)
switch {
case !certsCounted:
// This is unsafe, but a good best-attempt
if strings.HasPrefix(newSerial, "revoked/") { // allow passing in the path (revoked/serial) OR the serial
newSerial = newSerial[8:]
func (b *backend) ifCountEnabledIncrementTotalRevokedCertificatesCount(certsCounted bool, newSerial string) {
if b.certCountEnabled.Load() {
newRevokedCertCount := b.revokedCertCount.Add(1)
switch {
case !certsCounted:
// This is unsafe, but a good best-attempt
if strings.HasPrefix(newSerial, "revoked/") { // allow passing in the path (revoked/serial) OR the serial
newSerial = newSerial[8:]
}
b.possibleDoubleCountedRevokedSerials = append(b.possibleDoubleCountedRevokedSerials, newSerial)
default:
if b.publishCertCountMetrics.Load() {
b.emitTotalRevokedCountMetric(newRevokedCertCount)
}
}
}
}

func (b *backend) ifCountEnabledDecrementTotalRevokedCertificatesCountReport() {
if b.certCountEnabled.Load() {
revokedCertCount := b.decrementTotalRevokedCertificatesCountNoReport()
if b.publishCertCountMetrics.Load() {
b.emitTotalRevokedCountMetric(revokedCertCount)
}
b.possibleDoubleCountedRevokedSerials = append(b.possibleDoubleCountedRevokedSerials, newSerial)
default:
metrics.SetGauge([]string{"secrets", "pki", b.backendUUID, "total_revoked_certificates_stored"}, float32(newRevokedCertCount))
}
}

func (b *backend) decrementTotalRevokedCertificatesCountReport() {
revokedCertCount := b.decrementTotalRevokedCertificatesCountNoReport()
func (b *backend) emitTotalRevokedCountMetric(revokedCertCount uint32) {
metrics.SetGauge([]string{"secrets", "pki", b.backendUUID, "total_revoked_certificates_stored"}, float32(revokedCertCount))
}

// Called directly only by the initialize function to deduplicate the count, when we don't have a full count yet
// Does not respect whether-we-are-counting backend information.
func (b *backend) decrementTotalRevokedCertificatesCountNoReport() uint32 {
newRevokedCertCount := atomic.AddUint32(b.revokedCertCount, ^uint32(0))
newRevokedCertCount := b.revokedCertCount.Add(^uint32(0))
return newRevokedCertCount
}
49 changes: 36 additions & 13 deletions builtin/logical/pki/backend_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ import (
"strconv"
"strings"
"sync"
"sync/atomic"
"testing"
"time"

Expand Down Expand Up @@ -3791,6 +3790,15 @@ func TestBackend_RevokePlusTidy_Intermediate(t *testing.T) {
t.Fatal(err)
}

// Set up Metric Configuration, then restart to enable it
_, err = client.Logical().Write("pki/config/auto-tidy", map[string]interface{}{
"maintain_stored_certificate_counts": true,
"publish_stored_certificate_count_metrics": true,
})
_, err = client.Logical().Write("/sys/plugins/reload/backend", map[string]interface{}{
"mounts": "pki/",
})

// Check the metrics initialized in order to calculate backendUUID for /pki
// BackendUUID not consistent during tests with UUID from /sys/mounts/pki
metricsSuffix := "total_certificates_stored"
Expand Down Expand Up @@ -3827,6 +3835,14 @@ func TestBackend_RevokePlusTidy_Intermediate(t *testing.T) {
if err != nil {
t.Fatal(err)
}
// Set up Metric Configuration, then restart to enable it
_, err = client.Logical().Write("pki2/config/auto-tidy", map[string]interface{}{
"maintain_stored_certificate_counts": true,
"publish_stored_certificate_count_metrics": true,
})
_, err = client.Logical().Write("/sys/plugins/reload/backend", map[string]interface{}{
"mounts": "pki2/",
})

// Create a CSR for the intermediate CA
secret, err := client.Logical().Write("pki2/intermediate/generate/internal", nil)
Expand Down Expand Up @@ -3948,6 +3964,7 @@ func TestBackend_RevokePlusTidy_Intermediate(t *testing.T) {
"missing_issuer_cert_count": json.Number("0"),
"current_cert_store_count": json.Number("0"),
"current_revoked_cert_count": json.Number("0"),
"internal_backend_uuid": backendUUID,
}
// Let's copy the times from the response so that we can use deep.Equal()
timeStarted, ok := tidyStatus.Data["time_started"]
Expand Down Expand Up @@ -5580,6 +5597,14 @@ func TestBackend_InitializeCertificateCounts(t *testing.T) {
serials[i] = resp.Data["serial_number"].(string)
}

// Turn on certificate counting:
CBWrite(b, s, "config/auto-tidy", map[string]interface{}{
"maintain_stored_certificate_counts": true,
"publish_stored_certificate_count_metrics": false,
})
// Assert initialize from clean is correct:
b.initializeStoredCertificateCounts(ctx)

// Revoke certificates A + B
revocations := serials[0:2]
for _, key := range revocations {
Expand All @@ -5591,18 +5616,16 @@ func TestBackend_InitializeCertificateCounts(t *testing.T) {
}
}

// Assert initialize from clean is correct:
b.initializeStoredCertificateCounts(ctx)
if atomic.LoadUint32(b.certCount) != 6 {
t.Fatalf("Failed to count six certificates root,A,B,C,D,E, instead counted %d certs", atomic.LoadUint32(b.certCount))
if b.certCount.Load() != 6 {
t.Fatalf("Failed to count six certificates root,A,B,C,D,E, instead counted %d certs", b.certCount.Load())
}
if atomic.LoadUint32(b.revokedCertCount) != 2 {
t.Fatalf("Failed to count two revoked certificates A+B, instead counted %d certs", atomic.LoadUint32(b.revokedCertCount))
if b.revokedCertCount.Load() != 2 {
t.Fatalf("Failed to count two revoked certificates A+B, instead counted %d certs", b.revokedCertCount.Load())
}

// Simulates listing while initialize in progress, by "restarting it"
atomic.StoreUint32(b.certCount, 0)
atomic.StoreUint32(b.revokedCertCount, 0)
b.certCount.Store(0)
b.revokedCertCount.Store(0)
b.certsCounted.Store(false)

// Revoke certificates C, D
Expand Down Expand Up @@ -5631,12 +5654,12 @@ func TestBackend_InitializeCertificateCounts(t *testing.T) {
b.initializeStoredCertificateCounts(ctx)

// Test certificate count
if atomic.LoadUint32(b.certCount) != 8 {
t.Fatalf("Failed to initialize count of certificates root, A,B,C,D,E,F,G counted %d certs", *(b.certCount))
if b.certCount.Load() != 8 {
t.Fatalf("Failed to initialize count of certificates root, A,B,C,D,E,F,G counted %d certs", b.certCount.Load())
}

if atomic.LoadUint32(b.revokedCertCount) != 4 {
t.Fatalf("Failed to count revoked certificates A,B,C,D counted %d certs", *(b.revokedCertCount))
if b.revokedCertCount.Load() != 4 {
t.Fatalf("Failed to count revoked certificates A,B,C,D counted %d certs", b.revokedCertCount.Load())
}

return
Expand Down
7 changes: 4 additions & 3 deletions builtin/logical/pki/cert_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,8 @@ func fetchCertBySerial(ctx context.Context, b *backend, req *logical.Request, pr
hyphenSerial := normalizeSerial(serial)
colonSerial := strings.ReplaceAll(strings.ToLower(serial), "-", ":")

sc := b.makeStorageContext(ctx, req.Storage)

switch {
// Revoked goes first as otherwise crl get hardcoded paths which fail if
// we actually want revocation info
Expand All @@ -178,7 +180,6 @@ func fetchCertBySerial(ctx context.Context, b *backend, req *logical.Request, pr
if err = b.crlBuilder.rebuildIfForced(ctx, b, req); err != nil {
return nil, err
}
sc := b.makeStorageContext(ctx, req.Storage)
path, err = sc.resolveIssuerCRLPath(defaultRef)
if err != nil {
return nil, err
Expand Down Expand Up @@ -234,9 +235,9 @@ func fetchCertBySerial(ctx context.Context, b *backend, req *logical.Request, pr
// If we fail here, we have an extra (copy) of a cert in storage, add to metrics:
switch {
case strings.HasPrefix(prefix, "revoked/"):
b.incrementTotalRevokedCertificatesCount(certsCounted, path)
sc.Backend.ifCountEnabledIncrementTotalRevokedCertificatesCount(certsCounted, path)
default:
b.incrementTotalCertificatesCount(certsCounted, path)
sc.Backend.ifCountEnabledIncrementTotalCertificatesCount(certsCounted, path)
}
return nil, errutil.InternalError{Err: fmt.Sprintf("error deleting certificate with serial %s from old location", serial)}
}
Expand Down
2 changes: 1 addition & 1 deletion builtin/logical/pki/crl_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -582,7 +582,7 @@ func revokeCert(ctx context.Context, b *backend, req *logical.Request, serial st
if err != nil {
return nil, fmt.Errorf("error saving revoked certificate to new location")
}
b.incrementTotalRevokedCertificatesCount(certsCounted, revEntry.Key)
b.ifCountEnabledIncrementTotalRevokedCertificatesCount(certsCounted, revEntry.Key)
}

// Fetch the config and see if we need to rebuild the CRL. If we have
Expand Down
Loading

0 comments on commit 0f06076

Please sign in to comment.