Skip to content

Commit

Permalink
Merge pull request #9924 from hashicorp/dnephin/cert-expiration-metric
Browse files Browse the repository at this point in the history
connect: emit a metric for the seconds until root CA expiry
  • Loading branch information
dnephin authored Jun 18, 2021
2 parents 19d3eef + aec7e79 commit d81f527
Show file tree
Hide file tree
Showing 6 changed files with 82 additions and 0 deletions.
4 changes: 4 additions & 0 deletions .changelog/9924.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
```release-note:improvement
telemetry: add a new `mesh.active-root-ca.expiry` metric for tracking when the root certificate expires.
```

1 change: 1 addition & 0 deletions agent/consul/leader_connect.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ func (s *Server) startConnectLeader(ctx context.Context) error {

s.caManager.Start(ctx)
s.leaderRoutineManager.Start(ctx, caRootPruningRoutineName, s.runCARootPruning)
s.leaderRoutineManager.Start(ctx, caRootMetricRoutineName, rootCAExpiryMonitor(s).monitor)

return s.startIntentionConfigEntryMigration(ctx)
}
Expand Down
74 changes: 74 additions & 0 deletions agent/consul/leader_metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
package consul

import (
"context"
"fmt"
"time"

"github.com/armon/go-metrics"
"github.com/armon/go-metrics/prometheus"
"github.com/hashicorp/go-hclog"

"github.com/hashicorp/consul/logging"
)

// Metric definitions for certificate-expiry monitoring.
var (
	// metricsKeyMeshRootCAExpiry is the metric key under which the time left
	// before the active root CA expires is published.
	metricsKeyMeshRootCAExpiry = []string{"mesh", "active-root-ca", "expiry"}

	// CertExpirationGauges holds the Prometheus gauge definitions for the
	// certificate-expiry metrics emitted by this package.
	CertExpirationGauges = []prometheus.GaugeDefinition{
		{
			Name: metricsKeyMeshRootCAExpiry,
			Help: "Seconds until the service mesh root certificate expires.",
		},
	}
)

// rootCAExpiryMonitor returns a certExpirationMonitor that tracks the active
// root CA held in the state store of s. Emitted samples are labelled with the
// server's datacenter and logged under the Connect logger name.
func rootCAExpiryMonitor(s *Server) certExpirationMonitor {
	// untilExpiry reads the active root from the current FSM state each time
	// it is called and reports how long remains before its NotAfter time.
	untilExpiry := func() (time.Duration, error) {
		_, activeRoot, err := s.fsm.State().CARootActive(nil)
		if err != nil {
			return 0, fmt.Errorf("failed to retrieve root CA: %w", err)
		}
		if activeRoot == nil {
			return 0, fmt.Errorf("no active root CA")
		}
		return time.Until(activeRoot.NotAfter), nil
	}

	datacenter := metrics.Label{Name: "datacenter", Value: s.config.Datacenter}

	return certExpirationMonitor{
		Key:    metricsKeyMeshRootCAExpiry,
		Labels: []metrics.Label{datacenter},
		Logger: s.logger.Named(logging.Connect),
		Query:  untilExpiry,
	}
}

// certExpirationMonitor periodically calls Query and publishes the result, in
// whole seconds, as a labelled gauge. See the monitor method for the loop.
type certExpirationMonitor struct {
// Key is the metric key the gauge is published under.
Key []string
// Labels are attached to every published gauge sample.
Labels []metrics.Label
// Logger receives a warning whenever Query returns an error.
Logger hclog.Logger
// Query is called at each interval. It should return the duration until the
// certificate expires, or an error if the query failed.
Query func() (time.Duration, error)
}

// certExpirationMonitorInterval is the period of the ticker that drives
// certExpirationMonitor.monitor, i.e. how often the expiry gauge is refreshed.
const certExpirationMonitorInterval = time.Hour

// monitor publishes the certificate-expiry gauge once immediately and then
// once every certExpirationMonitorInterval, until ctx is cancelled. It always
// returns nil.
//
// The gauge value is the duration returned by m.Query truncated to whole
// seconds. When m.Query fails, a warning is logged and no sample is published
// for that tick: publishing the zero duration would be indistinguishable from
// a certificate that has already expired.
func (m certExpirationMonitor) monitor(ctx context.Context) error {
	ticker := time.NewTicker(certExpirationMonitorInterval)
	defer ticker.Stop()

	// emit queries the remaining lifetime and publishes it, skipping the
	// sample entirely if the query fails.
	emit := func() {
		d, err := m.Query()
		if err != nil {
			m.Logger.Warn("failed to emit certificate expiry metric", "error", err)
			return
		}
		expiry := d / time.Second
		metrics.SetGaugeWithLabels(m.Key, float32(expiry), m.Labels)
	}

	// Publish right away so the gauge is populated as soon as the routine
	// starts instead of only after the first full interval has elapsed.
	emit()

	for {
		select {
		case <-ctx.Done():
			return nil
		case <-ticker.C:
			emit()
		}
	}
}
1 change: 1 addition & 0 deletions agent/consul/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ const (
aclTokenReapingRoutineName = "acl token reaping"
aclUpgradeRoutineName = "legacy ACL token upgrade"
caRootPruningRoutineName = "CA root pruning"
caRootMetricRoutineName = "CA root expiration metric"
configReplicationRoutineName = "config entry replication"
federationStateReplicationRoutineName = "federation state replication"
federationStateAntiEntropyRoutineName = "federation state anti-entropy"
Expand Down
1 change: 1 addition & 0 deletions agent/setup.go
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,7 @@ func getPrometheusDefs(cfg lib.TelemetryConfig) ([]prometheus.GaugeDefinition, [
xds.StatsGauges,
usagemetrics.Gauges,
consul.ReplicationGauges,
consul.CertExpirationGauges,
Gauges,
raftGauges,
}
Expand Down
1 change: 1 addition & 0 deletions website/content/docs/agent/telemetry.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -478,6 +478,7 @@ These metrics give insight into the health of the cluster as a whole.
| `consul.catalog.connect.query-tag.<service>.<tag>` | Increments for each connect-based catalog query for the given service with the given tag. | queries | counter |
| `consul.catalog.connect.query-tags.<service>.<tags>` | Increments for each connect-based catalog query for the given service with the given tags. | queries | counter |
| `consul.catalog.connect.not-found.<service>` | Increments for each connect-based catalog query where the given service could not be found. | queries | counter |
| `consul.mesh.active-root-ca.expiry` | The number of seconds until the root CA expires, updated every hour. | seconds | gauge |

## Connect Built-in Proxy Metrics

Expand Down

0 comments on commit d81f527

Please sign in to comment.