Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

telemetry: add Agent TLS Certificate expiration metric #10768

Merged
merged 2 commits into from
Aug 4, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .changelog/10768.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
```release-note:improvement
telemetry: add a new `agent.tls.cert.expiry` metric for tracking when the Agent TLS certificate expires.
```

5 changes: 5 additions & 0 deletions agent/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -639,6 +639,11 @@ func (a *Agent) Start(ctx context.Context) error {
a.logger.Warn("DEPRECATED Backwards compatibility with pre-1.9 metrics enabled. These metrics will be removed in a future version of Consul. Set `telemetry { disable_compat_1.9 = true }` to disable them.")
}

if a.tlsConfigurator.Cert() != nil {
m := consul.AgentTLSCertExpirationMonitor(a.tlsConfigurator, a.logger, a.config.Datacenter)
go m.Monitor(&lib.StopChannelContext{StopCh: a.shutdownCh})
}

// consul version metric with labels
metrics.SetGaugeWithLabels([]string{"version"}, 1, []metrics.Label{
{Name: "version", Value: a.config.Version},
Expand Down
4 changes: 2 additions & 2 deletions agent/consul/leader_connect.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,8 @@ func (s *Server) startConnectLeader(ctx context.Context) error {

s.caManager.Start(ctx)
s.leaderRoutineManager.Start(ctx, caRootPruningRoutineName, s.runCARootPruning)
s.leaderRoutineManager.Start(ctx, caRootMetricRoutineName, rootCAExpiryMonitor(s).monitor)
s.leaderRoutineManager.Start(ctx, caSigningMetricRoutineName, signingCAExpiryMonitor(s).monitor)
s.leaderRoutineManager.Start(ctx, caRootMetricRoutineName, rootCAExpiryMonitor(s).Monitor)
s.leaderRoutineManager.Start(ctx, caSigningMetricRoutineName, signingCAExpiryMonitor(s).Monitor)

return s.startIntentionConfigEntryMigration(ctx)
}
Expand Down
86 changes: 60 additions & 26 deletions agent/consul/leader_metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,20 @@ package consul

import (
"context"
"crypto/x509"
"errors"
"fmt"
"strings"
"time"

"github.com/hashicorp/consul/agent/connect/ca"

"github.com/hashicorp/consul/agent/connect"

"github.com/armon/go-metrics"
"github.com/armon/go-metrics/prometheus"
"github.com/hashicorp/consul/logging"
"github.com/hashicorp/go-hclog"

"github.com/hashicorp/consul/agent/connect"
"github.com/hashicorp/consul/agent/connect/ca"
"github.com/hashicorp/consul/logging"
"github.com/hashicorp/consul/tlsutil"
)

var metricsKeyMeshRootCAExpiry = []string{"mesh", "active-root-ca", "expiry"}
Expand All @@ -28,10 +30,14 @@ var CertExpirationGauges = []prometheus.GaugeDefinition{
Name: metricsKeyMeshActiveSigningCAExpiry,
Help: "Seconds until the service mesh signing certificate expires. Updated every hour",
},
{
Name: metricsKeyAgentTLSCertExpiry,
Help: "Seconds until the agent tls certificate expires. Updated every hour",
},
}

func rootCAExpiryMonitor(s *Server) certExpirationMonitor {
return certExpirationMonitor{
func rootCAExpiryMonitor(s *Server) CertExpirationMonitor {
return CertExpirationMonitor{
Key: metricsKeyMeshRootCAExpiry,
Labels: []metrics.Label{
{Name: "datacenter", Value: s.config.Datacenter},
Expand All @@ -56,10 +62,10 @@ func getRootCAExpiry(s *Server) (time.Duration, error) {
return time.Until(root.NotAfter), nil
}

func signingCAExpiryMonitor(s *Server) certExpirationMonitor {
func signingCAExpiryMonitor(s *Server) CertExpirationMonitor {
isPrimary := s.config.Datacenter == s.config.PrimaryDatacenter
if isPrimary {
return certExpirationMonitor{
return CertExpirationMonitor{
Key: metricsKeyMeshActiveSigningCAExpiry,
Labels: []metrics.Label{
{Name: "datacenter", Value: s.config.Datacenter},
Expand All @@ -68,26 +74,24 @@ func signingCAExpiryMonitor(s *Server) certExpirationMonitor {
Query: func() (time.Duration, error) {
provider, _ := s.caManager.getCAProvider()

if _, ok := provider.(ca.PrimaryUsesIntermediate); !ok {
if _, ok := provider.(ca.PrimaryUsesIntermediate); ok {
return getActiveIntermediateExpiry(s)
}

return getRootCAExpiry(s)

},
}
} else {
return certExpirationMonitor{
Key: metricsKeyMeshActiveSigningCAExpiry,
Labels: []metrics.Label{
{Name: "datacenter", Value: s.config.Datacenter},
},
Logger: s.logger.Named(logging.Connect),
Query: func() (time.Duration, error) {
return getActiveIntermediateExpiry(s)
},
}
}

return CertExpirationMonitor{
Key: metricsKeyMeshActiveSigningCAExpiry,
Labels: []metrics.Label{
{Name: "datacenter", Value: s.config.Datacenter},
},
Logger: s.logger.Named(logging.Connect),
Query: func() (time.Duration, error) {
return getActiveIntermediateExpiry(s)
},
}
}

func getActiveIntermediateExpiry(s *Server) (time.Duration, error) {
Expand All @@ -109,7 +113,7 @@ func getActiveIntermediateExpiry(s *Server) (time.Duration, error) {
return time.Until(cert.NotAfter), nil
}

type certExpirationMonitor struct {
type CertExpirationMonitor struct {
Key []string
Labels []metrics.Label
Logger hclog.Logger
Expand All @@ -120,21 +124,51 @@ type certExpirationMonitor struct {

const certExpirationMonitorInterval = time.Hour

func (m certExpirationMonitor) monitor(ctx context.Context) error {
func (m CertExpirationMonitor) Monitor(ctx context.Context) error {
ticker := time.NewTicker(certExpirationMonitorInterval)
defer ticker.Stop()

logger := m.Logger.With("metric", strings.Join(m.Key, "."))

for {
select {
case <-ctx.Done():
return nil
case <-ticker.C:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will this mean that the metric/logs do not get emitted until the monitoring interval has passed (so after an hour)? Or does the ticker fire immediately and then again after each interval?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It does not fire immediately. I think that would be a good improvement to handle the scenario you mentioned.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added that in #10771.

d, err := m.Query()
if err != nil {
m.Logger.Warn("failed to emit certificate expiry metric", "error", err)
logger.Warn("failed to emit certificate expiry metric", "error", err)
continue
}
expiry := d / time.Second
metrics.SetGaugeWithLabels(m.Key, float32(expiry), m.Labels)
}
}
}

// metricsKeyAgentTLSCertExpiry is the key of the gauge that reports the
// number of seconds remaining until the agent TLS certificate expires.
var metricsKeyAgentTLSCertExpiry = []string{"agent", "tls", "cert", "expiry"}

// AgentTLSCertExpirationMonitor returns a CertExpirationMonitor which will
// monitor the expiration of the certificate used for agent TLS.
//
// The returned monitor reports under metricsKeyAgentTLSCertExpiry and is
// labelled with the node name taken from the configurator's base config and
// with the datacenter dc. The supplied logger is used as-is for the monitor.
//
// The Query function asks the configurator for its certificate on every
// invocation rather than capturing it once, so the value it returns follows
// whatever certificate c.Cert() yields at the time of the call. It returns an
// error when no certificate is configured, when the certificate carries an
// empty chain, or when the first chain entry cannot be parsed.
func AgentTLSCertExpirationMonitor(c *tlsutil.Configurator, logger hclog.Logger, dc string) CertExpirationMonitor {
	return CertExpirationMonitor{
		Key: metricsKeyAgentTLSCertExpiry,
		Labels: []metrics.Label{
			{Name: "node", Value: c.Base().NodeName},
			{Name: "datacenter", Value: dc},
		},
		Logger: logger,
		Query: func() (time.Duration, error) {
			raw := c.Cert()
			if raw == nil {
				return 0, fmt.Errorf("tls not enabled")
			}

			// Guard the index below: an empty chain would otherwise panic in
			// the monitoring goroutine instead of surfacing as a logged error.
			if len(raw.Certificate) == 0 {
				return 0, fmt.Errorf("agent tls cert has an empty certificate chain")
			}

			// NOTE(review): the first chain entry is presumably the leaf, as
			// is conventional for crypto/tls certificates; confirm against
			// the type returned by Configurator.Cert.
			cert, err := x509.ParseCertificate(raw.Certificate[0])
			if err != nil {
				return 0, fmt.Errorf("failed to parse agent tls cert: %w", err)
			}
			return time.Until(cert.NotAfter), nil
		},
	}
}
1 change: 1 addition & 0 deletions website/content/docs/agent/telemetry.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -480,6 +480,7 @@ These metrics give insight into the health of the cluster as a whole.
| `consul.catalog.connect.not-found.` | Increments for each connect-based catalog query where the given service could not be found. | queries | counter |
| `consul.mesh.active-root-ca.expiry` | The number of seconds until the root CA expires, updated every hour. | seconds | gauge |
| `consul.mesh.active-signing-ca.expiry` | The number of seconds until the signing CA expires, updated every hour. | seconds | gauge |
| `consul.agent.tls.cert.expiry` | The number of seconds until the Agent TLS certificate expires, updated every hour. | seconds | gauge |

## Connect Built-in Proxy Metrics

Expand Down