-
Notifications
You must be signed in to change notification settings - Fork 2.3k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add certificate expiry check and warnings
* Add ADR * Add `k3s certificate check` command. * Add periodic check and events when certs are about to expire. * Add metrics for certificate validity remaining, labeled by cert subject Signed-off-by: Brad Davidson <brad.davidson@rancher.com>
- Loading branch information
Showing
11 changed files
with
704 additions
and
150 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
# Add Support for Checking and Alerting on Certificate Expiry | ||
|
||
Date: 2024-03-26 | ||
|
||
## Status | ||
|
||
Accepted | ||
|
||
## Context | ||
|
||
The certificates generated by K3s have two lifecycles: | ||
* Certificate authority certificates expire 3650 days (roughly 10 years) from their moment of issuance. | ||
The CA certificates are not automatically renewed, and require manual intervention to extend their validity. | ||
* Leaf certificates (client and server certs) expire 365 days (roughly 1 year) from their moment of issuance. | ||
The certificates are automatically renewed if they are within 90 days of expiring at the time K3s starts. | ||
|
||
K3s does not currently expose any information about certificate validity. | ||
There are no metrics, CLI tools, or events that an administrator can use to track when certificates must be renewed or rotated to avoid outages when certificates expire. | ||
The best we can do at the moment is recommend that administrators either restart their nodes regularly to ensure that certificates are renewed within the 90 day window, or manually rotate their certs yearly. | ||
|
||
We do not have any guidance around renewing the CA certs, which will be a major undertaking for users as their clusters approach the 10-year mark. We currently have a bit of runway on this issue, as K3s has not been around for 10 years. | ||
|
||
## Decision | ||
|
||
* K3s will add a CLI command to print certificate validity. It will be grouped alongside the command used to rotate the leaf certificates (`k3s certificate rotate`). | ||
* K3s will add an internal controller that maintains metrics for certificate expiration, and creates Events when certificates are about to expire or have already expired. | ||
|
||
## Consequences | ||
|
||
This will require additional documentation, CLI subcommands, and QA work to validate the process steps. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,144 @@ | ||
package certmonitor | ||
|
||
import ( | ||
"context" | ||
"crypto/x509" | ||
"fmt" | ||
"os" | ||
"path/filepath" | ||
"strings" | ||
"time" | ||
|
||
daemonconfig "github.com/k3s-io/k3s/pkg/daemons/config" | ||
"github.com/k3s-io/k3s/pkg/daemons/control/deps" | ||
"github.com/k3s-io/k3s/pkg/util" | ||
"github.com/k3s-io/k3s/pkg/util/services" | ||
"github.com/k3s-io/k3s/pkg/version" | ||
"github.com/prometheus/client_golang/prometheus" | ||
certutil "github.com/rancher/dynamiclistener/cert" | ||
"github.com/rancher/wrangler/pkg/merr" | ||
"github.com/sirupsen/logrus" | ||
corev1 "k8s.io/api/core/v1" | ||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" | ||
"k8s.io/apimachinery/pkg/types" | ||
"k8s.io/apimachinery/pkg/util/wait" | ||
"k8s.io/component-base/metrics/legacyregistry" | ||
) | ||
|
||
var ( | ||
// DefaultRegisterer and DefaultGatherer are the implementations of the | ||
// prometheus Registerer and Gatherer interfaces that all metrics operations | ||
// will use. They are variables so that packages that embed this library can | ||
// replace them at runtime, instead of having to pass around specific | ||
// registries. | ||
DefaultRegisterer = legacyregistry.Registerer() | ||
DefaultGatherer = legacyregistry.DefaultGatherer | ||
|
||
// Check certificates twice an hour. Kubernetes events have a TTL of 1 hour by default, | ||
// so similar events should be aggregated and refreshed by the event recorder as long | ||
// as they are created within the TTL period. | ||
certCheckInterval = time.Minute * 30 | ||
|
||
controllerName = version.Program + "-cert-monitor" | ||
|
||
certificateExpirationSeconds = prometheus.NewGaugeVec(prometheus.GaugeOpts{ | ||
Name: version.Program + "_certificate_expiration_seconds", | ||
Help: "Remaining lifetime on the certificate.", | ||
}, []string{"subject", "usages"}) | ||
) | ||
|
||
// Setup starts the certificate expiration monitor | ||
func Setup(ctx context.Context, nodeConfig *daemonconfig.Node, dataDir string) error { | ||
logrus.Debugf("Starting %s with monitoring period %s", controllerName, certCheckInterval) | ||
DefaultRegisterer.MustRegister(certificateExpirationSeconds) | ||
|
||
client, err := util.GetClientSet(nodeConfig.AgentConfig.KubeConfigKubelet) | ||
if err != nil { | ||
return err | ||
} | ||
|
||
recorder := util.BuildControllerEventRecorder(client, controllerName, metav1.NamespaceDefault) | ||
|
||
// This is consistent with events attached to the node generated by the kubelet | ||
// https://github.com/kubernetes/kubernetes/blob/612130dd2f4188db839ea5c2dea07a96b0ad8d1c/pkg/kubelet/kubelet.go#L479-L485 | ||
nodeRef := &corev1.ObjectReference{ | ||
Kind: "Node", | ||
Name: nodeConfig.AgentConfig.NodeName, | ||
UID: types.UID(nodeConfig.AgentConfig.NodeName), | ||
Namespace: "", | ||
} | ||
|
||
// Create a dummy controlConfig just to hold the paths for the server certs | ||
controlConfig := daemonconfig.Control{ | ||
DataDir: filepath.Join(dataDir, "server"), | ||
Runtime: &daemonconfig.ControlRuntime{}, | ||
} | ||
deps.CreateRuntimeCertFiles(&controlConfig) | ||
|
||
caMap := map[string][]string{} | ||
nodeList := services.Agent | ||
if _, err := os.Stat(controlConfig.DataDir); err == nil { | ||
nodeList = services.All | ||
caMap, err = services.FilesForServices(controlConfig, services.CA) | ||
if err != nil { | ||
return err | ||
} | ||
} | ||
|
||
nodeMap, err := services.FilesForServices(controlConfig, nodeList) | ||
if err != nil { | ||
return err | ||
} | ||
|
||
go wait.Until(func() { | ||
logrus.Debugf("Running %s certificate expiration check", controllerName) | ||
if err := checkCerts(nodeMap, time.Hour*24*daemonconfig.CertificateRenewDays); err != nil { | ||
message := fmt.Sprintf("Node certificates require attention - restart %s on this node to trigger automatic rotation: %v", version.Program, err) | ||
recorder.Event(nodeRef, corev1.EventTypeWarning, "CertificateExpirationWarning", message) | ||
} | ||
if err := checkCerts(caMap, time.Hour*24*365); err != nil { | ||
message := fmt.Sprintf("Certificate authority certificates require attention - check %s documentation and begin planning rotation: %v", version.Program, err) | ||
recorder.Event(nodeRef, corev1.EventTypeWarning, "CACertificateExpirationWarning", message) | ||
|
||
} | ||
}, certCheckInterval, ctx.Done()) | ||
|
||
return nil | ||
} | ||
|
||
func checkCerts(fileMap map[string][]string, warningPeriod time.Duration) error { | ||
errs := merr.Errors{} | ||
now := time.Now() | ||
warn := now.Add(warningPeriod) | ||
|
||
for service, files := range fileMap { | ||
for _, file := range files { | ||
basename := filepath.Base(file) | ||
certs, _ := certutil.CertsFromFile(file) | ||
for _, cert := range certs { | ||
usages := []string{} | ||
if cert.KeyUsage&x509.KeyUsageCertSign != 0 { | ||
usages = append(usages, "CertSign") | ||
} | ||
for _, eku := range cert.ExtKeyUsage { | ||
switch eku { | ||
case x509.ExtKeyUsageServerAuth: | ||
usages = append(usages, "ServerAuth") | ||
case x509.ExtKeyUsageClientAuth: | ||
usages = append(usages, "ClientAuth") | ||
} | ||
} | ||
certificateExpirationSeconds.WithLabelValues(cert.Subject.String(), strings.Join(usages, ",")).Set(cert.NotAfter.Sub(now).Seconds()) | ||
if now.Before(cert.NotBefore) { | ||
errs = append(errs, fmt.Errorf("%s/%s: certificate %s is not valid before %s", service, basename, cert.Subject, cert.NotBefore.Format(time.RFC3339))) | ||
} else if now.After(cert.NotAfter) { | ||
errs = append(errs, fmt.Errorf("%s/%s: certificate %s expired at %s", service, basename, cert.Subject, cert.NotAfter.Format(time.RFC3339))) | ||
} else if warn.After(cert.NotAfter) { | ||
errs = append(errs, fmt.Errorf("%s/%s: certificate %s will expire within %d days at %s", service, basename, cert.Subject, daemonconfig.CertificateRenewDays, cert.NotAfter.Format(time.RFC3339))) | ||
} | ||
} | ||
} | ||
} | ||
|
||
return merr.NewErrors(errs...) | ||
} |
Oops, something went wrong.