-
Notifications
You must be signed in to change notification settings - Fork 47
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
fix: CrashLoopBackOff once tlsProfile changed #640
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
@@ -17,6 +17,7 @@ limitations under the License. | |||||||||||||
package main | ||||||||||||||
|
||||||||||||||
import ( | ||||||||||||||
"context" | ||||||||||||||
"crypto/tls" | ||||||||||||||
"flag" | ||||||||||||||
"fmt" | ||||||||||||||
|
@@ -28,13 +29,18 @@ import ( | |||||||||||||
// to ensure that exec-entrypoint and run can make use of them. | ||||||||||||||
_ "k8s.io/client-go/plugin/pkg/client/auth" | ||||||||||||||
|
||||||||||||||
ocpconfigv1 "github.com/openshift/api/config/v1" | ||||||||||||||
"github.com/openshift/library-go/pkg/crypto" | ||||||||||||||
"github.com/prometheus/client_golang/prometheus/promhttp" | ||||||||||||||
ctrl "sigs.k8s.io/controller-runtime" | ||||||||||||||
"sigs.k8s.io/controller-runtime/pkg/cache" | ||||||||||||||
"sigs.k8s.io/controller-runtime/pkg/certwatcher" | ||||||||||||||
"sigs.k8s.io/controller-runtime/pkg/healthz" | ||||||||||||||
"sigs.k8s.io/controller-runtime/pkg/log/zap" | ||||||||||||||
"sigs.k8s.io/controller-runtime/pkg/metrics" | ||||||||||||||
"sigs.k8s.io/controller-runtime/pkg/webhook" | ||||||||||||||
|
||||||||||||||
ssp "kubevirt.io/ssp-operator/api/v1beta2" | ||||||||||||||
"kubevirt.io/ssp-operator/controllers" | ||||||||||||||
"kubevirt.io/ssp-operator/internal/common" | ||||||||||||||
common_templates "kubevirt.io/ssp-operator/internal/operands/common-templates" | ||||||||||||||
|
@@ -66,53 +72,121 @@ const ( | |||||||||||||
webhookPort = 9443 | ||||||||||||||
) | ||||||||||||||
|
||||||||||||||
func runPrometheusServer(metricsAddr string, tlsOptions common.SSPTLSOptions) error { | ||||||||||||||
// This callback executes on each client call returning a new config to be used | ||||||||||||||
// please be aware that the APIServer is using http keepalive so this is going to | ||||||||||||||
// be executed only after a while for fresh connections and not on existing ones | ||||||||||||||
func getConfigForClient(ctx context.Context, cfg *tls.Config, cache cache.Cache) (*tls.Config, error) { | ||||||||||||||
var sspList ssp.SSPList | ||||||||||||||
err := cache.List(ctx, &sspList) | ||||||||||||||
if err != nil { | ||||||||||||||
return nil, err | ||||||||||||||
} | ||||||||||||||
|
||||||||||||||
if len(sspList.Items) == 0 || sspList.Items[0].Spec.TLSSecurityProfile == nil { | ||||||||||||||
cfg.MinVersion = crypto.DefaultTLSVersion() | ||||||||||||||
cfg.CipherSuites = nil | ||||||||||||||
return cfg, nil | ||||||||||||||
} | ||||||||||||||
|
||||||||||||||
tlsProfile := sspList.Items[0].Spec.TLSSecurityProfile | ||||||||||||||
if tlsProfile.Type == ocpconfigv1.TLSProfileCustomType { | ||||||||||||||
minVersion, err := crypto.TLSVersion(string(tlsProfile.Custom.MinTLSVersion)) | ||||||||||||||
if err != nil { | ||||||||||||||
return nil, err | ||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. When this error is returned, the |
||||||||||||||
} | ||||||||||||||
cfg.MinVersion = minVersion | ||||||||||||||
cfg.CipherSuites = common.CipherIDs(tlsProfile.Custom.Ciphers, &ctrl.Log) | ||||||||||||||
return cfg, nil | ||||||||||||||
} | ||||||||||||||
|
||||||||||||||
minVersion, err := crypto.TLSVersion(string(ocpconfigv1.TLSProfiles[tlsProfile.Type].MinTLSVersion)) | ||||||||||||||
if err != nil { | ||||||||||||||
return nil, err | ||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Similarly here, please keep the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This too? |
||||||||||||||
} | ||||||||||||||
cfg.MinVersion = minVersion | ||||||||||||||
cfg.CipherSuites = common.CipherIDs(ocpconfigv1.TLSProfiles[tlsProfile.Type].Ciphers, &ctrl.Log) | ||||||||||||||
|
||||||||||||||
return cfg, nil | ||||||||||||||
} | ||||||||||||||
|
||||||||||||||
type prometheusServer struct { | ||||||||||||||
cache cache.Cache | ||||||||||||||
certPath string | ||||||||||||||
keyPath string | ||||||||||||||
serverAddress string | ||||||||||||||
} | ||||||||||||||
|
||||||||||||||
// NeedLeaderElection implements the LeaderElectionRunnable interface, which indicates | ||||||||||||||
// the prometheus server doesn't need leader election. | ||||||||||||||
func (s *prometheusServer) NeedLeaderElection() bool { | ||||||||||||||
return false | ||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What's the point in always returning false? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This interface is used by the manager to check if the ssp-operator/vendor/sigs.k8s.io/controller-runtime/pkg/manager/manager.go Lines 339 to 344 in 862a7ef
In case of metrics server, we want it to start for all the pods, even ones that fail leader election. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ACK There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Might add a comment about this interface? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you check if this interface is also deprecated? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This interface should not be deprecated. |
||||||||||||||
} | ||||||||||||||
|
||||||||||||||
func (s *prometheusServer) Start(ctx context.Context) error { | ||||||||||||||
akrejcir marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||||||||||
setupLog.Info("Starting Prometheus metrics endpoint server with TLS") | ||||||||||||||
metrics.Registry.MustRegister(common_templates.CommonTemplatesRestored) | ||||||||||||||
metrics.Registry.MustRegister(common.SSPOperatorReconcileSucceeded) | ||||||||||||||
handler := promhttp.HandlerFor(metrics.Registry, promhttp.HandlerOpts{}) | ||||||||||||||
mux := http.NewServeMux() | ||||||||||||||
mux.Handle("/metrics", handler) | ||||||||||||||
|
||||||||||||||
minTlsVersion, err := tlsOptions.MinTLSVersionId() | ||||||||||||||
if err != nil { | ||||||||||||||
return err | ||||||||||||||
server := &http.Server{ | ||||||||||||||
Addr: s.serverAddress, | ||||||||||||||
Handler: mux, | ||||||||||||||
} | ||||||||||||||
|
||||||||||||||
tlsConfig := tls.Config{ | ||||||||||||||
CipherSuites: tlsOptions.CipherIDs(&setupLog), | ||||||||||||||
MinVersion: minTlsVersion, | ||||||||||||||
certWatcher, err := certwatcher.New(s.certPath, s.keyPath) | ||||||||||||||
if err != nil { | ||||||||||||||
return err | ||||||||||||||
} | ||||||||||||||
|
||||||||||||||
server := http.Server{ | ||||||||||||||
Addr: metricsAddr, | ||||||||||||||
Handler: mux, | ||||||||||||||
TLSConfig: &tlsConfig, | ||||||||||||||
} | ||||||||||||||
go func() { | ||||||||||||||
// TODO: change context, so it can be closed when | ||||||||||||||
// this function returns an error | ||||||||||||||
if err := certWatcher.Start(ctx); err != nil { | ||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ideally we should use a different context for the We can do it in a follow-up PR. |
||||||||||||||
setupLog.Error(err, "certificate watcher error") | ||||||||||||||
} | ||||||||||||||
}() | ||||||||||||||
|
||||||||||||||
idleConnsClosed := make(chan struct{}) | ||||||||||||||
go func() { | ||||||||||||||
akrejcir marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||||||||||
err := server.ListenAndServeTLS(path.Join(sdkTLSDir, sdkTLSCrt), path.Join(sdkTLSDir, sdkTLSKey)) | ||||||||||||||
if err != nil { | ||||||||||||||
setupLog.Error(err, "Failed to start Prometheus metrics endpoint server") | ||||||||||||||
// TODO: make sure that the goroutine finishes when | ||||||||||||||
// this function returns an error | ||||||||||||||
<-ctx.Done() | ||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. When below There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I've reconsidered this. Can you add a There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done |
||||||||||||||
setupLog.Info("shutting down Prometheus metrics server") | ||||||||||||||
|
||||||||||||||
if err := server.Shutdown(context.Background()); err != nil { | ||||||||||||||
setupLog.Error(err, "error shutting down the HTTP server") | ||||||||||||||
} | ||||||||||||||
close(idleConnsClosed) | ||||||||||||||
}() | ||||||||||||||
|
||||||||||||||
server.TLSConfig = s.getPrometheusTLSConfig(ctx, certWatcher) | ||||||||||||||
|
||||||||||||||
if err := server.ListenAndServeTLS(s.certPath, s.keyPath); err != nil && err != http.ErrServerClosed { | ||||||||||||||
setupLog.Error(err, "Failed to start Prometheus metrics endpoint server") | ||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The error should be returned from this function. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The documentation for These links are for go |
||||||||||||||
return err | ||||||||||||||
} | ||||||||||||||
|
||||||||||||||
<-idleConnsClosed | ||||||||||||||
return nil | ||||||||||||||
} | ||||||||||||||
|
||||||||||||||
func getWebhookServer(sspTLSOptions common.SSPTLSOptions) *webhook.Server { | ||||||||||||||
// If TLSSecurityProfile is empty, we want to return nil so that the default | ||||||||||||||
// webhook server configuration is used. | ||||||||||||||
if sspTLSOptions.IsEmpty() { | ||||||||||||||
return nil | ||||||||||||||
func (s *prometheusServer) getPrometheusTLSConfig(ctx context.Context, certWatcher *certwatcher.CertWatcher) *tls.Config { | ||||||||||||||
return &tls.Config{ | ||||||||||||||
GetConfigForClient: func(_ *tls.ClientHelloInfo) (*tls.Config, error) { | ||||||||||||||
cfg := &tls.Config{} | ||||||||||||||
cfg.GetCertificate = certWatcher.GetCertificate | ||||||||||||||
return getConfigForClient(ctx, cfg, s.cache) | ||||||||||||||
}, | ||||||||||||||
} | ||||||||||||||
} | ||||||||||||||
|
||||||||||||||
tlsCfgFunc := func(cfg *tls.Config) { | ||||||||||||||
cfg.CipherSuites = sspTLSOptions.CipherIDs(&setupLog) | ||||||||||||||
setupLog.Info("Configured ciphers", "ciphers", cfg.CipherSuites) | ||||||||||||||
func newPrometheusServer(metricsAddr string, cache cache.Cache) *prometheusServer { | ||||||||||||||
return &prometheusServer{ | ||||||||||||||
certPath: path.Join(sdkTLSDir, sdkTLSCrt), | ||||||||||||||
keyPath: path.Join(sdkTLSDir, sdkTLSKey), | ||||||||||||||
cache: cache, | ||||||||||||||
serverAddress: metricsAddr, | ||||||||||||||
} | ||||||||||||||
|
||||||||||||||
funcs := []func(*tls.Config){tlsCfgFunc} | ||||||||||||||
return &webhook.Server{Port: webhookPort, TLSMinVersion: sspTLSOptions.MinTLSVersion, TLSOpts: funcs} | ||||||||||||||
} | ||||||||||||||
|
||||||||||||||
func main() { | ||||||||||||||
|
@@ -127,6 +201,8 @@ func main() { | |||||||||||||
opts := zap.Options{} | ||||||||||||||
opts.BindFlags(flag.CommandLine) | ||||||||||||||
flag.Parse() | ||||||||||||||
metrics.Registry.MustRegister(common_templates.CommonTemplatesRestored) | ||||||||||||||
metrics.Registry.MustRegister(common.SSPOperatorReconcileSucceeded) | ||||||||||||||
|
||||||||||||||
ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts))) | ||||||||||||||
|
||||||||||||||
|
@@ -138,26 +214,24 @@ func main() { | |||||||||||||
|
||||||||||||||
ctx := ctrl.SetupSignalHandler() | ||||||||||||||
|
||||||||||||||
tlsOptions, err := common.GetSspTlsOptions(ctx) | ||||||||||||||
if err != nil { | ||||||||||||||
setupLog.Error(err, "Error while getting tls profile") | ||||||||||||||
os.Exit(1) | ||||||||||||||
} | ||||||||||||||
var mgr ctrl.Manager | ||||||||||||||
|
||||||||||||||
err = runPrometheusServer(metricsAddr, *tlsOptions) | ||||||||||||||
if err != nil { | ||||||||||||||
setupLog.Error(err, "unable to start prometheus server") | ||||||||||||||
os.Exit(1) | ||||||||||||||
getTLSOptsFunc := func(cfg *tls.Config) { | ||||||||||||||
cfg.GetConfigForClient = func(_ *tls.ClientHelloInfo) (*tls.Config, error) { | ||||||||||||||
return getConfigForClient(ctx, cfg, mgr.GetCache()) | ||||||||||||||
} | ||||||||||||||
} | ||||||||||||||
|
||||||||||||||
mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{ | ||||||||||||||
mgr, err = ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{ | ||||||||||||||
Scheme: common.Scheme, | ||||||||||||||
MetricsBindAddress: "0", | ||||||||||||||
HealthProbeBindAddress: probeAddr, | ||||||||||||||
LeaderElection: enableLeaderElection, | ||||||||||||||
LeaderElectionID: leaderElectionID, | ||||||||||||||
// If WebhookServer is set to nil, a default one will be created. | ||||||||||||||
WebhookServer: getWebhookServer(*tlsOptions), | ||||||||||||||
WebhookServer: &webhook.Server{ | ||||||||||||||
Port: webhookPort, | ||||||||||||||
TLSOpts: []func(*tls.Config){getTLSOptsFunc}, | ||||||||||||||
}, | ||||||||||||||
}) | ||||||||||||||
|
||||||||||||||
if err != nil { | ||||||||||||||
|
@@ -171,10 +245,23 @@ func main() { | |||||||||||||
os.Exit(1) | ||||||||||||||
} | ||||||||||||||
} | ||||||||||||||
|
||||||||||||||
metricsServer := newPrometheusServer(metricsAddr, mgr.GetCache()) | ||||||||||||||
if err != nil { | ||||||||||||||
setupLog.Error(err, "unable create Prometheus server") | ||||||||||||||
os.Exit(1) | ||||||||||||||
} | ||||||||||||||
|
||||||||||||||
if err := mgr.Add(metricsServer); err != nil { | ||||||||||||||
setupLog.Error(err, "unable to set up metrics") | ||||||||||||||
os.Exit(1) | ||||||||||||||
} | ||||||||||||||
|
||||||||||||||
if err := mgr.AddReadyzCheck("check", healthz.Ping); err != nil { | ||||||||||||||
setupLog.Error(err, "unable to set up ready check") | ||||||||||||||
os.Exit(1) | ||||||||||||||
} | ||||||||||||||
|
||||||||||||||
if err := mgr.AddHealthzCheck("health", healthz.Ping); err != nil { | ||||||||||||||
setupLog.Error(err, "unable to set up health check") | ||||||||||||||
os.Exit(1) | ||||||||||||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why did you revert back to an explicit API call per callback? Please keep a cached version / a shared informer of the SSP CR.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is the client that is passed from the manager, and it should be using the cache.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It was my idea. The `apiClient` uses a cache internally. It creates a shared informer for any requested resource. So calling the `Get` and `List` methods on the client does not make an API call.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ACK