Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: CrashLoopBackOff once tlsProfile changed #640

Merged
merged 1 commit into from
Aug 28, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 0 additions & 25 deletions controllers/ssp_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ package controllers
import (
"context"
"fmt"
"os"
"reflect"
"strconv"
"strings"
Expand Down Expand Up @@ -148,7 +147,6 @@ func (r *sspReconciler) Reconcile(ctx context.Context, req ctrl.Request) (res ct
// Error reading the object - requeue the request.
return ctrl.Result{}, err
}
restartNeeded := r.isRestartNeeded(instance)
r.clearCacheIfNeeded(instance)

sspRequest := &common.Request{
Expand All @@ -163,10 +161,6 @@ func (r *sspReconciler) Reconcile(ctx context.Context, req ctrl.Request) (res ct
CrdList: r.crdList,
}

if restartNeeded {
r.restart(sspRequest)
}

if !isInitialized(sspRequest.Instance) {
err := initialize(sspRequest)
// No need to requeue here, because
Expand Down Expand Up @@ -235,25 +229,6 @@ func (r *sspReconciler) Reconcile(ctx context.Context, req ctrl.Request) (res ct
return ctrl.Result{}, nil
}

func (r *sspReconciler) isRestartNeeded(sspObj *ssp.SSP) bool {
if reflect.DeepEqual(r.lastSspSpec, ssp.SSPSpec{}) {
return false
}
if !reflect.DeepEqual(r.lastSspSpec.TLSSecurityProfile, sspObj.Spec.TLSSecurityProfile) {
return true
}
return false
}

func (r *sspReconciler) restart(request *common.Request) {
r.log.Info("TLSSecurityProfile changed, restarting")
err := setSspResourceDeploying(request)
if err != nil {
r.log.Info("Error at setSspResourceDeploying", "Error: ", err)
}
os.Exit(0)
}

func (r *sspReconciler) clearCacheIfNeeded(sspObj *ssp.SSP) {
if !reflect.DeepEqual(r.lastSspSpec, sspObj.Spec) {
r.subresourceCache = common.VersionCache{}
Expand Down
35 changes: 2 additions & 33 deletions internal/common/crypto_policy.go
Original file line number Diff line number Diff line change
@@ -1,16 +1,12 @@
package common

import (
"context"
"crypto/tls"
"fmt"

"github.com/go-logr/logr"
ocpv1 "github.com/openshift/api/config/v1"
"github.com/openshift/library-go/pkg/crypto"
ssp "kubevirt.io/ssp-operator/api/v1beta2"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
)

type SSPTLSOptions struct {
Expand Down Expand Up @@ -65,8 +61,8 @@ func GetKnownCipherId(IANACipherName string) (uint16, bool) {
return 0, false
}

func (s *SSPTLSOptions) CipherIDs(logger *logr.Logger) (cipherSuites []uint16) {
for _, cipherName := range crypto.OpenSSLToIANACipherSuites(s.OpenSSLCipherNames) {
func CipherIDs(names []string, logger *logr.Logger) (cipherSuites []uint16) {
for _, cipherName := range crypto.OpenSSLToIANACipherSuites(names) {
if id, ok := GetKnownCipherId(cipherName); ok {
cipherSuites = append(cipherSuites, id)
} else {
Expand All @@ -78,33 +74,6 @@ func (s *SSPTLSOptions) CipherIDs(logger *logr.Logger) (cipherSuites []uint16) {
return
}

func GetSspTlsOptions(ctx context.Context) (*SSPTLSOptions, error) {
setupLog := ctrl.Log.WithName("setup tls options")
restConfig := ctrl.GetConfigOrDie()
apiReader, err := client.New(restConfig, client.Options{Scheme: Scheme})
if err != nil {
return nil, err
}

var sspList ssp.SSPList
if err := apiReader.List(ctx, &sspList, &client.ListOptions{}); err != nil {
return nil, err
}

if len(sspList.Items) == 0 {
setupLog.Info("SSP CR not found, will use default tlsProfile")
return &SSPTLSOptions{}, nil
}

ssp := sspList.Items[0]

sspTLSOptions, err := NewSSPTLSOptions(ssp.Spec.TLSSecurityProfile, &setupLog)
if err != nil {
return nil, err
}
return sspTLSOptions, nil
}

func selectCipherSuitesAndMinTLSVersion(profile *ocpv1.TLSSecurityProfile) ([]string, ocpv1.TLSProtocolVersion) {
if profile == nil {
return nil, ""
Expand Down
2 changes: 1 addition & 1 deletion internal/template-validator/tlsinfo/tlsinfo.go
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@ func (ti *TLSInfo) CreateTlsConfig() *tls.Config {
}

if !ti.sspTLSOptions.IsEmpty() {
tlsConfig.CipherSuites = ti.sspTLSOptions.CipherIDs(nil)
tlsConfig.CipherSuites = common.CipherIDs(ti.sspTLSOptions.OpenSSLCipherNames, nil)
minVersion, err := ti.sspTLSOptions.MinTLSVersionId()
if err != nil {
panic(fmt.Sprintf("TLS Configuration broken, min version misconfigured %v", err))
Expand Down
167 changes: 127 additions & 40 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ limitations under the License.
package main

import (
"context"
"crypto/tls"
"flag"
"fmt"
Expand All @@ -28,13 +29,18 @@ import (
// to ensure that exec-entrypoint and run can make use of them.
_ "k8s.io/client-go/plugin/pkg/client/auth"

ocpconfigv1 "github.com/openshift/api/config/v1"
"github.com/openshift/library-go/pkg/crypto"
"github.com/prometheus/client_golang/prometheus/promhttp"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/cache"
"sigs.k8s.io/controller-runtime/pkg/certwatcher"
"sigs.k8s.io/controller-runtime/pkg/healthz"
"sigs.k8s.io/controller-runtime/pkg/log/zap"
"sigs.k8s.io/controller-runtime/pkg/metrics"
"sigs.k8s.io/controller-runtime/pkg/webhook"

ssp "kubevirt.io/ssp-operator/api/v1beta2"
"kubevirt.io/ssp-operator/controllers"
"kubevirt.io/ssp-operator/internal/common"
common_templates "kubevirt.io/ssp-operator/internal/operands/common-templates"
Expand Down Expand Up @@ -66,53 +72,121 @@ const (
webhookPort = 9443
)

func runPrometheusServer(metricsAddr string, tlsOptions common.SSPTLSOptions) error {
// This callback executes on each client call returning a new config to be used
// please be aware that the APIServer is using http keepalive so this is going to
// be executed only after a while for fresh connections and not on existing ones
func getConfigForClient(ctx context.Context, cfg *tls.Config, cache cache.Cache) (*tls.Config, error) {
var sspList ssp.SSPList
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why did you revert back to an explicit API call per callback? Please keep a cached version / a shared informer of the SSP CR.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the client that is passed from manager and it should be using cache.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It was my idea. The apiClient uses a cache internally. It creates a shared informer for any requested resource. So calling Get and List methods on the client does not make an API call.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ACK

err := cache.List(ctx, &sspList)
if err != nil {
return nil, err
}

if len(sspList.Items) == 0 || sspList.Items[0].Spec.TLSSecurityProfile == nil {
cfg.MinVersion = crypto.DefaultTLSVersion()
cfg.CipherSuites = nil
return cfg, nil
}

tlsProfile := sspList.Items[0].Spec.TLSSecurityProfile
if tlsProfile.Type == ocpconfigv1.TLSProfileCustomType {
minVersion, err := crypto.TLSVersion(string(tlsProfile.Custom.MinTLSVersion))
if err != nil {
return nil, err
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When this error is returned, the cfg parameter has already been modified. Can you change the code, so that on error, the cfg parameter is kept unchanged?

}
cfg.MinVersion = minVersion
cfg.CipherSuites = common.CipherIDs(tlsProfile.Custom.Ciphers, &ctrl.Log)
return cfg, nil
}

minVersion, err := crypto.TLSVersion(string(ocpconfigv1.TLSProfiles[tlsProfile.Type].MinTLSVersion))
if err != nil {
return nil, err
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Similarly here, please keep the cfg unchanged on error.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This too?

}
cfg.MinVersion = minVersion
cfg.CipherSuites = common.CipherIDs(ocpconfigv1.TLSProfiles[tlsProfile.Type].Ciphers, &ctrl.Log)

return cfg, nil
}

type prometheusServer struct {
cache cache.Cache
certPath string
keyPath string
serverAddress string
}

// NeedLeaderElection implements the LeaderElectionRunnable interface, which indicates
// the prometheus server doesn't need leader election.
func (s *prometheusServer) NeedLeaderElection() bool {
return false
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What's the point in always returning false?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This interface is used by the manager to check if the Runnable should run only when leader election succedes:

// LeaderElectionRunnable knows if a Runnable needs to be run in the leader election mode.
type LeaderElectionRunnable interface {
// NeedLeaderElection returns true if the Runnable needs to be run in the leader election mode.
// e.g. controllers need to be run in leader election mode, while webhook server doesn't.
NeedLeaderElection() bool
}

In case of metrics server, we want it to start for all the pods, even ones that fail leader election.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ACK

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Might add a comment about this interface?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you check if this interface is also deprecated?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This interface should not be deprecated.

}

func (s *prometheusServer) Start(ctx context.Context) error {
akrejcir marked this conversation as resolved.
Show resolved Hide resolved
setupLog.Info("Starting Prometheus metrics endpoint server with TLS")
metrics.Registry.MustRegister(common_templates.CommonTemplatesRestored)
metrics.Registry.MustRegister(common.SSPOperatorReconcileSucceeded)
handler := promhttp.HandlerFor(metrics.Registry, promhttp.HandlerOpts{})
mux := http.NewServeMux()
mux.Handle("/metrics", handler)

minTlsVersion, err := tlsOptions.MinTLSVersionId()
if err != nil {
return err
server := &http.Server{
Addr: s.serverAddress,
Handler: mux,
}

tlsConfig := tls.Config{
CipherSuites: tlsOptions.CipherIDs(&setupLog),
MinVersion: minTlsVersion,
certWatcher, err := certwatcher.New(s.certPath, s.keyPath)
if err != nil {
return err
}

server := http.Server{
Addr: metricsAddr,
Handler: mux,
TLSConfig: &tlsConfig,
}
go func() {
// TODO: change context, so it can be closed when
// this function returns an error
if err := certWatcher.Start(ctx); err != nil {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ideally we should use a different context for the certWatcher than ctx, so that it can be closed when this function returns on error.

We can do it in a follow-up PR.

setupLog.Error(err, "certificate watcher error")
}
}()

idleConnsClosed := make(chan struct{})
go func() {
akrejcir marked this conversation as resolved.
Show resolved Hide resolved
err := server.ListenAndServeTLS(path.Join(sdkTLSDir, sdkTLSCrt), path.Join(sdkTLSDir, sdkTLSKey))
if err != nil {
setupLog.Error(err, "Failed to start Prometheus metrics endpoint server")
// TODO: make sure that the goroutine finishes when
// this function returns an error
<-ctx.Done()
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When below server.ListenAndServeTLS() returns because of an error, and this Start() function returns, this goroutine will still be waiting until context is closed.
Can you change the code, so that before the Start() returns, it makes sure that this goroutine has finished?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've reconsidered this.
In this PR, we are already letting the above goroutine leak, from the Start() function, so this one can be left as is.

Can you add a // TODO comment to the code next to both goroutines, and open a new github issue, so that we don't forget to open a followup PR?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

setupLog.Info("shutting down Prometheus metrics server")

if err := server.Shutdown(context.Background()); err != nil {
setupLog.Error(err, "error shutting down the HTTP server")
}
close(idleConnsClosed)
}()

server.TLSConfig = s.getPrometheusTLSConfig(ctx, certWatcher)

if err := server.ListenAndServeTLS(s.certPath, s.keyPath); err != nil && err != http.ErrServerClosed {
setupLog.Error(err, "Failed to start Prometheus metrics endpoint server")
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The error should be returned from this function.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

return err
}

<-idleConnsClosed
return nil
}

func getWebhookServer(sspTLSOptions common.SSPTLSOptions) *webhook.Server {
// If TLSSecurityProfile is empty, we want to return nil so that the default
// webhook server configuration is used.
if sspTLSOptions.IsEmpty() {
return nil
func (s *prometheusServer) getPrometheusTLSConfig(ctx context.Context, certWatcher *certwatcher.CertWatcher) *tls.Config {
return &tls.Config{
GetConfigForClient: func(_ *tls.ClientHelloInfo) (*tls.Config, error) {
cfg := &tls.Config{}
cfg.GetCertificate = certWatcher.GetCertificate
return getConfigForClient(ctx, cfg, s.cache)
},
}
}

tlsCfgFunc := func(cfg *tls.Config) {
cfg.CipherSuites = sspTLSOptions.CipherIDs(&setupLog)
setupLog.Info("Configured ciphers", "ciphers", cfg.CipherSuites)
func newPrometheusServer(metricsAddr string, cache cache.Cache) *prometheusServer {
return &prometheusServer{
certPath: path.Join(sdkTLSDir, sdkTLSCrt),
keyPath: path.Join(sdkTLSDir, sdkTLSKey),
cache: cache,
serverAddress: metricsAddr,
}

funcs := []func(*tls.Config){tlsCfgFunc}
return &webhook.Server{Port: webhookPort, TLSMinVersion: sspTLSOptions.MinTLSVersion, TLSOpts: funcs}
}

func main() {
Expand All @@ -127,6 +201,8 @@ func main() {
opts := zap.Options{}
opts.BindFlags(flag.CommandLine)
flag.Parse()
metrics.Registry.MustRegister(common_templates.CommonTemplatesRestored)
metrics.Registry.MustRegister(common.SSPOperatorReconcileSucceeded)

ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts)))

Expand All @@ -138,26 +214,24 @@ func main() {

ctx := ctrl.SetupSignalHandler()

tlsOptions, err := common.GetSspTlsOptions(ctx)
if err != nil {
setupLog.Error(err, "Error while getting tls profile")
os.Exit(1)
}
var mgr ctrl.Manager

err = runPrometheusServer(metricsAddr, *tlsOptions)
if err != nil {
setupLog.Error(err, "unable to start prometheus server")
os.Exit(1)
getTLSOptsFunc := func(cfg *tls.Config) {
cfg.GetConfigForClient = func(_ *tls.ClientHelloInfo) (*tls.Config, error) {
return getConfigForClient(ctx, cfg, mgr.GetCache())
}
}

mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{
mgr, err = ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{
Scheme: common.Scheme,
MetricsBindAddress: "0",
HealthProbeBindAddress: probeAddr,
LeaderElection: enableLeaderElection,
LeaderElectionID: leaderElectionID,
// If WebhookServer is set to nil, a default one will be created.
WebhookServer: getWebhookServer(*tlsOptions),
WebhookServer: &webhook.Server{
Port: webhookPort,
TLSOpts: []func(*tls.Config){getTLSOptsFunc},
},
})

if err != nil {
Expand All @@ -171,10 +245,23 @@ func main() {
os.Exit(1)
}
}

metricsServer := newPrometheusServer(metricsAddr, mgr.GetCache())
if err != nil {
setupLog.Error(err, "unable create Prometheus server")
os.Exit(1)
}

if err := mgr.Add(metricsServer); err != nil {
setupLog.Error(err, "unable to set up metrics")
os.Exit(1)
}

if err := mgr.AddReadyzCheck("check", healthz.Ping); err != nil {
setupLog.Error(err, "unable to set up ready check")
os.Exit(1)
}

if err := mgr.AddHealthzCheck("health", healthz.Ping); err != nil {
setupLog.Error(err, "unable to set up health check")
os.Exit(1)
Expand Down
Loading
Loading