From e60e939aa45c74a404e31c95b14a463f86b95898 Mon Sep 17 00:00:00 2001
From: Justin Kulikauskas
Date: Thu, 19 Jan 2023 14:15:48 -0500
Subject: [PATCH] Clean up when deployment is being deleted

Previously, when the controller was removed, it could leave a mess
behind in the form of ConfigurationPolicy objects with finalizers on
them. That would also result in the CRD being stuck.

Now, the controller will put a finalizer on its own deployment when at
least one ConfigurationPolicy has a finalizer. Also, while the
deployment is being deleted, the finalizers on ConfigurationPolicies
will be removed immediately, in the same way they are when the CRD is
being deleted. Note: in this implementation, related objects might
*never* be pruned when the controller is removed.

Since the controller deployment is in a different namespace than the
ConfigurationPolicies it watches, the controller's cache cannot be
limited to a single namespace the way it was before. Instead, this uses
additional cache selectors to limit watches on ConfigurationPolicies
and Deployments to the relevant namespaces.

Refs:
 - https://issues.redhat.com/browse/ACM-2923

Signed-off-by: Justin Kulikauskas
---
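Notes (not part of the commit message): the finalizer bookkeeping added in
PeriodicallyExecConfigPolicies is small but easy to misread in diff form.
Below is a standalone, compilable sketch of the idea, assuming the helper
names from this patch; the `policies` slice and the main() wrapper are
illustrative stand-ins for policiesList.Items and the periodic loop, and the
in-memory mutation stands in for manageDeploymentFinalizer's Get/Update
round trip against the cluster.

  package main

  import (
  	"fmt"

  	appsv1 "k8s.io/api/apps/v1"
  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
  )

  const pruneObjectFinalizer = "policy.open-cluster-management.io/delete-related-objects"

  // objHasFinalizer mirrors the generalized helper in
  // controllers/configurationpolicy_utils.go: it takes any metav1.Object,
  // so the same helpers work for policies and for the Deployment.
  func objHasFinalizer(obj metav1.Object, finalizer string) bool {
  	for _, existing := range obj.GetFinalizers() {
  		if existing == finalizer {
  			return true
  		}
  	}

  	return false
  }

  func main() {
  	// Stand-ins for policiesList.Items: only policy-b still has the
  	// prune finalizer, so the deployment finalizer must stay set.
  	policies := []metav1.ObjectMeta{
  		{Name: "policy-a"},
  		{Name: "policy-b", Finalizers: []string{pruneObjectFinalizer}},
  	}

  	needDeploymentFinalizer := false

  	for i := range policies {
  		if objHasFinalizer(&policies[i], pruneObjectFinalizer) {
  			needDeploymentFinalizer = true

  			break
  		}
  	}

  	deployment := appsv1.Deployment{}

  	if needDeploymentFinalizer && !objHasFinalizer(&deployment, pruneObjectFinalizer) {
  		deployment.SetFinalizers(append(deployment.GetFinalizers(), pruneObjectFinalizer))
  	}

  	// Prints the prune finalizer: deleting the deployment now blocks
  	// until the controller clears the remaining policy finalizers.
  	fmt.Println("deployment finalizers:", deployment.GetFinalizers())
  }

Conversely, once the deployment has a deletion timestamp, cleanupImmediately
reports true and the controller strips the policy finalizers without pruning
related objects, which is what finally lets the deployment (and the CRD) go
away.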
 controllers/configurationpolicy_controller.go | 147 ++++++++++++++----
 controllers/configurationpolicy_utils.go      |  18 ++-
 main.go                                       |  80 ++++++----
 pkg/common/operator-sdk-port.go               |  38 +++++
 4 files changed, 221 insertions(+), 62 deletions(-)

diff --git a/controllers/configurationpolicy_controller.go b/controllers/configurationpolicy_controller.go
index 83d060ef..80fe6686 100644
--- a/controllers/configurationpolicy_controller.go
+++ b/controllers/configurationpolicy_controller.go
@@ -18,6 +18,7 @@ import (
 	gocmp "github.com/google/go-cmp/cmp"
 	"github.com/prometheus/client_golang/prometheus"
 	templates "github.com/stolostron/go-template-utils/v3/pkg/templates"
+	appsv1 "k8s.io/api/apps/v1"
 	corev1 "k8s.io/api/core/v1"
 	extensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
 	extensionsv1beta1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1beta1"
@@ -51,6 +52,7 @@ const (
 	ControllerName string = "configuration-policy-controller"
 	CRDName        string = "configurationpolicies.policy.open-cluster-management.io"
 	complianceStatusConditionLimit = 10
+	pruneObjectFinalizer string = "policy.open-cluster-management.io/delete-related-objects"
 )
 
 var log = ctrl.Log.WithName(ControllerName)
@@ -188,6 +190,26 @@ func (r *ConfigurationPolicyReconciler) PeriodicallyExecConfigPolicies(freq uint
 			skipLoop = true
 		}
 
+		needDeploymentFinalizer := false
+
+		for _, plc := range policiesList.Items {
+			if objHasFinalizer(&plc, pruneObjectFinalizer) {
+				needDeploymentFinalizer = true
+
+				break
+			}
+		}
+
+		if err := r.manageDeploymentFinalizer(needDeploymentFinalizer); err != nil {
+			if errors.Is(err, common.ErrNoNamespace) || errors.Is(err, common.ErrRunLocal) {
+				log.Info("Not managing the controller's deployment finalizer because it is running locally")
+			} else {
+				log.Error(err, "Failed to manage the controller's deployment finalizer, skipping loop")
+
+				skipLoop = true
+			}
+		}
+
 		// This is done every loop cycle since the channel needs to be variable in size to account for the number of
 		// policies changing.
 		policyQueue := make(chan *policyv1.ConfigurationPolicy, len(policiesList.Items))
@@ -565,6 +587,52 @@ func (r *ConfigurationPolicyReconciler) cleanUpChildObjects(plc policyv1.Configu
 	return deletionFailures
 }
 
+// cleanupImmediately returns true when the cluster is in a state where configurationpolicies should
+// be removed as soon as possible, ignoring the pruneObjectBehavior of the policies. This is the
+// case when the CRD or the controller's deployment is already being deleted.
+func (r *ConfigurationPolicyReconciler) cleanupImmediately() (bool, error) {
+	deployDeleting, deployErr := r.deploymentIsDeleting()
+	if deployErr == nil && deployDeleting {
+		return true, nil
+	}
+
+	defDeleting, defErr := r.definitionIsDeleting()
+	if defErr == nil && defDeleting {
+		return true, nil
+	}
+
+	if deployErr == nil && defErr == nil {
+		// if either was deleting, we would've already returned.
+		return false, nil
+	}
+
+	// At least one had an unexpected error, so the decision can't be made right now
+	//nolint:errorlint // we can't choose just one of the errors to "correctly" wrap
+	return false, fmt.Errorf("deploymentIsDeleting error: '%v', definitionIsDeleting error: '%v'",
+		deployErr, defErr)
+}
+
+func (r *ConfigurationPolicyReconciler) deploymentIsDeleting() (bool, error) {
+	key, keyErr := common.GetOperatorNamespacedName()
+	if keyErr != nil {
+		if errors.Is(keyErr, common.ErrNoNamespace) || errors.Is(keyErr, common.ErrRunLocal) {
+			// running locally
+			return false, nil
+		}
+
+		return false, keyErr
+	}
+
+	deployment := appsv1.Deployment{}
+
+	err := r.Get(context.TODO(), key, &deployment)
+	if err != nil {
+		return false, err
+	}
+
+	return deployment.DeletionTimestamp != nil, nil
+}
+
 func (r *ConfigurationPolicyReconciler) definitionIsDeleting() (bool, error) {
 	key := types.NamespacedName{Name: CRDName}
 	v1def := extensionsv1.CustomResourceDefinition{}
@@ -631,13 +699,31 @@ func (r *ConfigurationPolicyReconciler) handleObjectTemplates(plc policyv1.Confi
 		return
 	}
 
-	pruneObjectFinalizer := "policy.open-cluster-management.io/delete-related-objects"
-
 	// object handling for when configurationPolicy is deleted
 	if plc.Spec.PruneObjectBehavior == "DeleteIfCreated" || plc.Spec.PruneObjectBehavior == "DeleteAll" {
+		cleanupNow, err := r.cleanupImmediately()
+		if err != nil {
+			log.Error(err, "Error determining whether to clean up immediately, requeueing policy")
+
+			return
+		}
+
+		if cleanupNow {
+			if objHasFinalizer(&plc, pruneObjectFinalizer) {
+				plc.SetFinalizers(removeObjFinalizer(&plc, pruneObjectFinalizer))
+
+				err := r.Update(context.TODO(), &plc)
+				if err != nil {
+					log.V(1).Error(err, "Error removing finalizer for configuration policy", plc)
+				}
+			}
+
+			return
+		}
+
 		// set finalizer if it hasn't been set
-		if !configPlcHasFinalizer(plc, pruneObjectFinalizer) {
-			plc.SetFinalizers(addConfigPlcFinalizer(plc, pruneObjectFinalizer))
+		if !objHasFinalizer(&plc, pruneObjectFinalizer) {
+			plc.SetFinalizers(addObjFinalizer(&plc, pruneObjectFinalizer))
 
 			err := r.Update(context.TODO(), &plc)
 			if err != nil {
@@ -647,32 +733,13 @@ func (r *ConfigurationPolicyReconciler) handleObjectTemplates(plc policyv1.Confi
 
 		// kick off object deletion if configurationPolicy has been deleted
 		if plc.ObjectMeta.DeletionTimestamp != nil {
-			// If the CRD is deleting, don't prune objects
-			crdDeleting, err := r.definitionIsDeleting()
-			if err != nil {
-				log.Error(err, "Error getting configurationpolicies CRD, requeueing policy")
-
-				return
-			} else if crdDeleting {
-				log.V(1).Info("The configuraionpolicy CRD is being deleted, ignoring and removing " +
-					"the delete-related-objects finalizer")
-
-				plc.SetFinalizers(removeConfigPlcFinalizer(plc, pruneObjectFinalizer))
-				err := r.Update(context.TODO(), &plc)
-				if err != nil {
-					log.V(1).Error(err, "Error unsetting finalizer for configuration policy", plc)
-				}
-
-				return
-			} // else: CRD is not being deleted
-
 			log.V(1).Info("Config policy has been deleted, handling child objects")
 
 			failures := r.cleanUpChildObjects(plc)
 
 			if len(failures) == 0 {
 				log.V(1).Info("Objects have been successfully cleaned up, removing finalizer")
 
-				plc.SetFinalizers(removeConfigPlcFinalizer(plc, pruneObjectFinalizer))
+				plc.SetFinalizers(removeObjFinalizer(&plc, pruneObjectFinalizer))
 
 				err := r.Update(context.TODO(), &plc)
 				if err != nil {
@@ -704,9 +771,9 @@ func (r *ConfigurationPolicyReconciler) handleObjectTemplates(plc policyv1.Confi
 
 			return
 		}
-	} else if configPlcHasFinalizer(plc, pruneObjectFinalizer) {
+	} else if objHasFinalizer(&plc, pruneObjectFinalizer) {
 		// if pruneObjectBehavior is none, no finalizer is needed
-		plc.SetFinalizers(removeConfigPlcFinalizer(plc, pruneObjectFinalizer))
+		plc.SetFinalizers(removeObjFinalizer(&plc, pruneObjectFinalizer))
 		err := r.Update(context.TODO(), &plc)
 		if err != nil {
 			log.V(1).Error(err, "Error unsetting finalizer for configuration policy", plc)
 		}
@@ -2672,3 +2739,31 @@ func convertPolicyStatusToString(plc *policyv1.ConfigurationPolicy) (results str
 
 	return result
 }
+
+func (r *ConfigurationPolicyReconciler) manageDeploymentFinalizer(shouldBeSet bool) error {
+	key, err := common.GetOperatorNamespacedName()
+	if err != nil {
+		return err
+	}
+
+	deployment := appsv1.Deployment{}
+	if err := r.Client.Get(context.TODO(), key, &deployment); err != nil {
+		return err
+	}
+
+	if objHasFinalizer(&deployment, pruneObjectFinalizer) {
+		if shouldBeSet {
+			return nil
+		}
+
+		deployment.SetFinalizers(removeObjFinalizer(&deployment, pruneObjectFinalizer))
+	} else {
+		if !shouldBeSet {
+			return nil
+		}
+
+		deployment.SetFinalizers(addObjFinalizer(&deployment, pruneObjectFinalizer))
+	}
+
+	return r.Update(context.TODO(), &deployment)
+}
diff --git a/controllers/configurationpolicy_utils.go b/controllers/configurationpolicy_utils.go
index cd29512d..d1de1d78 100644
--- a/controllers/configurationpolicy_utils.go
+++ b/controllers/configurationpolicy_utils.go
@@ -10,6 +10,7 @@ import (
 	"strings"
 
 	apiRes "k8s.io/apimachinery/pkg/api/resource"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
 	"k8s.io/apimachinery/pkg/runtime/schema"
 	"k8s.io/apimachinery/pkg/util/json"
@@ -530,8 +531,8 @@ func sortAndJoinKeys(m map[string]bool, sep string) string {
 	return strings.Join(keys, sep)
 }
 
-func configPlcHasFinalizer(plc policyv1.ConfigurationPolicy, finalizer string) bool {
-	for _, existingFinalizer := range plc.GetFinalizers() {
+func objHasFinalizer(obj metav1.Object, finalizer string) bool {
+	for _, existingFinalizer := range obj.GetFinalizers() {
 		if existingFinalizer == finalizer {
 			return true
 		}
@@ -540,18 +541,19 @@ func configPlcHasFinalizer(plc policyv1.ConfigurationPolicy, finalizer string) b
 	return false
 }
 
-func addConfigPlcFinalizer(plc policyv1.ConfigurationPolicy, finalizer string) []string {
-	if configPlcHasFinalizer(plc, finalizer) {
-		return plc.GetFinalizers()
+func addObjFinalizer(obj metav1.Object, finalizer string) []string {
+	if objHasFinalizer(obj, finalizer) {
+		return obj.GetFinalizers()
 	}
 
-	return append(plc.GetFinalizers(), finalizer)
+	return append(obj.GetFinalizers(), finalizer)
 }
 
-func removeConfigPlcFinalizer(plc policyv1.ConfigurationPolicy, finalizer string) []string {
+// nolint: unparam
+func removeObjFinalizer(obj metav1.Object, finalizer string) []string {
 	result := []string{}
 
-	for _, existingFinalizer := range plc.GetFinalizers() {
+	for _, existingFinalizer := range obj.GetFinalizers() {
 		if existingFinalizer != finalizer {
 			result = append(result, existingFinalizer)
 		}
diff --git a/main.go b/main.go
index 4cb18805..c57f04ec 100644
--- a/main.go
+++ b/main.go
@@ -15,9 +15,8 @@ import (
 	"github.com/go-logr/zapr"
 	"github.com/spf13/pflag"
 	"github.com/stolostron/go-log-utils/zaputil"
+	appsv1 "k8s.io/api/apps/v1"
 	corev1 "k8s.io/api/core/v1"
-
-	// to ensure that exec-entrypoint and run can make use of them.
 	extensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
 	extensionsv1beta1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1beta1"
 	"k8s.io/apimachinery/pkg/fields"
@@ -26,6 +25,8 @@ import (
 	"k8s.io/client-go/dynamic"
 	"k8s.io/client-go/kubernetes"
 	clientgoscheme "k8s.io/client-go/kubernetes/scheme"
+
+	// Import all k8s client auth plugins to ensure that exec-entrypoint and run can use them
 	_ "k8s.io/client-go/plugin/pkg/client/auth"
 	"k8s.io/client-go/rest"
 	"k8s.io/client-go/tools/clientcmd"
@@ -147,14 +148,6 @@ func main() {
 
 	printVersion()
 
-	namespace, err := common.GetWatchNamespace()
-	if err != nil {
-		log.Error(err, "Failed to get watch namespace")
-		os.Exit(1)
-	}
-
-	log.V(2).Info("Configured the watch namespace", "namespace", namespace)
-
 	// Get a config to talk to the apiserver
 	cfg, err := config.GetConfig()
 	if err != nil {
@@ -163,29 +156,65 @@ func main() {
 	}
 
 	// Set a field selector so that a watch on CRDs will be limited to just the configuration policy CRD.
-	newCacheFunc := cache.BuilderWithOptions(
-		cache.Options{
-			SelectorsByObject: cache.SelectorsByObject{
-				&extensionsv1.CustomResourceDefinition{}: {
-					Field: fields.SelectorFromSet(fields.Set{"metadata.name": controllers.CRDName}),
-				},
-				&extensionsv1beta1.CustomResourceDefinition{}: {
-					Field: fields.SelectorFromSet(fields.Set{"metadata.name": controllers.CRDName}),
-				},
-			},
-		},
-	)
+	cacheSelectors := cache.SelectorsByObject{
+		&extensionsv1.CustomResourceDefinition{}: {
+			Field: fields.SelectorFromSet(fields.Set{"metadata.name": controllers.CRDName}),
+		},
+		&extensionsv1beta1.CustomResourceDefinition{}: {
+			Field: fields.SelectorFromSet(fields.Set{"metadata.name": controllers.CRDName}),
+		},
+	}
+
+	ctrlKey, err := common.GetOperatorNamespacedName()
+	if err != nil {
+		if errors.Is(err, common.ErrNoNamespace) || errors.Is(err, common.ErrRunLocal) {
+			log.Info("Running locally, skipping restrictions on the Deployment cache")
+		} else {
+			log.Error(err, "Failed to identify the controller's deployment")
+			os.Exit(1)
+		}
+	} else {
+		cacheSelectors[&appsv1.Deployment{}] = cache.ObjectSelector{
+			Field: fields.SelectorFromSet(fields.Set{
+				"metadata.namespace": ctrlKey.Namespace,
+				"metadata.name":      ctrlKey.Name,
+			}),
+		}
+	}
+
+	watchNamespace, err := common.GetWatchNamespace()
+	if err != nil {
+		log.Error(err, "Failed to get watch namespace")
+		os.Exit(1)
+	}
+
+	if strings.Contains(watchNamespace, ",") {
+		err = fmt.Errorf("multiple watched namespaces are not allowed for this controller")
+		log.Error(err, "Failed to get watch namespace")
+		os.Exit(1)
+	}
+
+	log.V(2).Info("Configured the watch namespace", "watchNamespace", watchNamespace)
+
+	if watchNamespace != "" {
+		cacheSelectors[&policyv1.ConfigurationPolicy{}] = cache.ObjectSelector{
+			Field: fields.SelectorFromSet(fields.Set{
+				"metadata.namespace": watchNamespace,
+			}),
+		}
+	} else {
+		log.Info("Skipping restrictions on the ConfigurationPolicy cache because watchNamespace is empty")
+	}
 
 	// Set default manager options
 	options := manager.Options{
-		Namespace:              namespace,
 		MetricsBindAddress:     metricsAddr,
 		Scheme:                 scheme,
 		Port:                   9443,
 		HealthProbeBindAddress: probeAddr,
 		LeaderElection:         enableLeaderElection,
 		LeaderElectionID:       "config-policy-controller.open-cluster-management.io",
-		NewCache:               newCacheFunc,
+		NewCache:               cache.BuilderWithOptions(cache.Options{SelectorsByObject: cacheSelectors}),
 		// Disable the cache for Secrets to avoid a watch getting created when the `policy-encryption-key`
 		// Secret is retrieved. Special cache handling is done by the controller.
 		ClientDisableCacheFor: []client.Object{&corev1.Secret{}},
@@ -217,11 +246,6 @@ func main() {
 		),
 	}
 
-	if strings.Contains(namespace, ",") {
-		options.Namespace = ""
-		options.NewCache = cache.MultiNamespacedCacheBuilder(strings.Split(namespace, ","))
-	}
-
 	if legacyLeaderElection {
 		// If legacyLeaderElection is enabled, then that means the lease API is not available.
 		// In this case, use the legacy leader election method of a ConfigMap.
diff --git a/pkg/common/operator-sdk-port.go b/pkg/common/operator-sdk-port.go
index 00adfe80..b9631ba9 100644
--- a/pkg/common/operator-sdk-port.go
+++ b/pkg/common/operator-sdk-port.go
@@ -10,6 +10,8 @@ import (
 	"fmt"
 	"os"
 	"strings"
+
+	"k8s.io/apimachinery/pkg/types"
 )
 
 type RunModeType string
@@ -22,6 +24,10 @@ const (
 	// which specifies the Namespace to watch.
 	// An empty value means the operator is running with cluster scope.
 	watchNamespaceEnvVar = "WATCH_NAMESPACE"
+
+	// OperatorNameEnvVar is the constant for env variable OPERATOR_NAME
+	// which is the name of the current operator
+	OperatorNameEnvVar = "OPERATOR_NAME"
 )
 
 // ErrNoNamespace indicates that a namespace could not be found for the current
@@ -63,3 +69,35 @@ func GetOperatorNamespace() (string, error) {
 
 	return strings.TrimSpace(string(nsBytes)), nil
 }
+
+// GetOperatorName returns the operator name
+func GetOperatorName() (string, error) {
+	operatorName, found := os.LookupEnv(OperatorNameEnvVar)
+	if !found {
+		return "", fmt.Errorf("%s must be set", OperatorNameEnvVar)
+	}
+
+	if len(operatorName) == 0 {
+		return "", fmt.Errorf("%s must not be empty", OperatorNameEnvVar)
+	}
+
+	return operatorName, nil
+}
+
+// GetOperatorNamespacedName returns the name and namespace of the operator.
+func GetOperatorNamespacedName() (types.NamespacedName, error) {
+	key := types.NamespacedName{}
+	var err error
+
+	key.Namespace, err = GetOperatorNamespace()
+	if err != nil {
+		return key, err
+	}
+
+	key.Name, err = GetOperatorName()
+	if err != nil {
+		return key, err
+	}
+
+	return key, nil
+}
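
Note for reviewers (not part of the patch): the selector-based cache
restriction in main.go can be exercised on its own. The sketch below uses the
same controller-runtime cache API as the patch (cache.SelectorsByObject with
cache.BuilderWithOptions); the namespace and deployment name literals are
placeholders for what GetWatchNamespace and GetOperatorNamespacedName return
at runtime.

  package main

  import (
  	"fmt"

  	appsv1 "k8s.io/api/apps/v1"
  	extensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
  	"k8s.io/apimachinery/pkg/fields"
  	"sigs.k8s.io/controller-runtime/pkg/cache"
  )

  func main() {
  	// Placeholders; the real values come from the operator's namespace
  	// file and the OPERATOR_NAME environment variable.
  	ctrlNamespace, ctrlName := "controller-ns", "config-policy-controller"

  	cacheSelectors := cache.SelectorsByObject{
  		// Watch only the configurationpolicies CRD rather than all CRDs.
  		&extensionsv1.CustomResourceDefinition{}: {
  			Field: fields.SelectorFromSet(fields.Set{
  				"metadata.name": "configurationpolicies.policy.open-cluster-management.io",
  			}),
  		},
  		// Watch only the controller's own Deployment, so the new
  		// deploymentIsDeleting check doesn't cache every Deployment;
  		// main.go registers a ConfigurationPolicy selector keyed on
  		// WATCH_NAMESPACE in the same way.
  		&appsv1.Deployment{}: {
  			Field: fields.SelectorFromSet(fields.Set{
  				"metadata.namespace": ctrlNamespace,
  				"metadata.name":      ctrlName,
  			}),
  		},
  	}

  	// The manager builds its cache through these selectors, replacing the
  	// old single-namespace / multi-namespace cache builders.
  	newCacheFunc := cache.BuilderWithOptions(cache.Options{SelectorsByObject: cacheSelectors})

  	fmt.Printf("restricting the cache for %d object types (%T)\n", len(cacheSelectors), newCacheFunc)
  }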