From eb3b694ce426a48054e23596fe71660780ad69c8 Mon Sep 17 00:00:00 2001 From: mprahl Date: Thu, 30 Nov 2023 15:56:23 -0500 Subject: [PATCH] Stop the NS controller manager during hosted mode uninstalls In hosted mode, there is no guarantee that the managed cluster's API server is available when this controller is being uninstalled. In this case, we can't have any controller-runtime managers connecting to the managed cluster's API server during that time. If there are, the controller can restart without completing the required uninstall cleanup and then can't properly restart since the manager cannot be instantiated. Relates: https://issues.redhat.com/browse/ACM-8826 Signed-off-by: mprahl --- controllers/configurationpolicy_controller.go | 95 +++++++++++-------- main.go | 62 ++++++++---- 2 files changed, 100 insertions(+), 57 deletions(-) diff --git a/controllers/configurationpolicy_controller.go b/controllers/configurationpolicy_controller.go index 0f9353fc..589a59ac 100644 --- a/controllers/configurationpolicy_controller.go +++ b/controllers/configurationpolicy_controller.go @@ -134,6 +134,9 @@ type ConfigurationPolicyReconciler struct { openAPIParser *openapi.CachedOpenAPIParser // A lock when performing actions that are not thread safe (i.e. reassigning object properties). lock sync.RWMutex + // When true, the controller has detected it is being uninstalled and only basic cleanup should be performed before + // exiting. + UninstallMode bool } //+kubebuilder:rbac:groups=*,resources=*,verbs=* @@ -166,7 +169,7 @@ func (r *ConfigurationPolicyReconciler) Reconcile(ctx context.Context, request c // PeriodicallyExecConfigPolicies loops through all configurationpolicies in the target namespace and triggers // template handling for each one. This function drives all the work the configuration policy controller does. func (r *ConfigurationPolicyReconciler) PeriodicallyExecConfigPolicies( - ctx context.Context, freq uint, elected <-chan struct{}, + ctx context.Context, freq uint, elected <-chan struct{}, uninstallDetected context.CancelFunc, ) { log.Info("Waiting for leader election before periodically evaluating configuration policies") @@ -212,11 +215,24 @@ func (r *ConfigurationPolicyReconciler) PeriodicallyExecConfigPolicies( _ = r.refreshDiscoveryInfo() } - cleanupImmediately, err := r.cleanupImmediately() - if err != nil { - log.Error(err, "Failed to determine if it's time to cleanup immediately") + cleanupImmediately := r.UninstallMode + + if !r.UninstallMode { + uninstalling, crdDeleting, err := ShouldCleanupImmediately(r.Client) + if !uninstalling && !crdDeleting && err != nil { + log.Error(err, "Failed to determine if it's time to cleanup immediately") + + skipLoop = true + } + + cleanupImmediately = crdDeleting - skipLoop = true + if uninstalling { + r.UninstallMode = uninstalling + cleanupImmediately = true + + uninstallDetected() + } } if cleanupImmediately || !skipLoop { @@ -667,44 +683,41 @@ func (r *ConfigurationPolicyReconciler) cleanUpChildObjects(plc policyv1.Configu return deletionFailures } -// cleanupImmediately returns true when the cluster is in a state where configurationpolicies should +// ShouldCleanupImmediately returns true when the cluster is in a state where configurationpolicies should // be removed as soon as possible, ignoring the pruneObjectBehavior of the policies. This is the // case when the controller is being uninstalled or the CRD is being deleted. 
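To make the control flow in the hunk above easier to follow: main.go (further down in this patch) derives uninstallingCtx from the signal-handler context and hands its cancel function to PeriodicallyExecConfigPolicies as the new uninstallDetected parameter; once the loop detects an uninstall, it latches UninstallMode and fires the cancel. Below is a minimal, self-contained sketch of that signaling pattern. periodicallyCheck and beingUninstalled are hypothetical stand-ins for PeriodicallyExecConfigPolicies and the real Deployment check, not code from this patch:

    package main

    import (
    	"context"
    	"fmt"
    	"time"
    )

    // periodicallyCheck stands in for PeriodicallyExecConfigPolicies: it polls an
    // uninstall signal and, once detected, calls the provided cancel function so
    // that anything started with the derived context shuts down.
    func periodicallyCheck(ctx context.Context, uninstallDetected context.CancelFunc) {
    	ticker := time.NewTicker(50 * time.Millisecond)
    	defer ticker.Stop()

    	for {
    		select {
    		case <-ctx.Done():
    			return
    		case <-ticker.C:
    			if beingUninstalled() {
    				uninstallDetected()

    				return
    			}
    		}
    	}
    }

    // beingUninstalled is a placeholder for the real check against the Deployment.
    func beingUninstalled() bool { return true }

    func main() {
    	// terminatingCtx plays the role of ctrl.SetupSignalHandler() in main.go.
    	terminatingCtx, stop := context.WithCancel(context.Background())
    	defer stop()

    	// uninstallingCtx is derived from terminatingCtx, so a termination signal
    	// cancels it too; uninstall detection cancels only uninstallingCtx.
    	uninstallingCtx, uninstallingCtxCancel := context.WithCancel(terminatingCtx)

    	go periodicallyCheck(terminatingCtx, uninstallingCtxCancel)

    	<-uninstallingCtx.Done()
    	fmt.Println("uninstall detected; the target-cluster manager would now stop")
    }

Because uninstallingCtx is a child of the termination context, a normal SIGTERM still stops everything; uninstall detection merely adds an earlier trigger for the manager that talks to the managed cluster.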
-func (r *ConfigurationPolicyReconciler) cleanupImmediately() (bool, error) { - beingUninstalled, beingUninstalledErr := r.isBeingUninstalled() - if beingUninstalledErr == nil && beingUninstalled { - return true, nil - } +func ShouldCleanupImmediately(client client.Client) (beingUninstalled bool, crdDeleting bool, err error) { + var beingUninstalledErr error - defDeleting, defErr := r.definitionIsDeleting() - if defErr == nil && defDeleting { - return true, nil - } + beingUninstalled, beingUninstalledErr = IsBeingUninstalled(client) - if beingUninstalledErr == nil && defErr == nil { - // if either was deleting, we would've already returned. - return false, nil + var defErr error + + crdDeleting, defErr = definitionIsDeleting(client) + + if beingUninstalledErr != nil && defErr != nil { + err = fmt.Errorf("%w; %w", beingUninstalledErr, defErr) + } else if beingUninstalledErr != nil { + err = beingUninstalledErr + } else if defErr != nil { + err = defErr } - // At least one had an unexpected error, so the decision can't be made right now - //nolint:errorlint // we can't choose just one of the errors to "correctly" wrap - return false, fmt.Errorf( - "isBeingUninstalled error: '%v', definitionIsDeleting error: '%v'", beingUninstalledErr, defErr, - ) + return } -func (r *ConfigurationPolicyReconciler) definitionIsDeleting() (bool, error) { +func definitionIsDeleting(client client.Client) (bool, error) { key := types.NamespacedName{Name: CRDName} v1def := extensionsv1.CustomResourceDefinition{} - v1err := r.Get(context.TODO(), key, &v1def) + v1err := client.Get(context.TODO(), key, &v1def) if v1err == nil { return (v1def.ObjectMeta.DeletionTimestamp != nil), nil } v1beta1def := extensionsv1beta1.CustomResourceDefinition{} - v1beta1err := r.Get(context.TODO(), key, &v1beta1def) + v1beta1err := client.Get(context.TODO(), key, &v1beta1def) if v1beta1err == nil { return (v1beta1def.DeletionTimestamp != nil), nil } @@ -761,18 +774,24 @@ func (r *ConfigurationPolicyReconciler) handleObjectTemplates(plc policyv1.Confi // object handling for when configurationPolicy is deleted if plc.Spec.PruneObjectBehavior == "DeleteIfCreated" || plc.Spec.PruneObjectBehavior == "DeleteAll" { - cleanupNow, err := r.cleanupImmediately() - if err != nil { - log.Error(err, "Error determining whether to cleanup immediately, requeueing policy") + var crdDeleting bool - return + if !r.UninstallMode { + var err error + + crdDeleting, err = definitionIsDeleting(r.Client) + if err != nil { + log.Error(err, "Error determining whether to cleanup immediately, requeueing policy") + + return + } } - if cleanupNow { + if r.UninstallMode || crdDeleting { if objHasFinalizer(&plc, pruneObjectFinalizer) { patch := removeObjFinalizerPatch(&plc, pruneObjectFinalizer) - err = r.Patch(context.TODO(), &plc, client.RawPatch(types.JSONPatchType, patch)) + err := r.Patch(context.TODO(), &plc, client.RawPatch(types.JSONPatchType, patch)) if err != nil { log.Error(err, "Error removing finalizer for configuration policy") @@ -815,7 +834,7 @@ func (r *ConfigurationPolicyReconciler) handleObjectTemplates(plc policyv1.Confi patch := removeObjFinalizerPatch(&plc, pruneObjectFinalizer) - err = r.Patch(context.TODO(), &plc, client.RawPatch(types.JSONPatchType, patch)) + err := r.Patch(context.TODO(), &plc, client.RawPatch(types.JSONPatchType, patch)) if err != nil { log.Error(err, "Error removing finalizer for configuration policy") @@ -3043,7 +3062,7 @@ func convertPolicyStatusToString(plc *policyv1.ConfigurationPolicy) string { // getDeployment gets the 
Deployment object associated with this controller. If the controller is running outside of
// a cluster, no Deployment object or error will be returned.
-func (r *ConfigurationPolicyReconciler) getDeployment() (*appsv1.Deployment, error) {
+func getDeployment(client client.Client) (*appsv1.Deployment, error) {
 	key, err := common.GetOperatorNamespacedName()
 	if err != nil {
 		// Running locally
@@ -3055,15 +3074,15 @@ func (r *ConfigurationPolicyReconciler) getDeployment() (*appsv1.Deployment, err
 	}
 
 	deployment := appsv1.Deployment{}
-	if err := r.Client.Get(context.TODO(), key, &deployment); err != nil {
+	if err := client.Get(context.TODO(), key, &deployment); err != nil {
 		return nil, err
 	}
 
 	return &deployment, nil
 }
 
-func (r *ConfigurationPolicyReconciler) isBeingUninstalled() (bool, error) {
-	deployment, err := r.getDeployment()
+func IsBeingUninstalled(client client.Client) (bool, error) {
+	deployment, err := getDeployment(client)
 	if deployment == nil || err != nil {
 		return false, err
 	}
@@ -3074,7 +3093,7 @@ func (r *ConfigurationPolicyReconciler) isBeingUninstalled() (bool, error) {
 // removeLegacyDeploymentFinalizer removes the policy.open-cluster-management.io/delete-related-objects on the
 // Deployment object. This finalizer is no longer needed on the Deployment object, so it is removed.
 func (r *ConfigurationPolicyReconciler) removeLegacyDeploymentFinalizer() error {
-	deployment, err := r.getDeployment()
+	deployment, err := getDeployment(r.Client)
 	if deployment == nil || err != nil {
 		return err
 	}
diff --git a/main.go b/main.go
index 311752d7..c8fe9e18 100644
--- a/main.go
+++ b/main.go
@@ -278,6 +278,19 @@ func main() {
 		os.Exit(1)
 	}
 
+	terminatingCtx := ctrl.SetupSignalHandler()
+
+	uninstallingCtx, uninstallingCtxCancel := context.WithCancel(terminatingCtx)
+
+	beingUninstalled, err := controllers.IsBeingUninstalled(mgr.GetClient())
+	if err != nil {
+		log.Error(err, "Failed to determine if the controller is being uninstalled at startup. Will assume it's not.")
+	}
+
+	if beingUninstalled {
+		uninstallingCtxCancel()
+	}
+
 	var targetK8sClient kubernetes.Interface
 	var targetK8sDynamicClient dynamic.Interface
 	var targetK8sConfig *rest.Config
@@ -303,16 +316,21 @@ func main() {
 		targetK8sClient = kubernetes.NewForConfigOrDie(targetK8sConfig)
 		targetK8sDynamicClient = dynamic.NewForConfigOrDie(targetK8sConfig)
 
-		nsSelMgr, err = manager.New(targetK8sConfig, manager.Options{
-			NewCache: cache.BuilderWithOptions(cache.Options{
-				TransformByObject: map[client.Object]toolscache.TransformFunc{
-					&corev1.Namespace{}: nsTransform,
-				},
-			}),
-		})
-		if err != nil {
-			log.Error(err, "Unable to create manager from target kube config")
-			os.Exit(1)
+		// The managed cluster's API server is potentially not the same as the hosting cluster's, and it may
+		// already be offline as part of the uninstall process. In that case, instantiating the manager will fail.
+		// This controller is not needed in uninstall mode, so just skip it.
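The beingUninstalled flag consulted below comes from the exported IsBeingUninstalled above: the controller fetches its own Deployment and treats a set deletionTimestamp as the uninstall signal. A condensed sketch of that check, assuming a controller-runtime client and a known Deployment key; unlike the patch, this simplified version also treats a missing Deployment as "not uninstalling" rather than returning the NotFound error:

    package uninstall

    import (
    	"context"

    	appsv1 "k8s.io/api/apps/v1"
    	"k8s.io/apimachinery/pkg/types"
    	"sigs.k8s.io/controller-runtime/pkg/client"
    )

    // isBeingUninstalled reports whether the controller's own Deployment is being
    // deleted, which is the signal that the controller is being uninstalled.
    func isBeingUninstalled(ctx context.Context, c client.Client, key types.NamespacedName) (bool, error) {
    	deployment := appsv1.Deployment{}
    	if err := c.Get(ctx, key, &deployment); err != nil {
    		if client.IgnoreNotFound(err) == nil {
    			// No Deployment at all (e.g. running locally): assume not uninstalling.
    			return false, nil
    		}

    		return false, err
    	}

    	// A set deletionTimestamp means the Deployment is mid-deletion (e.g. waiting
    	// on finalizers), i.e. an uninstall is in progress.
    	return deployment.DeletionTimestamp != nil, nil
    }

Because the check only needs a client and the Deployment key, it can be shared between main.go (once at startup) and the reconciler's periodic loop (each evaluation cycle), which is why the patch turns it into a plain function.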
+		if !beingUninstalled {
+			nsSelMgr, err = manager.New(targetK8sConfig, manager.Options{
+				NewCache: cache.BuilderWithOptions(cache.Options{
+					TransformByObject: map[client.Object]toolscache.TransformFunc{
+						&corev1.Namespace{}: nsTransform,
+					},
+				}),
+			})
+			if err != nil {
+				log.Error(err, "Unable to create manager from target kube config")
+				os.Exit(1)
+			}
 		}
 
 	log.Info(
@@ -323,12 +341,16 @@ func main() {
 
 	instanceName, _ := os.Hostname() // on an error, instanceName will be empty, which is ok
 
-	nsSelReconciler := common.NamespaceSelectorReconciler{
-		Client: nsSelMgr.GetClient(),
-	}
-	if err = nsSelReconciler.SetupWithManager(nsSelMgr); err != nil {
-		log.Error(err, "Unable to create controller", "controller", "NamespaceSelector")
-		os.Exit(1)
+	var nsSelReconciler common.NamespaceSelectorReconciler
+
+	if !beingUninstalled {
+		nsSelReconciler = common.NamespaceSelectorReconciler{
+			Client: nsSelMgr.GetClient(),
+		}
+		if err = nsSelReconciler.SetupWithManager(nsSelMgr); err != nil {
+			log.Error(err, "Unable to create controller", "controller", "NamespaceSelector")
+			os.Exit(1)
+		}
 	}
 
 	discoveryClient := discovery.NewDiscoveryClientForConfigOrDie(targetK8sConfig)
@@ -362,6 +384,7 @@ func main() {
 		TargetK8sConfig:    targetK8sConfig,
 		SelectorReconciler: &nsSelReconciler,
 		EnableMetrics:      opts.enableMetrics,
+		UninstallMode:      beingUninstalled,
 	}
 
 	OpReconciler := controllers.OperatorPolicyReconciler{
@@ -392,14 +415,13 @@ func main() {
 		os.Exit(1)
 	}
 
-	terminatingCtx := ctrl.SetupSignalHandler()
 	managerCtx, managerCancel := context.WithCancel(context.Background())
 
 	// PeriodicallyExecConfigPolicies is the go-routine that periodically checks the policies
 	log.Info("Periodically processing Configuration Policies", "frequency", opts.frequency)
 
 	go func() {
-		reconciler.PeriodicallyExecConfigPolicies(terminatingCtx, opts.frequency, mgr.Elected())
+		reconciler.PeriodicallyExecConfigPolicies(terminatingCtx, opts.frequency, mgr.Elected(), uninstallingCtxCancel)
 		managerCancel()
 	}()
@@ -467,7 +489,9 @@ func main() {
 		wg.Add(1)
 
 		go func() {
-			if err := nsSelMgr.Start(managerCtx); err != nil {
+			// Use the uninstallingCtx so that this shuts down when the controller is being uninstalled. This is
+			// important since the managed cluster's API server may become unavailable at this time when in hosted mode.
+			if err := nsSelMgr.Start(uninstallingCtx); err != nil {
 				log.Error(err, "Problem running manager")
 
 				managerCancel()
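The patch ends with the shutdown coordination around nsSelMgr.Start: the manager is bound to uninstallingCtx, and managerCancel() is invoked when it exits with an error so the goroutines sharing managerCtx wind down too. A stripped-down, runnable sketch of that coordination with stub contexts in place of the real managers (not code from this repository; for brevity it propagates the shutdown unconditionally, whereas the patch does so only on error):

    package main

    import (
    	"context"
    	"fmt"
    	"sync"
    )

    func main() {
    	// managerCtx governs the remaining goroutines; uninstallingCtx governs only
    	// the manager that talks to the managed cluster.
    	managerCtx, managerCancel := context.WithCancel(context.Background())
    	uninstallingCtx, uninstallingCtxCancel := context.WithCancel(context.Background())

    	var wg sync.WaitGroup

    	wg.Add(1)

    	go func() {
    		defer wg.Done()

    		// Stands in for nsSelMgr.Start(uninstallingCtx): block until the context
    		// ends, then propagate the shutdown to the rest of the process.
    		<-uninstallingCtx.Done()
    		fmt.Println("namespace selector manager stopped")

    		managerCancel()
    	}()

    	uninstallingCtxCancel() // e.g. the periodic loop detected an uninstall

    	wg.Wait()
    	<-managerCtx.Done()
    	fmt.Println("remaining goroutines shut down")
    }

In the patch itself, uninstallingCtxCancel is the same function handed to PeriodicallyExecConfigPolicies, so either a termination signal or uninstall detection leads to this shutdown path.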