diff --git a/controllers/configurationpolicy_controller.go b/controllers/configurationpolicy_controller.go
index 0f9353fc..589a59ac 100644
--- a/controllers/configurationpolicy_controller.go
+++ b/controllers/configurationpolicy_controller.go
@@ -134,6 +134,9 @@ type ConfigurationPolicyReconciler struct {
 	openAPIParser *openapi.CachedOpenAPIParser
 	// A lock when performing actions that are not thread safe (i.e. reassigning object properties).
 	lock sync.RWMutex
+	// When true, the controller has detected it is being uninstalled and only basic cleanup should be performed before
+	// exiting.
+	UninstallMode bool
 }
 
 //+kubebuilder:rbac:groups=*,resources=*,verbs=*
@@ -166,7 +169,7 @@ func (r *ConfigurationPolicyReconciler) Reconcile(ctx context.Context, request c
 // PeriodicallyExecConfigPolicies loops through all configurationpolicies in the target namespace and triggers
 // template handling for each one. This function drives all the work the configuration policy controller does.
 func (r *ConfigurationPolicyReconciler) PeriodicallyExecConfigPolicies(
-	ctx context.Context, freq uint, elected <-chan struct{},
+	ctx context.Context, freq uint, elected <-chan struct{}, uninstallDetected context.CancelFunc,
 ) {
 	log.Info("Waiting for leader election before periodically evaluating configuration policies")
@@ -212,11 +215,24 @@ func (r *ConfigurationPolicyReconciler) PeriodicallyExecConfigPolicies(
 			_ = r.refreshDiscoveryInfo()
 		}
 
-		cleanupImmediately, err := r.cleanupImmediately()
-		if err != nil {
-			log.Error(err, "Failed to determine if it's time to cleanup immediately")
+		cleanupImmediately := r.UninstallMode
+
+		if !r.UninstallMode {
+			uninstalling, crdDeleting, err := ShouldCleanupImmediately(r.Client)
+			if !uninstalling && !crdDeleting && err != nil {
+				log.Error(err, "Failed to determine if it's time to cleanup immediately")
+
+				skipLoop = true
+			}
+
+			cleanupImmediately = crdDeleting
 
-			skipLoop = true
+			if uninstalling {
+				r.UninstallMode = uninstalling
+				cleanupImmediately = true
+
+				uninstallDetected()
+			}
 		}
 
 		if cleanupImmediately || !skipLoop {
@@ -667,44 +683,41 @@ func (r *ConfigurationPolicyReconciler) cleanUpChildObjects(plc policyv1.Configu
 	return deletionFailures
 }
 
-// cleanupImmediately returns true when the cluster is in a state where configurationpolicies should
+// ShouldCleanupImmediately reports when the cluster is in a state where configurationpolicies should
 // be removed as soon as possible, ignoring the pruneObjectBehavior of the policies. This is the
 // case when the controller is being uninstalled or the CRD is being deleted.
-func (r *ConfigurationPolicyReconciler) cleanupImmediately() (bool, error) {
-	beingUninstalled, beingUninstalledErr := r.isBeingUninstalled()
-	if beingUninstalledErr == nil && beingUninstalled {
-		return true, nil
-	}
+func ShouldCleanupImmediately(client client.Client) (beingUninstalled bool, crdDeleting bool, err error) {
+	var beingUninstalledErr error
 
-	defDeleting, defErr := r.definitionIsDeleting()
-	if defErr == nil && defDeleting {
-		return true, nil
-	}
+	beingUninstalled, beingUninstalledErr = IsBeingUninstalled(client)
 
-	if beingUninstalledErr == nil && defErr == nil {
-		// if either was deleting, we would've already returned.
-		return false, nil
+	var defErr error
+
+	crdDeleting, defErr = definitionIsDeleting(client)
+
+	if beingUninstalledErr != nil && defErr != nil {
+		err = fmt.Errorf("%w; %w", beingUninstalledErr, defErr)
+	} else if beingUninstalledErr != nil {
+		err = beingUninstalledErr
+	} else if defErr != nil {
+		err = defErr
 	}
 
-	// At least one had an unexpected error, so the decision can't be made right now
-	//nolint:errorlint // we can't choose just one of the errors to "correctly" wrap
-	return false, fmt.Errorf(
-		"isBeingUninstalled error: '%v', definitionIsDeleting error: '%v'", beingUninstalledErr, defErr,
-	)
+	return
 }
 
-func (r *ConfigurationPolicyReconciler) definitionIsDeleting() (bool, error) {
+func definitionIsDeleting(client client.Client) (bool, error) {
 	key := types.NamespacedName{Name: CRDName}
 	v1def := extensionsv1.CustomResourceDefinition{}
 
-	v1err := r.Get(context.TODO(), key, &v1def)
+	v1err := client.Get(context.TODO(), key, &v1def)
 	if v1err == nil {
 		return (v1def.ObjectMeta.DeletionTimestamp != nil), nil
 	}
 
 	v1beta1def := extensionsv1beta1.CustomResourceDefinition{}
 
-	v1beta1err := r.Get(context.TODO(), key, &v1beta1def)
+	v1beta1err := client.Get(context.TODO(), key, &v1beta1def)
 	if v1beta1err == nil {
 		return (v1beta1def.DeletionTimestamp != nil), nil
 	}
@@ -761,18 +774,24 @@ func (r *ConfigurationPolicyReconciler) handleObjectTemplates(plc policyv1.Confi
 
 	// object handling for when configurationPolicy is deleted
 	if plc.Spec.PruneObjectBehavior == "DeleteIfCreated" || plc.Spec.PruneObjectBehavior == "DeleteAll" {
-		cleanupNow, err := r.cleanupImmediately()
-		if err != nil {
-			log.Error(err, "Error determining whether to cleanup immediately, requeueing policy")
+		var crdDeleting bool
 
-			return
+		if !r.UninstallMode {
+			var err error
+
+			crdDeleting, err = definitionIsDeleting(r.Client)
+			if err != nil {
+				log.Error(err, "Error determining whether to cleanup immediately, requeueing policy")
+
+				return
+			}
 		}
 
-		if cleanupNow {
+		if r.UninstallMode || crdDeleting {
 			if objHasFinalizer(&plc, pruneObjectFinalizer) {
 				patch := removeObjFinalizerPatch(&plc, pruneObjectFinalizer)
 
-				err = r.Patch(context.TODO(), &plc, client.RawPatch(types.JSONPatchType, patch))
+				err := r.Patch(context.TODO(), &plc, client.RawPatch(types.JSONPatchType, patch))
 				if err != nil {
 					log.Error(err, "Error removing finalizer for configuration policy")
 
@@ -815,7 +834,7 @@ func (r *ConfigurationPolicyReconciler) handleObjectTemplates(plc policyv1.Confi
 
 			patch := removeObjFinalizerPatch(&plc, pruneObjectFinalizer)
 
-			err = r.Patch(context.TODO(), &plc, client.RawPatch(types.JSONPatchType, patch))
+			err := r.Patch(context.TODO(), &plc, client.RawPatch(types.JSONPatchType, patch))
 			if err != nil {
 				log.Error(err, "Error removing finalizer for configuration policy")
 
@@ -3043,7 +3062,7 @@ func convertPolicyStatusToString(plc *policyv1.ConfigurationPolicy) string {
 
 // getDeployment gets the Deployment object associated with this controller. If the controller is running outside of
 // a cluster, no Deployment object or error will be returned.
-func (r *ConfigurationPolicyReconciler) getDeployment() (*appsv1.Deployment, error) {
+func getDeployment(client client.Client) (*appsv1.Deployment, error) {
 	key, err := common.GetOperatorNamespacedName()
 	if err != nil {
 		// Running locally
@@ -3055,15 +3074,15 @@ func (r *ConfigurationPolicyReconciler) getDeployment() (*appsv1.Deployment, err
 	}
 
 	deployment := appsv1.Deployment{}
-	if err := r.Client.Get(context.TODO(), key, &deployment); err != nil {
+	if err := client.Get(context.TODO(), key, &deployment); err != nil {
 		return nil, err
 	}
 
 	return &deployment, nil
 }
 
-func (r *ConfigurationPolicyReconciler) isBeingUninstalled() (bool, error) {
-	deployment, err := r.getDeployment()
+func IsBeingUninstalled(client client.Client) (bool, error) {
+	deployment, err := getDeployment(client)
 	if deployment == nil || err != nil {
 		return false, err
 	}
@@ -3074,7 +3093,7 @@ func (r *ConfigurationPolicyReconciler) isBeingUninstalled() (bool, error) {
 // removeLegacyDeploymentFinalizer removes the policy.open-cluster-management.io/delete-related-objects on the
 // Deployment object. This finalizer is no longer needed on the Deployment object, so it is removed.
 func (r *ConfigurationPolicyReconciler) removeLegacyDeploymentFinalizer() error {
-	deployment, err := r.getDeployment()
+	deployment, err := getDeployment(r.Client)
 	if deployment == nil || err != nil {
 		return err
 	}
diff --git a/main.go b/main.go
index 311752d7..c8fe9e18 100644
--- a/main.go
+++ b/main.go
@@ -278,6 +278,19 @@ func main() {
 		os.Exit(1)
 	}
 
+	terminatingCtx := ctrl.SetupSignalHandler()
+
+	uninstallingCtx, uninstallingCtxCancel := context.WithCancel(terminatingCtx)
+
+	beingUninstalled, err := controllers.IsBeingUninstalled(mgr.GetClient())
+	if err != nil {
+		log.Error(err, "Failed to determine if the controller is being uninstalled at startup. Will assume it's not.")
+	}
+
+	if beingUninstalled {
+		uninstallingCtxCancel()
+	}
+
 	var targetK8sClient kubernetes.Interface
 	var targetK8sDynamicClient dynamic.Interface
 	var targetK8sConfig *rest.Config
@@ -303,16 +316,21 @@ func main() {
 		targetK8sClient = kubernetes.NewForConfigOrDie(targetK8sConfig)
 		targetK8sDynamicClient = dynamic.NewForConfigOrDie(targetK8sConfig)
 
-		nsSelMgr, err = manager.New(targetK8sConfig, manager.Options{
-			NewCache: cache.BuilderWithOptions(cache.Options{
-				TransformByObject: map[client.Object]toolscache.TransformFunc{
-					&corev1.Namespace{}: nsTransform,
-				},
-			}),
-		})
-		if err != nil {
-			log.Error(err, "Unable to create manager from target kube config")
-			os.Exit(1)
+		// The managed cluster's API server is potentially not the same as the hosting cluster and it could be
+		// offline already as part of the uninstall process. In this case, the manager's instantiation will fail.
+		// This controller is not needed in uninstall mode, so just skip it.
+		if !beingUninstalled {
+			nsSelMgr, err = manager.New(targetK8sConfig, manager.Options{
+				NewCache: cache.BuilderWithOptions(cache.Options{
+					TransformByObject: map[client.Object]toolscache.TransformFunc{
+						&corev1.Namespace{}: nsTransform,
+					},
+				}),
+			})
+			if err != nil {
+				log.Error(err, "Unable to create manager from target kube config")
+				os.Exit(1)
+			}
 		}
 
 	log.Info(
@@ -323,12 +341,16 @@ func main() {
 
 	instanceName, _ := os.Hostname() // on an error, instanceName will be empty, which is ok
 
-	nsSelReconciler := common.NamespaceSelectorReconciler{
-		Client: nsSelMgr.GetClient(),
-	}
-	if err = nsSelReconciler.SetupWithManager(nsSelMgr); err != nil {
-		log.Error(err, "Unable to create controller", "controller", "NamespaceSelector")
-		os.Exit(1)
+	var nsSelReconciler common.NamespaceSelectorReconciler
+
+	if !beingUninstalled {
+		nsSelReconciler = common.NamespaceSelectorReconciler{
+			Client: nsSelMgr.GetClient(),
+		}
+		if err = nsSelReconciler.SetupWithManager(nsSelMgr); err != nil {
+			log.Error(err, "Unable to create controller", "controller", "NamespaceSelector")
+			os.Exit(1)
+		}
 	}
 
 	discoveryClient := discovery.NewDiscoveryClientForConfigOrDie(targetK8sConfig)
@@ -362,6 +384,7 @@ func main() {
 		TargetK8sConfig:    targetK8sConfig,
 		SelectorReconciler: &nsSelReconciler,
 		EnableMetrics:      opts.enableMetrics,
+		UninstallMode:      beingUninstalled,
 	}
 
 	OpReconciler := controllers.OperatorPolicyReconciler{
@@ -392,14 +415,13 @@ func main() {
 		os.Exit(1)
 	}
 
-	terminatingCtx := ctrl.SetupSignalHandler()
 	managerCtx, managerCancel := context.WithCancel(context.Background())
 
 	// PeriodicallyExecConfigPolicies is the go-routine that periodically checks the policies
 	log.Info("Periodically processing Configuration Policies", "frequency", opts.frequency)
 
 	go func() {
-		reconciler.PeriodicallyExecConfigPolicies(terminatingCtx, opts.frequency, mgr.Elected())
+		reconciler.PeriodicallyExecConfigPolicies(terminatingCtx, opts.frequency, mgr.Elected(), uninstallingCtxCancel)
 		managerCancel()
 	}()
@@ -467,7 +489,9 @@ func main() {
 	wg.Add(1)
 
 	go func() {
-		if err := nsSelMgr.Start(managerCtx); err != nil {
+		// Use the uninstallingCtx so that this shuts down when the controller is being uninstalled. This is
+		// important since the managed cluster's API server may become unavailable at this time when in hosted mode.
+		if err := nsSelMgr.Start(uninstallingCtx); err != nil {
 			log.Error(err, "Problem running manager")
 			managerCancel()
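Reviewer note: below is a minimal, self-contained sketch (not part of the patch) of the context-cancellation flow the changes above wire together: IsBeingUninstalled gates startup, PeriodicallyExecConfigPolicies later calls the CancelFunc it was handed, and that cancels uninstallingCtx so the namespace-selector manager shuts down. The helper names detectUninstall and runManager are hypothetical stand-ins for the real evaluation loop and nsSelMgr.Start.

package main

import (
	"context"
	"fmt"
	"time"
)

// detectUninstall stands in for the periodic evaluation loop: once it observes
// the uninstall condition, it calls the CancelFunc it was given, just as
// PeriodicallyExecConfigPolicies calls uninstallDetected() in the patch.
func detectUninstall(uninstallDetected context.CancelFunc) {
	time.Sleep(100 * time.Millisecond) // pretend a poll found the Deployment's deletionTimestamp

	uninstallDetected()
}

// runManager stands in for nsSelMgr.Start: it blocks until its context is done.
func runManager(ctx context.Context) {
	<-ctx.Done()
	fmt.Println("manager stopped:", ctx.Err())
}

func main() {
	// In main.go above, this is ctrl.SetupSignalHandler(), cancelled on SIGTERM.
	terminatingCtx := context.Background()

	// Deriving uninstallingCtx from terminatingCtx means anything started with
	// it stops on either a termination signal or an uninstall detection,
	// whichever comes first.
	uninstallingCtx, uninstallingCtxCancel := context.WithCancel(terminatingCtx)

	go detectUninstall(uninstallingCtxCancel)

	runManager(uninstallingCtx) // returns once uninstallingCtxCancel is called
}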