Stop the NS controller manager during hosted mode uninstalls
In hosted mode, there is no guarantee that the managed cluster's API
server is available while this controller is being uninstalled. During
that window, no controller-runtime manager can be connecting to the
managed cluster's API server. If one is, the controller can restart
without completing the required uninstall cleanup and then cannot
properly restart, since the manager fails to be instantiated against
the unreachable API server.

Relates:
https://issues.redhat.com/browse/ACM-8826

Signed-off-by: mprahl <mprahl@users.noreply.github.com>
mprahl committed Dec 1, 2023
1 parent 2d10d53 commit 5586260
Showing 2 changed files with 99 additions and 56 deletions.
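The change boils down to the wiring sketched below: detect uninstall mode at startup, let the periodic policy loop signal it later via a context cancel function, and only run the namespace-selector manager (the one that talks to the managed cluster's API server) when not uninstalling. This is a minimal sketch of that pattern, not code from this repository; the run, isBeingUninstalled, and startPolicyLoop names are hypothetical stand-ins.

package main

import (
	"context"

	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/manager"
)

// run wires uninstall detection into the two managers. mgr is the primary
// manager on the hosting cluster; nsSelMgr is the secondary manager pointed
// at the managed cluster's API server.
func run(
	mgr manager.Manager,
	nsSelMgr manager.Manager,
	isBeingUninstalled func() (bool, error),
	startPolicyLoop func(ctx context.Context, uninstallDetected context.CancelFunc),
) error {
	terminatingCtx := ctrl.SetupSignalHandler()

	// Cancelled either on shutdown or as soon as an uninstall is detected.
	uninstallingCtx, uninstallDetected := context.WithCancel(terminatingCtx)

	beingUninstalled, err := isBeingUninstalled()
	if err != nil {
		// If the check fails at startup, assume the controller is not being
		// uninstalled, mirroring the commit's behavior.
		beingUninstalled = false
	}

	if beingUninstalled {
		uninstallDetected()
	} else {
		// Only start the manager that talks to the managed cluster's API
		// server when not uninstalling, and bind it to uninstallingCtx so it
		// stops as soon as an uninstall is detected mid-run.
		go func() { _ = nsSelMgr.Start(uninstallingCtx) }()
	}

	// The periodic policy loop receives the cancel function so it can signal
	// an uninstall it detects while already running.
	go startPolicyLoop(terminatingCtx, uninstallDetected)

	return mgr.Start(terminatingCtx)
}

Binding only the secondary manager to the cancellable context matters because it is the one connected to the managed cluster's API server, which may already be offline during a hosted mode uninstall, while the rest of the controller keeps running to finish the uninstall cleanup.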
93 changes: 56 additions & 37 deletions controllers/configurationpolicy_controller.go
@@ -134,6 +134,9 @@ type ConfigurationPolicyReconciler struct {
openAPIParser *openapi.CachedOpenAPIParser
// A lock when performing actions that are not thread safe (i.e. reassigning object properties).
lock sync.RWMutex
// When true, the controller has detected it is being uninstalled and only basic cleanup should be performed before
// exiting.
UninstallMode bool
}

//+kubebuilder:rbac:groups=*,resources=*,verbs=*
@@ -166,7 +169,7 @@ func (r *ConfigurationPolicyReconciler) Reconcile(ctx context.Context, request c
// PeriodicallyExecConfigPolicies loops through all configurationpolicies in the target namespace and triggers
// template handling for each one. This function drives all the work the configuration policy controller does.
func (r *ConfigurationPolicyReconciler) PeriodicallyExecConfigPolicies(
ctx context.Context, freq uint, elected <-chan struct{},
ctx context.Context, freq uint, elected <-chan struct{}, uninstallDetected context.CancelFunc,
) {
log.Info("Waiting for leader election before periodically evaluating configuration policies")

@@ -212,11 +215,24 @@ func (r *ConfigurationPolicyReconciler) PeriodicallyExecConfigPolicies(
_ = r.refreshDiscoveryInfo()
}

cleanupImmediately, err := r.cleanupImmediately()
if err != nil {
log.Error(err, "Failed to determine if it's time to cleanup immediately")
cleanupImmediately := r.UninstallMode

if !r.UninstallMode {
uninstalling, crdDeleting, err := r.cleanupImmediately()
if !uninstalling && !crdDeleting && err != nil {
log.Error(err, "Failed to determine if it's time to cleanup immediately")

skipLoop = true
}

cleanupImmediately = crdDeleting

skipLoop = true
if uninstalling {
r.UninstallMode = uninstalling
cleanupImmediately = true

uninstallDetected()
}
}

if cleanupImmediately || !skipLoop {
@@ -667,30 +683,27 @@ func (r *ConfigurationPolicyReconciler) cleanUpChildObjects(plc policyv1.Configu
return deletionFailures
}

// cleanupImmediately returns true when the cluster is in a state where configurationpolicies should
// be removed as soon as possible, ignoring the pruneObjectBehavior of the policies. This is the
// case when the controller is being uninstalled or the CRD is being deleted.
func (r *ConfigurationPolicyReconciler) cleanupImmediately() (bool, error) {
beingUninstalled, beingUninstalledErr := r.isBeingUninstalled()
if beingUninstalledErr == nil && beingUninstalled {
return true, nil
}
// cleanupImmediately returns true (i.e. beingUninstalled or crdDeleting) when the cluster is in a state where
// configurationpolicies should be removed as soon as possible, ignoring the pruneObjectBehavior of the policies. This
// is the case when the controller is being uninstalled or the CRD is being deleted.
func (r *ConfigurationPolicyReconciler) cleanupImmediately() (beingUninstalled bool, crdDeleting bool, err error) {
var beingUninstalledErr error

defDeleting, defErr := r.definitionIsDeleting()
if defErr == nil && defDeleting {
return true, nil
}
beingUninstalled, beingUninstalledErr = IsBeingUninstalled(r.Client)

if beingUninstalledErr == nil && defErr == nil {
// if either was deleting, we would've already returned.
return false, nil
var defErr error

crdDeleting, defErr = r.definitionIsDeleting()

if beingUninstalledErr != nil && defErr != nil {
err = fmt.Errorf("%w; %w", beingUninstalledErr, defErr)
} else if beingUninstalledErr != nil {
err = beingUninstalledErr
} else if defErr != nil {
err = defErr
}

// At least one had an unexpected error, so the decision can't be made right now
//nolint:errorlint // we can't choose just one of the errors to "correctly" wrap
return false, fmt.Errorf(
"isBeingUninstalled error: '%v', definitionIsDeleting error: '%v'", beingUninstalledErr, defErr,
)
return
}

func (r *ConfigurationPolicyReconciler) definitionIsDeleting() (bool, error) {
@@ -761,18 +774,24 @@ func (r *ConfigurationPolicyReconciler) handleObjectTemplates(plc policyv1.Confi

// object handling for when configurationPolicy is deleted
if plc.Spec.PruneObjectBehavior == "DeleteIfCreated" || plc.Spec.PruneObjectBehavior == "DeleteAll" {
cleanupNow, err := r.cleanupImmediately()
if err != nil {
log.Error(err, "Error determining whether to cleanup immediately, requeueing policy")
var crdDeleting bool

return
if !r.UninstallMode {
var err error

crdDeleting, err = r.definitionIsDeleting()
if err != nil {
log.Error(err, "Error determining whether to cleanup immediately, requeueing policy")

return
}
}

if cleanupNow {
if r.UninstallMode || crdDeleting {
if objHasFinalizer(&plc, pruneObjectFinalizer) {
patch := removeObjFinalizerPatch(&plc, pruneObjectFinalizer)

err = r.Patch(context.TODO(), &plc, client.RawPatch(types.JSONPatchType, patch))
err := r.Patch(context.TODO(), &plc, client.RawPatch(types.JSONPatchType, patch))
if err != nil {
log.Error(err, "Error removing finalizer for configuration policy")

@@ -815,7 +834,7 @@ func (r *ConfigurationPolicyReconciler) handleObjectTemplates(plc policyv1.Confi

patch := removeObjFinalizerPatch(&plc, pruneObjectFinalizer)

err = r.Patch(context.TODO(), &plc, client.RawPatch(types.JSONPatchType, patch))
err := r.Patch(context.TODO(), &plc, client.RawPatch(types.JSONPatchType, patch))
if err != nil {
log.Error(err, "Error removing finalizer for configuration policy")

@@ -3043,7 +3062,7 @@ func convertPolicyStatusToString(plc *policyv1.ConfigurationPolicy) string {

// getDeployment gets the Deployment object associated with this controller. If the controller is running outside of
// a cluster, no Deployment object or error will be returned.
func (r *ConfigurationPolicyReconciler) getDeployment() (*appsv1.Deployment, error) {
func getDeployment(client client.Client) (*appsv1.Deployment, error) {
key, err := common.GetOperatorNamespacedName()
if err != nil {
// Running locally
@@ -3055,15 +3074,15 @@ func (r *ConfigurationPolicyReconciler) getDeployment() (*appsv1.Deployment, err
}

deployment := appsv1.Deployment{}
if err := r.Client.Get(context.TODO(), key, &deployment); err != nil {
if err := client.Get(context.TODO(), key, &deployment); err != nil {
return nil, err
}

return &deployment, nil
}

func (r *ConfigurationPolicyReconciler) isBeingUninstalled() (bool, error) {
deployment, err := r.getDeployment()
func IsBeingUninstalled(client client.Client) (bool, error) {
deployment, err := getDeployment(client)
if deployment == nil || err != nil {
return false, err
}
@@ -3074,7 +3093,7 @@ func (r *ConfigurationPolicyReconciler) isBeingUninstalled() (bool, error) {
// removeLegacyDeploymentFinalizer removes the policy.open-cluster-management.io/delete-related-objects on the
// Deployment object. This finalizer is no longer needed on the Deployment object, so it is removed.
func (r *ConfigurationPolicyReconciler) removeLegacyDeploymentFinalizer() error {
deployment, err := r.getDeployment()
deployment, err := getDeployment(r.Client)
if deployment == nil || err != nil {
return err
}
62 changes: 43 additions & 19 deletions main.go
@@ -278,6 +278,19 @@ func main() {
os.Exit(1)
}

terminatingCtx := ctrl.SetupSignalHandler()

uninstallingCtx, uninstallingCtxCancel := context.WithCancel(terminatingCtx)

beingUninstalled, err := controllers.IsBeingUninstalled(mgr.GetClient())
if err != nil {
log.Error(err, "Failed to determine if the controller is being uninstalled at startup. Will assume it's not.")
}

if beingUninstalled {
uninstallingCtxCancel()
}

var targetK8sClient kubernetes.Interface
var targetK8sDynamicClient dynamic.Interface
var targetK8sConfig *rest.Config
@@ -303,16 +316,21 @@ func main() {
targetK8sClient = kubernetes.NewForConfigOrDie(targetK8sConfig)
targetK8sDynamicClient = dynamic.NewForConfigOrDie(targetK8sConfig)

nsSelMgr, err = manager.New(targetK8sConfig, manager.Options{
NewCache: cache.BuilderWithOptions(cache.Options{
TransformByObject: map[client.Object]toolscache.TransformFunc{
&corev1.Namespace{}: nsTransform,
},
}),
})
if err != nil {
log.Error(err, "Unable to create manager from target kube config")
os.Exit(1)
// The managed cluster's API server is potentially not the same as the hosting cluster and it could be
// offline already as part of the uninstall process. In this case, the manager's instantiation will fail.
// This controller is not needed in uninstall mode, so just skip it.
if !beingUninstalled {
nsSelMgr, err = manager.New(targetK8sConfig, manager.Options{
NewCache: cache.BuilderWithOptions(cache.Options{
TransformByObject: map[client.Object]toolscache.TransformFunc{
&corev1.Namespace{}: nsTransform,
},
}),
})
if err != nil {
log.Error(err, "Unable to create manager from target kube config")
os.Exit(1)
}
}

log.Info(
@@ -323,12 +341,16 @@ func main() {

instanceName, _ := os.Hostname() // on an error, instanceName will be empty, which is ok

nsSelReconciler := common.NamespaceSelectorReconciler{
Client: nsSelMgr.GetClient(),
}
if err = nsSelReconciler.SetupWithManager(nsSelMgr); err != nil {
log.Error(err, "Unable to create controller", "controller", "NamespaceSelector")
os.Exit(1)
var nsSelReconciler common.NamespaceSelectorReconciler

if !beingUninstalled {
nsSelReconciler = common.NamespaceSelectorReconciler{
Client: nsSelMgr.GetClient(),
}
if err = nsSelReconciler.SetupWithManager(nsSelMgr); err != nil {
log.Error(err, "Unable to create controller", "controller", "NamespaceSelector")
os.Exit(1)
}
}

discoveryClient := discovery.NewDiscoveryClientForConfigOrDie(targetK8sConfig)
Expand Down Expand Up @@ -362,6 +384,7 @@ func main() {
TargetK8sConfig: targetK8sConfig,
SelectorReconciler: &nsSelReconciler,
EnableMetrics: opts.enableMetrics,
UninstallMode: beingUninstalled,
}

OpReconciler := controllers.OperatorPolicyReconciler{
@@ -392,14 +415,13 @@ func main() {
os.Exit(1)
}

terminatingCtx := ctrl.SetupSignalHandler()
managerCtx, managerCancel := context.WithCancel(context.Background())

// PeriodicallyExecConfigPolicies is the go-routine that periodically checks the policies
log.Info("Periodically processing Configuration Policies", "frequency", opts.frequency)

go func() {
reconciler.PeriodicallyExecConfigPolicies(terminatingCtx, opts.frequency, mgr.Elected())
reconciler.PeriodicallyExecConfigPolicies(terminatingCtx, opts.frequency, mgr.Elected(), uninstallingCtxCancel)
managerCancel()
}()

@@ -467,7 +489,9 @@ func main() {
wg.Add(1)

go func() {
if err := nsSelMgr.Start(managerCtx); err != nil {
// Use the uninstallingCtx so that this shuts down when the controller is being uninstalled. This is
// important since the managed cluster's API server may become unavailable at this time when in hosted mode.
if err := nsSelMgr.Start(uninstallingCtx); err != nil {
log.Error(err, "Problem running manager")

managerCancel()