Trigger uninstalls through a new annotation
Previously, a finalizer was added to the Deployment so that if the
Deployment was deleted, the controller would handle immediate cleanup.
This doesn't handle the common case where the config-policy-controller
ManagedClusterAddOn is deleted, which causes the ManifestWork to be
deleted, which in turn deletes all Configuration Policy controller
deployment artifacts, including the service account.

The new approach sets the annotation
policy.open-cluster-management.io/uninstalling=true on the Deployment to
indicate that the Configuration Policy controller should remove all
finalizers because it is being uninstalled.
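
For illustration, the annotated Deployment on the managed cluster would look roughly like this (a minimal sketch; the Deployment name and namespace shown are the defaults used by the `trigger-uninstall` subcommand introduced below):

```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: config-policy-controller
  namespace: open-cluster-management-agent-addon
  annotations:
    policy.open-cluster-management.io/uninstalling: "true"
```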

The Policy Addon controller will be updated so that when the
config-policy-controller ManagedClusterAddOn object is deleted, a
finalizer prevents its immediate deletion and a Pod runs on the managed
cluster with the new `trigger-uninstall` subcommand. The subcommand sets
the uninstalling annotation on the Deployment and then waits until all
ConfigurationPolicy finalizers have been removed. Once the command
completes, the Pod exits, the finalizer on the ManagedClusterAddOn object
is removed, and the uninstall proceeds.
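
For reference, the uninstall Pod would invoke the new subcommand roughly like this (a sketch: the flag values are the defaults from the trigger-uninstall flag set in main.go, and the policy namespace placeholder is an assumption since it has no default):

```bash
config-policy-controller trigger-uninstall \
  --deployment-name=config-policy-controller \
  --deployment-namespace=open-cluster-management-agent-addon \
  --policy-namespace=<managed-cluster-namespace> \
  --timeout-seconds=300
```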

Relates:
https://issues.redhat.com/browse/ACM-3233
https://issues.redhat.com/browse/ACM-2923

Signed-off-by: mprahl <mprahl@users.noreply.github.com>
mprahl committed Feb 8, 2023
1 parent ff83c38 commit d768d51
Showing 14 changed files with 398 additions and 99 deletions.
5 changes: 5 additions & 0 deletions .github/workflows/kind.yml
@@ -98,6 +98,11 @@ jobs:
make build-images
KUBECONFIG=${PWD}/kubeconfig_managed_e2e make kind-deploy-controller-dev
- name: E2E tests that require the controller running in a cluster
run: |
export GOPATH=$(go env GOPATH)
make e2e-test-running-in-cluster
- name: Debug
if: ${{ failure() }}
run: |
8 changes: 6 additions & 2 deletions Makefile
@@ -334,15 +334,19 @@ e2e-test: e2e-dependencies
$(GINKGO) -v --fail-fast --slow-spec-threshold=10s $(E2E_TEST_ARGS) test/e2e

.PHONY: e2e-test-coverage
e2e-test-coverage: E2E_TEST_ARGS = --json-report=report_e2e.json --label-filter='!hosted-mode' --output-dir=.
e2e-test-coverage: E2E_TEST_ARGS = --json-report=report_e2e.json --label-filter='!hosted-mode && !running-in-cluster' --output-dir=.
e2e-test-coverage: e2e-run-instrumented e2e-test e2e-stop-instrumented

.PHONY: e2e-test-hosted-mode-coverage
e2e-test-hosted-mode-coverage: E2E_TEST_ARGS = --json-report=report_e2e_hosted_mode.json --label-filter="hosted-mode" --output-dir=.
e2e-test-hosted-mode-coverage: E2E_TEST_ARGS = --json-report=report_e2e_hosted_mode.json --label-filter="hosted-mode && !running-in-cluster" --output-dir=.
e2e-test-hosted-mode-coverage: COVERAGE_E2E_OUT = coverage_e2e_hosted_mode.out
e2e-test-hosted-mode-coverage: export TARGET_KUBECONFIG_PATH = $(PWD)/kubeconfig_managed2
e2e-test-hosted-mode-coverage: e2e-run-instrumented e2e-test e2e-stop-instrumented

.PHONY: e2e-test-running-in-cluster
e2e-test-running-in-cluster: E2E_TEST_ARGS = --label-filter="running-in-cluster"
e2e-test-running-in-cluster: e2e-test

.PHONY: e2e-build-instrumented
e2e-build-instrumented:
go test -covermode=atomic -coverpkg=$(shell cat go.mod | head -1 | cut -d ' ' -f 2)/... -c -tags e2e ./ -o build/_output/bin/$(IMG)-instrumented
2 changes: 1 addition & 1 deletion build/Dockerfile
@@ -24,7 +24,7 @@ COPY --from=builder ${REPO_PATH}/build/_output/bin/${COMPONENT} ${OPERATOR}
COPY --from=builder ${REPO_PATH}/build/bin /usr/local/bin
RUN /usr/local/bin/user_setup

ENTRYPOINT ["/usr/local/bin/entrypoint"]
ENTRYPOINT ["/usr/local/bin/entrypoint", "controller"]

RUN microdnf update && \
microdnf clean all
104 changes: 25 additions & 79 deletions controllers/configurationpolicy_controller.go
@@ -163,20 +163,11 @@ func (r *ConfigurationPolicyReconciler) PeriodicallyExecConfigPolicies(
}

const waiting = 10 * time.Minute
var exiting bool
// Loop twice after exit condition is received to account for race conditions and retries.
loopsAfterExit := 2

for !exiting || (exiting && loopsAfterExit > 0) {
start := time.Now()

select {
case <-ctx.Done():
exiting = true
loopsAfterExit--
default:
}
exiting := false

for !exiting {
start := time.Now()
policiesList := policyv1.ConfigurationPolicyList{}

var skipLoop bool
@@ -206,28 +197,6 @@ func (r *ConfigurationPolicyReconciler) PeriodicallyExecConfigPolicies(
skipLoop = true
}

needDeploymentFinalizer := false

for i := range policiesList.Items {
plc := policiesList.Items[i]

if objHasFinalizer(&plc, pruneObjectFinalizer) {
needDeploymentFinalizer = true

break
}
}

if err := r.manageDeploymentFinalizer(needDeploymentFinalizer); err != nil {
if errors.Is(err, common.ErrNoNamespace) || errors.Is(err, common.ErrRunLocal) {
log.Info("Not managing the controller's deployment finalizer because it is running locally")
} else {
log.Error(err, "Failed to manage the controller's deployment finalizer, skipping loop")

skipLoop = true
}
}

cleanupImmediately, err := r.cleanupImmediately()
if err != nil {
log.Error(err, "Failed to determine if it's time to cleanup immediately")
@@ -281,6 +250,12 @@ func (r *ConfigurationPolicyReconciler) PeriodicallyExecConfigPolicies(
log.V(2).Info("Sleeping before reprocessing the configuration policies", "seconds", sleepTime)
time.Sleep(sleepTime)
}

select {
case <-ctx.Done():
exiting = true
default:
}
}
}

@@ -618,10 +593,10 @@ func (r *ConfigurationPolicyReconciler) cleanUpChildObjects(plc policyv1.Configu

// cleanupImmediately returns true when the cluster is in a state where configurationpolicies should
// be removed as soon as possible, ignoring the pruneObjectBehavior of the policies. This is the
// case when the CRD or the controller's deployment are already being deleted.
// case when the controller is being uninstalled or the CRD is being deleted.
func (r *ConfigurationPolicyReconciler) cleanupImmediately() (bool, error) {
deployDeleting, deployErr := r.deploymentIsDeleting()
if deployErr == nil && deployDeleting {
beingUninstalled, beingUninstalledErr := r.isBeingUninstalled()
if beingUninstalledErr == nil && beingUninstalled {
return true, nil
}

@@ -630,36 +605,16 @@ func (r *ConfigurationPolicyReconciler) cleanupImmediately() (bool, error) {
return true, nil
}

if deployErr == nil && defErr == nil {
if beingUninstalledErr == nil && defErr == nil {
// if either was deleting, we would've already returned.
return false, nil
}

// At least one had an unexpected error, so the decision can't be made right now
//nolint:errorlint // we can't choose just one of the errors to "correctly" wrap
return false, fmt.Errorf("deploymentIsDeleting error: '%v', definitionIsDeleting error: '%v'",
deployErr, defErr)
}

func (r *ConfigurationPolicyReconciler) deploymentIsDeleting() (bool, error) {
key, keyErr := common.GetOperatorNamespacedName()
if keyErr != nil {
if errors.Is(keyErr, common.ErrNoNamespace) || errors.Is(keyErr, common.ErrRunLocal) {
// running locally
return false, nil
}

return false, keyErr
}

deployment := appsv1.Deployment{}

err := r.Get(context.TODO(), key, &deployment)
if err != nil {
return false, err
}

return deployment.DeletionTimestamp != nil, nil
return false, fmt.Errorf(
"isBeingUninstalled error: '%v', definitionIsDeleting error: '%v'", beingUninstalledErr, defErr,
)
}

func (r *ConfigurationPolicyReconciler) definitionIsDeleting() (bool, error) {
@@ -2808,32 +2763,23 @@ func convertPolicyStatusToString(plc *policyv1.ConfigurationPolicy) (results str
return result
}

func (r *ConfigurationPolicyReconciler) manageDeploymentFinalizer(shouldBeSet bool) error {
func (r *ConfigurationPolicyReconciler) isBeingUninstalled() (bool, error) {
key, err := common.GetOperatorNamespacedName()
if err != nil {
return err
// Running locally
if errors.Is(err, common.ErrNoNamespace) || errors.Is(err, common.ErrRunLocal) {
return false, nil
}

return false, err
}

deployment := appsv1.Deployment{}
if err := r.Client.Get(context.TODO(), key, &deployment); err != nil {
return err
}

if objHasFinalizer(&deployment, pruneObjectFinalizer) {
if shouldBeSet {
return nil
}

deployment.SetFinalizers(removeObjFinalizer(&deployment, pruneObjectFinalizer))
} else {
if !shouldBeSet {
return nil
}

deployment.SetFinalizers(addObjFinalizer(&deployment, pruneObjectFinalizer))
return false, err
}

return r.Update(context.TODO(), &deployment)
return deployment.Annotations[common.UninstallingAnnotation] == "true", nil
}

func recoverFlow() {
1 change: 0 additions & 1 deletion controllers/configurationpolicy_utils.go
@@ -549,7 +549,6 @@ func addObjFinalizer(obj metav1.Object, finalizer string) []string {
return append(obj.GetFinalizers(), finalizer)
}

// nolint: unparam
func removeObjFinalizer(obj metav1.Object, finalizer string) []string {
result := []string{}

1 change: 1 addition & 0 deletions deploy/manager/manager.yaml
@@ -20,6 +20,7 @@ spec:
command:
- config-policy-controller
args:
- "controller"
- "--enable-lease=true"
- "--log-level=2"
- "--v=0"
1 change: 1 addition & 0 deletions deploy/operator.yaml
@@ -45,6 +45,7 @@ spec:
spec:
containers:
- args:
- controller
- --enable-lease=true
- --log-level=2
- --v=0
104 changes: 89 additions & 15 deletions main.go
@@ -11,6 +11,7 @@ import (
"os"
"runtime"
"strings"
"time"

"github.com/go-logr/zapr"
"github.com/spf13/pflag"
@@ -43,6 +44,7 @@ import (
policyv1 "open-cluster-management.io/config-policy-controller/api/v1"
"open-cluster-management.io/config-policy-controller/controllers"
"open-cluster-management.io/config-policy-controller/pkg/common"
"open-cluster-management.io/config-policy-controller/pkg/triggeruninstall"
"open-cluster-management.io/config-policy-controller/version"
)

@@ -74,51 +76,52 @@ func main() {
}

zflags.Bind(flag.CommandLine)
pflag.CommandLine.AddGoFlagSet(flag.CommandLine)

controllerFlagSet := pflag.NewFlagSet("controller", pflag.ExitOnError)

var clusterName, hubConfigPath, targetKubeConfig, metricsAddr, probeAddr string
var frequency uint
var decryptionConcurrency, evaluationConcurrency uint8
var enableLease, enableLeaderElection, legacyLeaderElection, enableMetrics bool

pflag.UintVar(&frequency, "update-frequency", 10,
controllerFlagSet.UintVar(&frequency, "update-frequency", 10,
"The status update frequency (in seconds) of a mutation policy")
pflag.BoolVar(&enableLease, "enable-lease", false,
controllerFlagSet.BoolVar(&enableLease, "enable-lease", false,
"If enabled, the controller will start the lease controller to report its status")
pflag.StringVar(&clusterName, "cluster-name", "acm-managed-cluster", "Name of the cluster")
pflag.StringVar(&hubConfigPath, "hub-kubeconfig-path", "/var/run/klusterlet/kubeconfig",
controllerFlagSet.StringVar(&clusterName, "cluster-name", "acm-managed-cluster", "Name of the cluster")
controllerFlagSet.StringVar(&hubConfigPath, "hub-kubeconfig-path", "/var/run/klusterlet/kubeconfig",
"Path to the hub kubeconfig")
pflag.StringVar(
controllerFlagSet.StringVar(
&targetKubeConfig,
"target-kubeconfig-path",
"",
"A path to an alternative kubeconfig for policy evaluation and enforcement.",
)
pflag.StringVar(
controllerFlagSet.StringVar(
&metricsAddr, "metrics-bind-address", "localhost:8383", "The address the metrics endpoint binds to.",
)
pflag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.")
pflag.BoolVar(&enableLeaderElection, "leader-elect", true,
controllerFlagSet.StringVar(
&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.",
)
controllerFlagSet.BoolVar(&enableLeaderElection, "leader-elect", true,
"Enable leader election for controller manager. "+
"Enabling this will ensure there is only one active controller manager.")
pflag.BoolVar(&legacyLeaderElection, "legacy-leader-elect", false,
controllerFlagSet.BoolVar(&legacyLeaderElection, "legacy-leader-elect", false,
"Use a legacy leader election method for controller manager instead of the lease API.")
pflag.Uint8Var(
controllerFlagSet.Uint8Var(
&decryptionConcurrency,
"decryption-concurrency",
5,
"The max number of concurrent policy template decryptions",
)
pflag.Uint8Var(
controllerFlagSet.Uint8Var(
&evaluationConcurrency,
"evaluation-concurrency",
// Set a low default to not add too much load to the Kubernetes API server in resource constrained deployments.
2,
"The max number of concurrent configuration policy evaluations",
)
pflag.BoolVar(&enableMetrics, "enable-metrics", true, "Disable custom metrics collection")

pflag.Parse()
controllerFlagSet.BoolVar(&enableMetrics, "enable-metrics", true, "Disable custom metrics collection")

ctrlZap, err := zflags.BuildForCtrl()
if err != nil {
@@ -142,6 +145,24 @@
klog.SetLogger(zapr.NewLogger(klogZap).WithName("klog"))
}

subcommand := ""
if len(os.Args) >= 2 {
subcommand = os.Args[1]
}

switch subcommand {
case "controller":
controllerFlagSet.AddGoFlagSet(flag.CommandLine)
_ = controllerFlagSet.Parse(os.Args[2:])
case "trigger-uninstall":
handleTriggerUninstall()

return
default:
fmt.Fprintln(os.Stderr, "expected 'controller' or 'trigger-uninstall' subcommands")
os.Exit(1)
}

if evaluationConcurrency < 1 {
panic("The --evaluation-concurrency option cannot be less than 1")
}
@@ -372,3 +393,56 @@ func main() {
os.Exit(1)
}
}

func handleTriggerUninstall() {
triggerUninstallFlagSet := pflag.NewFlagSet("trigger-uninstall", pflag.ExitOnError)

var deploymentName, deploymentNamespace, policyNamespace string
var timeoutSeconds uint

triggerUninstallFlagSet.StringVar(
&deploymentName, "deployment-name", "config-policy-controller", "The name of the controller Deployment object",
)
triggerUninstallFlagSet.StringVar(
&deploymentNamespace,
"deployment-namespace",
"open-cluster-management-agent-addon",
"The namespace of the controller Deployment object",
)
triggerUninstallFlagSet.StringVar(
&policyNamespace, "policy-namespace", "", "The namespace of where ConfigurationPolicy objects are stored",
)
triggerUninstallFlagSet.UintVar(
&timeoutSeconds, "timeout-seconds", 300, "The number of seconds before the operation is canceled",
)
triggerUninstallFlagSet.AddGoFlagSet(flag.CommandLine)

_ = triggerUninstallFlagSet.Parse(os.Args[2:])

if deploymentName == "" || deploymentNamespace == "" || policyNamespace == "" {
fmt.Fprintln(os.Stderr, "--deployment-name, --deployment-namespace, --policy-namespace must all have values")
os.Exit(1)
}

if timeoutSeconds < 30 {
fmt.Fprintln(os.Stderr, "--timeout-seconds must be set to at least 30 seconds")
os.Exit(1)
}

terminatingCtx := ctrl.SetupSignalHandler()
ctx, cancelCtx := context.WithDeadline(terminatingCtx, time.Now().Add(time.Duration(timeoutSeconds)*time.Second))
defer cancelCtx()

// Get a config to talk to the apiserver
cfg, err := config.GetConfig()
if err != nil {
log.Error(err, "Failed to get config")
os.Exit(1)
}

err = triggeruninstall.TriggerUninstall(ctx, cfg, deploymentName, deploymentNamespace, policyNamespace)
if err != nil {
klog.Errorf("Failed to trigger the uninstall due to the error: %s", err)
os.Exit(1)
}
}
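
The new pkg/triggeruninstall package imported above is among the changed files not rendered on this page. Based on the commit description, a minimal sketch of what its TriggerUninstall function plausibly does follows; the client construction, poll interval, and local annotation constant (mirroring common.UninstallingAnnotation) are assumptions, not the committed implementation.

```go
// Package triggeruninstall: hypothetical sketch based on the commit description.
package triggeruninstall

import (
	"context"
	"fmt"
	"time"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/runtime/schema"
	"k8s.io/client-go/dynamic"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/rest"
)

// Assumed to match common.UninstallingAnnotation from this commit.
const uninstallingAnnotation = "policy.open-cluster-management.io/uninstalling"

// TriggerUninstall annotates the controller Deployment so the running controller
// removes its ConfigurationPolicy finalizers, then waits until none remain.
func TriggerUninstall(
	ctx context.Context, cfg *rest.Config, deploymentName, deploymentNamespace, policyNamespace string,
) error {
	client, err := kubernetes.NewForConfig(cfg)
	if err != nil {
		return err
	}

	dynClient, err := dynamic.NewForConfig(cfg)
	if err != nil {
		return err
	}

	// Set the uninstalling annotation on the controller Deployment.
	deployment, err := client.AppsV1().Deployments(deploymentNamespace).Get(ctx, deploymentName, metav1.GetOptions{})
	if err != nil {
		return err
	}

	annotations := deployment.GetAnnotations()
	if annotations == nil {
		annotations = map[string]string{}
	}

	annotations[uninstallingAnnotation] = "true"
	deployment.SetAnnotations(annotations)

	if _, err := client.AppsV1().Deployments(deploymentNamespace).Update(ctx, deployment, metav1.UpdateOptions{}); err != nil {
		return err
	}

	policyGVR := schema.GroupVersionResource{
		Group: "policy.open-cluster-management.io", Version: "v1", Resource: "configurationpolicies",
	}

	// Poll until no ConfigurationPolicy in the namespace still has a finalizer.
	for {
		policies, err := dynClient.Resource(policyGVR).Namespace(policyNamespace).List(ctx, metav1.ListOptions{})
		if err != nil {
			return err
		}

		remaining := false

		for _, policy := range policies.Items {
			if len(policy.GetFinalizers()) != 0 {
				remaining = true

				break
			}
		}

		if !remaining {
			return nil
		}

		select {
		case <-ctx.Done():
			return fmt.Errorf("timed out waiting for the ConfigurationPolicy finalizers to be removed")
		case <-time.After(5 * time.Second):
		}
	}
}
```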