From aebb20892f2dc8d6b355b3dd2022a3cf5a9119d7 Mon Sep 17 00:00:00 2001 From: Justin Kulikauskas Date: Tue, 4 Jun 2024 14:41:34 -0400 Subject: [PATCH] Patch Subscription status when CSV is unreferenced In some situations when actions are happening quickly, OLM can lose the connection between the Subscription and the ClusterServiceVersion, which leads to a "constraints not satisfiable" condition on the Subscription. It can be hard to reproduce the exact situation we've seen rarely in our tests, but manually deleting and immediately recreating the subscription causes a similar situation. In this change, in those situations, the controller will intervene after 30 seconds by updating the Subscription status directly. The implementation allows for a 10 second window for the intervention, during which the controller may update the status multiple times, to address a case where OLM immediately overwrote the update. If the window is missed, the controller may schedule another time. This is intended to give time to OLM to potentially resolve the situation on its own. Refs: - https://issues.redhat.com/browse/ACM-11453 Signed-off-by: Justin Kulikauskas --- api/v1beta1/operatorpolicy_types.go | 26 ++++ api/v1beta1/zz_generated.deepcopy.go | 4 + controllers/operatorpolicy_controller.go | 134 +++++++++++++++--- ...luster-management.io_operatorpolicies.yaml | 8 ++ ...luster-management.io_operatorpolicies.yaml | 8 ++ test/e2e/case38_install_operator_test.go | 99 ++++++++++++- .../operator-policy-no-group-enforce.yaml | 1 - 7 files changed, 259 insertions(+), 21 deletions(-) diff --git a/api/v1beta1/operatorpolicy_types.go b/api/v1beta1/operatorpolicy_types.go index 9d16f6e9..d61bd074 100644 --- a/api/v1beta1/operatorpolicy_types.go +++ b/api/v1beta1/operatorpolicy_types.go @@ -5,6 +5,7 @@ package v1beta1 import ( "strings" + "time" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" @@ -163,6 +164,12 @@ type OperatorPolicyStatus struct { // The list of overlapping OperatorPolicies (as name.namespace) which all manage the same // subscription, including this policy. When no overlapping is detected, this list will be empty. OverlappingPolicies []string `json:"overlappingPolicies,omitempty"` + + // Timestamp for a possible intervention to help a Subscription stuck with a + // ConstraintsNotSatisfiable condition. Can be in the future, indicating the + // policy is waiting for OLM to resolve the situation. If in the recent past, + // the policy may update the status of the Subscription. + SubscriptionInterventionTime *metav1.Time `json:"subscriptionInterventionTime,omitempty"` } func (status OperatorPolicyStatus) RelatedObjsOfKind(kind string) map[int]policyv1.RelatedObject { @@ -190,6 +197,25 @@ func (status OperatorPolicyStatus) GetCondition(condType string) (int, metav1.Co return -1, metav1.Condition{} } +// Returns true if the SubscriptionInterventionTime is far enough in the past +// to be considered expired, and therefore should be removed from the status. +func (status OperatorPolicyStatus) SubscriptionInterventionExpired() bool { + if status.SubscriptionInterventionTime == nil { + return false + } + + return status.SubscriptionInterventionTime.Time.Before(time.Now().Add(-10 * time.Second)) +} + +// Returns true if the SubscriptionInterventionTime is in the future. +func (status OperatorPolicyStatus) SubscriptionInterventionWaiting() bool { + if status.SubscriptionInterventionTime == nil { + return false + } + + return status.SubscriptionInterventionTime.Time.After(time.Now()) +} + //+kubebuilder:object:root=true //+kubebuilder:subresource:status diff --git a/api/v1beta1/zz_generated.deepcopy.go b/api/v1beta1/zz_generated.deepcopy.go index b0faad9e..3c84a708 100644 --- a/api/v1beta1/zz_generated.deepcopy.go +++ b/api/v1beta1/zz_generated.deepcopy.go @@ -121,6 +121,10 @@ func (in *OperatorPolicyStatus) DeepCopyInto(out *OperatorPolicyStatus) { *out = make([]string, len(*in)) copy(*out, *in) } + if in.SubscriptionInterventionTime != nil { + in, out := &in.SubscriptionInterventionTime, &out.SubscriptionInterventionTime + *out = (*in).DeepCopy() + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new OperatorPolicyStatus. diff --git a/controllers/operatorpolicy_controller.go b/controllers/operatorpolicy_controller.go index dc61009d..5e72096c 100644 --- a/controllers/operatorpolicy_controller.go +++ b/controllers/operatorpolicy_controller.go @@ -14,6 +14,7 @@ import ( "slices" "strconv" "strings" + "time" operatorv1 "github.com/operator-framework/api/pkg/operators/v1" operatorv1alpha1 "github.com/operator-framework/api/pkg/operators/v1alpha1" @@ -46,6 +47,7 @@ import ( const ( OperatorControllerName string = "operator-policy-controller" CatalogSourceReady string = "READY" + olmGracePeriod = 30 * time.Second ) var ( @@ -94,7 +96,8 @@ var ( Version: "v1", Resource: "packagemanifests", } - ErrPackageManifest = errors.New("") + ErrPackageManifest = errors.New("") + unreferencedCSVRegex = regexp.MustCompile(`clusterserviceversion (\S*) exists and is not referenced`) ) // OperatorPolicyReconciler reconciles a OperatorPolicy object @@ -216,14 +219,14 @@ func (r *OperatorPolicyReconciler) Reconcile(ctx context.Context, req ctrl.Reque errs = append(errs, err) } + if err := r.Status().Update(ctx, policy); err != nil { + errs = append(errs, err) + } + if conditionChanged { // Add an event for the "final" state of the policy, otherwise this only has the // "early" events (and possibly has zero events). conditionsToEmit = append(conditionsToEmit, calculateComplianceCondition(policy)) - - if err := r.Status().Update(ctx, policy); err != nil { - errs = append(errs, err) - } } for _, cond := range conditionsToEmit { @@ -235,6 +238,14 @@ func (r *OperatorPolicyReconciler) Reconcile(ctx context.Context, req ctrl.Reque result := reconcile.Result{} finalErr := utilerrors.NewAggregate(errs) + if len(errs) == 0 { + // Schedule a requeue for the intervention. + // Note: this requeue will be superseded if the Subscription's status is flapping. + if policy.Status.SubscriptionInterventionWaiting() { + result.RequeueAfter = time.Until(policy.Status.SubscriptionInterventionTime.Add(time.Second)) + } + } + opLog.Info("Reconciling complete", "finalErr", finalErr, "conditionChanged", conditionChanged, "eventCount", len(conditionsToEmit)) @@ -1082,6 +1093,8 @@ func (r *OperatorPolicyReconciler) musthaveSubscription( ogCorrect bool, ) (*operatorv1alpha1.Subscription, []metav1.Condition, bool, error) { if foundSub == nil { + policy.Status.SubscriptionInterventionTime = nil + // Missing Subscription: report NonCompliance changed := updateStatus(policy, missingWantedCond("Subscription"), missingWantedObj(desiredSub)) @@ -1135,12 +1148,18 @@ func (r *OperatorPolicyReconciler) musthaveSubscription( subResFailed := mergedSub.Status.GetCondition(operatorv1alpha1.SubscriptionResolutionFailed) if subResFailed.Status == corev1.ConditionTrue { - return r.considerResolutionFailed(ctx, policy, foundSub, mergedSub) + return r.considerResolutionFailed(ctx, policy, mergedSub) + } + + if policy.Status.SubscriptionInterventionExpired() { + policy.Status.SubscriptionInterventionTime = nil } return mergedSub, nil, updateStatus(policy, matchesCond("Subscription"), matchedObj(foundSub)), nil } + policy.Status.SubscriptionInterventionTime = nil + // Specs don't match. if policy.Spec.RemediationAction.IsEnforce() && skipUpdate { changed := updateStatus(policy, mismatchCondUnfixable("Subscription"), mismatchedObj(foundSub)) @@ -1179,7 +1198,6 @@ func (r *OperatorPolicyReconciler) musthaveSubscription( func (r *OperatorPolicyReconciler) considerResolutionFailed( ctx context.Context, policy *policyv1beta1.OperatorPolicy, - foundSub *unstructured.Unstructured, mergedSub *operatorv1alpha1.Subscription, ) (*operatorv1alpha1.Subscription, []metav1.Condition, bool, error) { opLog := ctrl.LoggerFrom(ctx) @@ -1198,12 +1216,20 @@ func (r *OperatorPolicyReconciler) considerResolutionFailed( } if !includesSubscription { + if policy.Status.SubscriptionInterventionExpired() { + policy.Status.SubscriptionInterventionTime = nil + } + return mergedSub, nil, false, nil } // Handle non-ConstraintsNotSatisfiable reasons separately if !strings.EqualFold(subResFailed.Reason, "ConstraintsNotSatisfiable") { - changed := updateStatus(policy, subResFailedCond(subResFailed), nonCompObj(foundSub, subResFailed.Reason)) + changed := updateStatus(policy, subResFailedCond(subResFailed), nonCompObj(mergedSub, subResFailed.Reason)) + + if policy.Status.SubscriptionInterventionExpired() { + policy.Status.SubscriptionInterventionTime = nil + } return mergedSub, nil, changed, nil } @@ -1211,9 +1237,75 @@ func (r *OperatorPolicyReconciler) considerResolutionFailed( // A "constraints not satisfiable" message has nondeterministic clauses, and can be noisy with a list of versions. // Just report a generic condition, which will prevent the OperatorPolicy status from constantly updating // when the details in the Subscription status change. - changed := updateStatus(policy, subConstraintsNotSatisfiableCond, nonCompObj(foundSub, "ConstraintsNotSatisfiable")) + changed := updateStatus(policy, subConstraintsNotSatisfiableCond, + nonCompObj(mergedSub, "ConstraintsNotSatisfiable")) + + if policy.Status.SubscriptionInterventionExpired() || policy.Status.SubscriptionInterventionTime == nil { + interventionTime := metav1.Time{Time: time.Now().Add(olmGracePeriod)} + policy.Status.SubscriptionInterventionTime = &interventionTime + + opLog.V(1).Info("Detected ConstraintsNotSatisfiable, setting an intervention time", + "interventionTime", interventionTime, "subscription", mergedSub) + + return mergedSub, nil, changed, nil + } + + if policy.Status.SubscriptionInterventionWaiting() { + opLog.V(1).Info("Detected ConstraintsNotSatisfiable, giving OLM more time before possibly intervening", + "interventionTime", policy.Status.SubscriptionInterventionTime) + + return mergedSub, nil, changed, nil + } + + // Do the "intervention" - return mergedSub, nil, changed, nil + relatedCSVs := policy.Status.RelatedObjsOfKind("ClusterServiceVersion") + if len(relatedCSVs) != 1 { + opLog.V(1).Info("No related CSVs to use for an intervention") + + return mergedSub, nil, changed, nil + } + + csvIdx := 0 + for idx := range relatedCSVs { + csvIdx = idx + } + + csvName := relatedCSVs[csvIdx].Object.Metadata.Name + + matches := unreferencedCSVRegex.FindStringSubmatch(subResFailed.Message) + if len(matches) < 2 { + opLog.V(1).Info("Subscription condition does not match pattern for an unreferenced CSV", + "subscriptionConditionMessage", subResFailed.Message) + + return mergedSub, nil, changed, nil + } + + if csvName != matches[1] { + opLog.V(1).Info("The CSV found in the Subscription condition does not match the CSV found by label", + "subscriptionConditionMessage", subResFailed.Message, "csvName", csvName, "csvInSub", matches[1]) + + return mergedSub, nil, changed, nil + } + + if mergedSub.Status.LastUpdated.IsZero() { + mergedSub.Status.LastUpdated = metav1.Now() + } + + mergedSub.Status.CurrentCSV = csvName + + opLog.Info("Updating Subscription status to point to CSV", "csvName", csvName) + + if err := r.TargetClient.Status().Update(ctx, mergedSub); err != nil { + return mergedSub, nil, changed, + fmt.Errorf("error updating the Subscription status to point to the CSV: %w", err) + } + + mergedSub.SetGroupVersionKind(subscriptionGVK) // Update might strip this information + + updateStatus(policy, updatedCond("Subscription"), updatedObj(mergedSub)) + + return mergedSub, nil, true, nil } func (r *OperatorPolicyReconciler) mustnothaveSubscription( @@ -1222,6 +1314,8 @@ func (r *OperatorPolicyReconciler) mustnothaveSubscription( desiredSub *operatorv1alpha1.Subscription, foundUnstructSub *unstructured.Unstructured, ) (*operatorv1alpha1.Subscription, []metav1.Condition, bool, error) { + policy.Status.SubscriptionInterventionTime = nil + if foundUnstructSub == nil { // Missing Subscription: report Compliance changed := updateStatus(policy, missingNotWantedCond("Subscription"), missingNotWantedObj(desiredSub)) @@ -1522,8 +1616,11 @@ func (r *OperatorPolicyReconciler) handleCSV( var foundCSV *operatorv1alpha1.ClusterServiceVersion + relatedCSVs := make([]policyv1.RelatedObject, 0) + for _, csv := range csvList { - if csv.GetName() == sub.Status.InstalledCSV { + // If the subscription does not know about the CSV, this can report multiple CSVs as related + if sub.Status.InstalledCSV == "" || sub.Status.InstalledCSV == csv.GetName() { matchedCSV := operatorv1alpha1.ClusterServiceVersion{} err = runtime.DefaultUnstructuredConverter.FromUnstructured(csv.UnstructuredContent(), &matchedCSV) @@ -1531,9 +1628,11 @@ func (r *OperatorPolicyReconciler) handleCSV( return nil, nil, false, err } - foundCSV = &matchedCSV + relatedCSVs = append(relatedCSVs, existingCSVObj(&matchedCSV)) - break + if sub.Status.InstalledCSV == csv.GetName() { + foundCSV = &matchedCSV + } } } @@ -1545,13 +1644,14 @@ func (r *OperatorPolicyReconciler) handleCSV( // CSV has not yet been created by OLM if foundCSV == nil { - changed := updateStatus(policy, - missingWantedCond("ClusterServiceVersion"), missingCSVObj(sub.Name, sub.Namespace)) + if len(relatedCSVs) == 0 { + relatedCSVs = append(relatedCSVs, missingCSVObj(sub.Name, sub.Namespace)) + } - return foundCSV, nil, changed, nil + return foundCSV, nil, updateStatus(policy, missingWantedCond("ClusterServiceVersion"), relatedCSVs...), nil } - return foundCSV, nil, updateStatus(policy, buildCSVCond(foundCSV), existingCSVObj(foundCSV)), nil + return foundCSV, nil, updateStatus(policy, buildCSVCond(foundCSV), relatedCSVs...), nil } func (r *OperatorPolicyReconciler) mustnothaveCSV( diff --git a/deploy/crds/kustomize_operatorpolicy/policy.open-cluster-management.io_operatorpolicies.yaml b/deploy/crds/kustomize_operatorpolicy/policy.open-cluster-management.io_operatorpolicies.yaml index a3a4fda7..84ed9699 100644 --- a/deploy/crds/kustomize_operatorpolicy/policy.open-cluster-management.io_operatorpolicies.yaml +++ b/deploy/crds/kustomize_operatorpolicy/policy.open-cluster-management.io_operatorpolicies.yaml @@ -293,6 +293,14 @@ spec: resolvedSubscriptionLabel: description: The resolved name.namespace of the subscription type: string + subscriptionInterventionTime: + description: |- + Timestamp for a possible intervention to help a Subscription stuck with a + ConstraintsNotSatisfiable condition. Can be in the future, indicating the + policy is waiting for OLM to resolve the situation. If in the recent past, + the policy may update the status of the Subscription. + format: date-time + type: string type: object type: object served: true diff --git a/deploy/crds/policy.open-cluster-management.io_operatorpolicies.yaml b/deploy/crds/policy.open-cluster-management.io_operatorpolicies.yaml index c77382e7..92fc655b 100644 --- a/deploy/crds/policy.open-cluster-management.io_operatorpolicies.yaml +++ b/deploy/crds/policy.open-cluster-management.io_operatorpolicies.yaml @@ -286,6 +286,14 @@ spec: resolvedSubscriptionLabel: description: The resolved name.namespace of the subscription type: string + subscriptionInterventionTime: + description: |- + Timestamp for a possible intervention to help a Subscription stuck with a + ConstraintsNotSatisfiable condition. Can be in the future, indicating the + policy is waiting for OLM to resolve the situation. If in the recent past, + the policy may update the status of the Subscription. + format: date-time + type: string type: object type: object served: true diff --git a/test/e2e/case38_install_operator_test.go b/test/e2e/case38_install_operator_test.go index 9cd3a09f..db24f6a6 100644 --- a/test/e2e/case38_install_operator_test.go +++ b/test/e2e/case38_install_operator_test.go @@ -9,6 +9,7 @@ import ( "slices" "strconv" "strings" + "time" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" @@ -35,7 +36,15 @@ var _ = Describe("Testing OperatorPolicy", Ordered, Label("supports-hosted"), fu ) // checks that the compliance state eventually matches what is desired - checkCompliance := func(polName string, ns string, timeoutSeconds int, comp policyv1.ComplianceState) { + checkCompliance := func( + polName string, + ns string, + timeoutSeconds int, + comp policyv1.ComplianceState, + consistencyArgs ...interface{}, + ) { + GinkgoHelper() + var debugMessage string DeferCleanup(func() { @@ -44,7 +53,7 @@ var _ = Describe("Testing OperatorPolicy", Ordered, Label("supports-hosted"), fu } }) - EventuallyWithOffset(1, func(g Gomega) { + compCheck := func(g Gomega) { unstructPolicy, err := clientManagedDynamic.Resource(gvrOperatorPolicy).Namespace(ns). Get(context.TODO(), polName, metav1.GetOptions{}) g.Expect(err).NotTo(HaveOccurred()) @@ -61,7 +70,13 @@ var _ = Describe("Testing OperatorPolicy", Ordered, Label("supports-hosted"), fu g.Expect(err).NotTo(HaveOccurred()) g.Expect(policy.Status.ComplianceState).To(Equal(comp)) - }, timeoutSeconds, 3).Should(Succeed()) + } + + Eventually(compCheck, timeoutSeconds, 3).Should(Succeed()) + + if len(consistencyArgs) > 0 { + Consistently(compCheck, consistencyArgs...).Should(Succeed()) + } } // checks that the policy has the proper compliance, that the relatedObjects of a given @@ -3239,4 +3254,82 @@ var _ = Describe("Testing OperatorPolicy", Ordered, Label("supports-hosted"), fu ) }) }) + Describe("Testing recovery of sub-csv connection", Ordered, func() { + const ( + opPolYAML = "../resources/case38_operator_install/operator-policy-no-group-enforce.yaml" + opPolName = "oppol-no-group-enforce" + subName = "project-quay" + ) + + scenarioTriggered := true + + BeforeAll(func() { + preFunc() + + createObjWithParent(parentPolicyYAML, parentPolicyName, + opPolYAML, testNamespace, gvrPolicy, gvrOperatorPolicy) + }) + + BeforeEach(func() { + if !scenarioTriggered { + Skip("test scenario was unable to be triggered") + } + }) + + It("should get the 'csv exists and is not referenced' condition", func(ctx SpecContext) { + scenarioTriggered = false + + By("Verifying the policy starts compliant") + checkCompliance(opPolName, testNamespace, olmWaitTimeout*2, policyv1.Compliant) + + By("Periodically deleting the subscription and checking the status") + scenarioDeadline := time.Now().Add(40 * time.Second) + + scenarioTriggerLoop: + for scenarioDeadline.After(time.Now()) { + KubectlTarget("delete", "subscription", subName, "-n", opPolTestNS) + time.Sleep(time.Second) + + sub, err := targetK8sDynamic.Resource(gvrSubscription).Namespace(opPolTestNS). + Get(ctx, subName, metav1.GetOptions{}) + if err != nil || sub == nil { + continue + } + + subConds, _, _ := unstructured.NestedSlice(sub.Object, "status", "conditions") + for _, cond := range subConds { + condMap, ok := cond.(map[string]interface{}) + if !ok { + continue + } + + if condType, _, _ := unstructured.NestedString(condMap, "type"); condType != "ResolutionFailed" { + continue + } + + if condStatus, _, _ := unstructured.NestedString(condMap, "status"); condStatus != "True" { + continue + } + + condMessage, _, _ := unstructured.NestedString(condMap, "message") + notRefRgx := regexp.MustCompile(`clusterserviceversion (\S*) exists and is not referenced`) + if notRefRgx.MatchString(condMessage) { + scenarioTriggered = true + + break scenarioTriggerLoop + } + } + + time.Sleep(5 * time.Second) + } + }) + + It("Verifies the policy eventually fixes the 'not referenced' condition", func() { + By("Sleeping 25s, since OperatorPolicy should wait a while before intervening") + time.Sleep(25 * time.Second) + + By("Verifying the policy becomes compliant") + checkCompliance(opPolName, testNamespace, 2*olmWaitTimeout, policyv1.Compliant, 30, 3) + }) + }) }) diff --git a/test/resources/case38_operator_install/operator-policy-no-group-enforce.yaml b/test/resources/case38_operator_install/operator-policy-no-group-enforce.yaml index 604f4077..400da462 100644 --- a/test/resources/case38_operator_install/operator-policy-no-group-enforce.yaml +++ b/test/resources/case38_operator_install/operator-policy-no-group-enforce.yaml @@ -20,5 +20,4 @@ spec: namespace: operator-policy-testns source: operatorhubio-catalog sourceNamespace: olm - startingCSV: quay-operator.v3.10.0 upgradeApproval: Automatic