diff --git a/api/v1beta1/operatorpolicy_types.go b/api/v1beta1/operatorpolicy_types.go index 9d16f6e9..d61bd074 100644 --- a/api/v1beta1/operatorpolicy_types.go +++ b/api/v1beta1/operatorpolicy_types.go @@ -5,6 +5,7 @@ package v1beta1 import ( "strings" + "time" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" @@ -163,6 +164,12 @@ type OperatorPolicyStatus struct { // The list of overlapping OperatorPolicies (as name.namespace) which all manage the same // subscription, including this policy. When no overlapping is detected, this list will be empty. OverlappingPolicies []string `json:"overlappingPolicies,omitempty"` + + // Timestamp for a possible intervention to help a Subscription stuck with a + // ConstraintsNotSatisfiable condition. Can be in the future, indicating the + // policy is waiting for OLM to resolve the situation. If in the recent past, + // the policy may update the status of the Subscription. + SubscriptionInterventionTime *metav1.Time `json:"subscriptionInterventionTime,omitempty"` } func (status OperatorPolicyStatus) RelatedObjsOfKind(kind string) map[int]policyv1.RelatedObject { @@ -190,6 +197,25 @@ func (status OperatorPolicyStatus) GetCondition(condType string) (int, metav1.Co return -1, metav1.Condition{} } +// Returns true if the SubscriptionInterventionTime is far enough in the past +// to be considered expired, and therefore should be removed from the status. +func (status OperatorPolicyStatus) SubscriptionInterventionExpired() bool { + if status.SubscriptionInterventionTime == nil { + return false + } + + return status.SubscriptionInterventionTime.Time.Before(time.Now().Add(-10 * time.Second)) +} + +// Returns true if the SubscriptionInterventionTime is in the future. +func (status OperatorPolicyStatus) SubscriptionInterventionWaiting() bool { + if status.SubscriptionInterventionTime == nil { + return false + } + + return status.SubscriptionInterventionTime.Time.After(time.Now()) +} + //+kubebuilder:object:root=true //+kubebuilder:subresource:status diff --git a/api/v1beta1/zz_generated.deepcopy.go b/api/v1beta1/zz_generated.deepcopy.go index b0faad9e..3c84a708 100644 --- a/api/v1beta1/zz_generated.deepcopy.go +++ b/api/v1beta1/zz_generated.deepcopy.go @@ -121,6 +121,10 @@ func (in *OperatorPolicyStatus) DeepCopyInto(out *OperatorPolicyStatus) { *out = make([]string, len(*in)) copy(*out, *in) } + if in.SubscriptionInterventionTime != nil { + in, out := &in.SubscriptionInterventionTime, &out.SubscriptionInterventionTime + *out = (*in).DeepCopy() + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new OperatorPolicyStatus. diff --git a/controllers/operatorpolicy_controller.go b/controllers/operatorpolicy_controller.go index dc61009d..dbda4e10 100644 --- a/controllers/operatorpolicy_controller.go +++ b/controllers/operatorpolicy_controller.go @@ -14,6 +14,7 @@ import ( "slices" "strconv" "strings" + "time" operatorv1 "github.com/operator-framework/api/pkg/operators/v1" operatorv1alpha1 "github.com/operator-framework/api/pkg/operators/v1alpha1" @@ -46,6 +47,7 @@ import ( const ( OperatorControllerName string = "operator-policy-controller" CatalogSourceReady string = "READY" + olmGracePeriod = 30 * time.Second ) var ( @@ -94,7 +96,8 @@ var ( Version: "v1", Resource: "packagemanifests", } - ErrPackageManifest = errors.New("") + ErrPackageManifest = errors.New("") + unreferencedCSVRegex = regexp.MustCompile(`clusterserviceversion (\S*) exists and is not referenced`) ) // OperatorPolicyReconciler reconciles a OperatorPolicy object @@ -216,14 +219,14 @@ func (r *OperatorPolicyReconciler) Reconcile(ctx context.Context, req ctrl.Reque errs = append(errs, err) } + if err := r.Status().Update(ctx, policy); err != nil { + errs = append(errs, err) + } + if conditionChanged { // Add an event for the "final" state of the policy, otherwise this only has the // "early" events (and possibly has zero events). conditionsToEmit = append(conditionsToEmit, calculateComplianceCondition(policy)) - - if err := r.Status().Update(ctx, policy); err != nil { - errs = append(errs, err) - } } for _, cond := range conditionsToEmit { @@ -235,6 +238,14 @@ func (r *OperatorPolicyReconciler) Reconcile(ctx context.Context, req ctrl.Reque result := reconcile.Result{} finalErr := utilerrors.NewAggregate(errs) + if len(errs) == 0 { + // Schedule a requeue for the intervention. + // Note: this requeue will be superseded if the Subscription's status is flapping. + if policy.Status.SubscriptionInterventionWaiting() { + result.RequeueAfter = time.Until(policy.Status.SubscriptionInterventionTime.Add(time.Second)) + } + } + opLog.Info("Reconciling complete", "finalErr", finalErr, "conditionChanged", conditionChanged, "eventCount", len(conditionsToEmit)) @@ -1082,6 +1093,8 @@ func (r *OperatorPolicyReconciler) musthaveSubscription( ogCorrect bool, ) (*operatorv1alpha1.Subscription, []metav1.Condition, bool, error) { if foundSub == nil { + policy.Status.SubscriptionInterventionTime = nil + // Missing Subscription: report NonCompliance changed := updateStatus(policy, missingWantedCond("Subscription"), missingWantedObj(desiredSub)) @@ -1135,12 +1148,18 @@ func (r *OperatorPolicyReconciler) musthaveSubscription( subResFailed := mergedSub.Status.GetCondition(operatorv1alpha1.SubscriptionResolutionFailed) if subResFailed.Status == corev1.ConditionTrue { - return r.considerResolutionFailed(ctx, policy, foundSub, mergedSub) + return r.considerResolutionFailed(ctx, policy, mergedSub) + } + + if policy.Status.SubscriptionInterventionExpired() { + policy.Status.SubscriptionInterventionTime = nil } return mergedSub, nil, updateStatus(policy, matchesCond("Subscription"), matchedObj(foundSub)), nil } + policy.Status.SubscriptionInterventionTime = nil + // Specs don't match. if policy.Spec.RemediationAction.IsEnforce() && skipUpdate { changed := updateStatus(policy, mismatchCondUnfixable("Subscription"), mismatchedObj(foundSub)) @@ -1179,7 +1198,6 @@ func (r *OperatorPolicyReconciler) musthaveSubscription( func (r *OperatorPolicyReconciler) considerResolutionFailed( ctx context.Context, policy *policyv1beta1.OperatorPolicy, - foundSub *unstructured.Unstructured, mergedSub *operatorv1alpha1.Subscription, ) (*operatorv1alpha1.Subscription, []metav1.Condition, bool, error) { opLog := ctrl.LoggerFrom(ctx) @@ -1198,12 +1216,20 @@ func (r *OperatorPolicyReconciler) considerResolutionFailed( } if !includesSubscription { + if policy.Status.SubscriptionInterventionExpired() { + policy.Status.SubscriptionInterventionTime = nil + } + return mergedSub, nil, false, nil } // Handle non-ConstraintsNotSatisfiable reasons separately if !strings.EqualFold(subResFailed.Reason, "ConstraintsNotSatisfiable") { - changed := updateStatus(policy, subResFailedCond(subResFailed), nonCompObj(foundSub, subResFailed.Reason)) + changed := updateStatus(policy, subResFailedCond(subResFailed), nonCompObj(mergedSub, subResFailed.Reason)) + + if policy.Status.SubscriptionInterventionExpired() { + policy.Status.SubscriptionInterventionTime = nil + } return mergedSub, nil, changed, nil } @@ -1211,9 +1237,76 @@ func (r *OperatorPolicyReconciler) considerResolutionFailed( // A "constraints not satisfiable" message has nondeterministic clauses, and can be noisy with a list of versions. // Just report a generic condition, which will prevent the OperatorPolicy status from constantly updating // when the details in the Subscription status change. - changed := updateStatus(policy, subConstraintsNotSatisfiableCond, nonCompObj(foundSub, "ConstraintsNotSatisfiable")) + changed := updateStatus(policy, subConstraintsNotSatisfiableCond, + nonCompObj(mergedSub, "ConstraintsNotSatisfiable")) + + if policy.Status.SubscriptionInterventionExpired() || policy.Status.SubscriptionInterventionTime == nil { + interventionTime := metav1.Time{Time: time.Now().Add(olmGracePeriod)} + policy.Status.SubscriptionInterventionTime = &interventionTime + + opLog.V(1).Info("Detected ConstraintsNotSatisfiable, setting an intervention time", + "interventionTime", interventionTime, "subscription", mergedSub) + + return mergedSub, nil, changed, nil + } + + if policy.Status.SubscriptionInterventionWaiting() { + opLog.V(1).Info("Detected ConstraintsNotSatisfiable, giving OLM more time before possibly intervening", + "interventionTime", policy.Status.SubscriptionInterventionTime) + + return mergedSub, nil, changed, nil + } + + // Do the "intervention" - return mergedSub, nil, changed, nil + relatedCSVs := policy.Status.RelatedObjsOfKind("ClusterServiceVersion") + if len(relatedCSVs) != 1 { + opLog.V(1).Info("No related CSVs to use for an intervention") + + return mergedSub, nil, changed, nil + } + + csvIdx := 0 + for idx := range relatedCSVs { + csvIdx = idx + } + + csvName := relatedCSVs[csvIdx].Object.Metadata.Name + + matches := unreferencedCSVRegex.FindStringSubmatch(subResFailed.Message) + if len(matches) < 2 { + opLog.V(1).Info("Subscription condition does not match pattern for an unreferenced CSV", + "subscriptionConditionMessage", subResFailed.Message) + + return mergedSub, nil, changed, nil + } + + if csvName != matches[1] { + opLog.V(1).Info("The CSV found in the Subscription condition does not match the CSV found by label", + "subscriptionConditionMessage", subResFailed.Message, "csvName", csvName, "csvInSub", matches[1]) + + return mergedSub, nil, changed, nil + } + + mergedSub.Status = operatorv1alpha1.SubscriptionStatus{ + CurrentCSV: csvName, + InstalledCSV: csvName, + State: operatorv1alpha1.SubscriptionStateAtLatest, + LastUpdated: metav1.Now(), + } + + opLog.Info("Updating Subscription status to point to CSV", "csvName", csvName) + + if err := r.TargetClient.Status().Update(ctx, mergedSub); err != nil { + return mergedSub, nil, changed, + fmt.Errorf("error updating the Subscription status to point to the CSV: %w", err) + } + + mergedSub.SetGroupVersionKind(subscriptionGVK) // Update might strip this information + + updateStatus(policy, updatedCond("Subscription"), updatedObj(mergedSub)) + + return mergedSub, nil, true, nil } func (r *OperatorPolicyReconciler) mustnothaveSubscription( @@ -1222,6 +1315,8 @@ func (r *OperatorPolicyReconciler) mustnothaveSubscription( desiredSub *operatorv1alpha1.Subscription, foundUnstructSub *unstructured.Unstructured, ) (*operatorv1alpha1.Subscription, []metav1.Condition, bool, error) { + policy.Status.SubscriptionInterventionTime = nil + if foundUnstructSub == nil { // Missing Subscription: report Compliance changed := updateStatus(policy, missingNotWantedCond("Subscription"), missingNotWantedObj(desiredSub)) @@ -1522,8 +1617,11 @@ func (r *OperatorPolicyReconciler) handleCSV( var foundCSV *operatorv1alpha1.ClusterServiceVersion + relatedCSVs := make([]policyv1.RelatedObject, 0) + for _, csv := range csvList { - if csv.GetName() == sub.Status.InstalledCSV { + // If the subscription does not know about the CSV, this can report multiple CSVs as related + if sub.Status.InstalledCSV == "" || sub.Status.InstalledCSV == csv.GetName() { matchedCSV := operatorv1alpha1.ClusterServiceVersion{} err = runtime.DefaultUnstructuredConverter.FromUnstructured(csv.UnstructuredContent(), &matchedCSV) @@ -1531,9 +1629,11 @@ func (r *OperatorPolicyReconciler) handleCSV( return nil, nil, false, err } - foundCSV = &matchedCSV + relatedCSVs = append(relatedCSVs, existingCSVObj(&matchedCSV)) - break + if sub.Status.InstalledCSV == csv.GetName() { + foundCSV = &matchedCSV + } } } @@ -1545,13 +1645,14 @@ func (r *OperatorPolicyReconciler) handleCSV( // CSV has not yet been created by OLM if foundCSV == nil { - changed := updateStatus(policy, - missingWantedCond("ClusterServiceVersion"), missingCSVObj(sub.Name, sub.Namespace)) + if len(relatedCSVs) == 0 { + relatedCSVs = append(relatedCSVs, missingCSVObj(sub.Name, sub.Namespace)) + } - return foundCSV, nil, changed, nil + return foundCSV, nil, updateStatus(policy, missingWantedCond("ClusterServiceVersion"), relatedCSVs...), nil } - return foundCSV, nil, updateStatus(policy, buildCSVCond(foundCSV), existingCSVObj(foundCSV)), nil + return foundCSV, nil, updateStatus(policy, buildCSVCond(foundCSV), relatedCSVs...), nil } func (r *OperatorPolicyReconciler) mustnothaveCSV( diff --git a/deploy/crds/kustomize_operatorpolicy/policy.open-cluster-management.io_operatorpolicies.yaml b/deploy/crds/kustomize_operatorpolicy/policy.open-cluster-management.io_operatorpolicies.yaml index a3a4fda7..84ed9699 100644 --- a/deploy/crds/kustomize_operatorpolicy/policy.open-cluster-management.io_operatorpolicies.yaml +++ b/deploy/crds/kustomize_operatorpolicy/policy.open-cluster-management.io_operatorpolicies.yaml @@ -293,6 +293,14 @@ spec: resolvedSubscriptionLabel: description: The resolved name.namespace of the subscription type: string + subscriptionInterventionTime: + description: |- + Timestamp for a possible intervention to help a Subscription stuck with a + ConstraintsNotSatisfiable condition. Can be in the future, indicating the + policy is waiting for OLM to resolve the situation. If in the recent past, + the policy may update the status of the Subscription. + format: date-time + type: string type: object type: object served: true diff --git a/deploy/crds/policy.open-cluster-management.io_operatorpolicies.yaml b/deploy/crds/policy.open-cluster-management.io_operatorpolicies.yaml index c77382e7..92fc655b 100644 --- a/deploy/crds/policy.open-cluster-management.io_operatorpolicies.yaml +++ b/deploy/crds/policy.open-cluster-management.io_operatorpolicies.yaml @@ -286,6 +286,14 @@ spec: resolvedSubscriptionLabel: description: The resolved name.namespace of the subscription type: string + subscriptionInterventionTime: + description: |- + Timestamp for a possible intervention to help a Subscription stuck with a + ConstraintsNotSatisfiable condition. Can be in the future, indicating the + policy is waiting for OLM to resolve the situation. If in the recent past, + the policy may update the status of the Subscription. + format: date-time + type: string type: object type: object served: true diff --git a/test/e2e/case38_install_operator_test.go b/test/e2e/case38_install_operator_test.go index 9cd3a09f..db24f6a6 100644 --- a/test/e2e/case38_install_operator_test.go +++ b/test/e2e/case38_install_operator_test.go @@ -9,6 +9,7 @@ import ( "slices" "strconv" "strings" + "time" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" @@ -35,7 +36,15 @@ var _ = Describe("Testing OperatorPolicy", Ordered, Label("supports-hosted"), fu ) // checks that the compliance state eventually matches what is desired - checkCompliance := func(polName string, ns string, timeoutSeconds int, comp policyv1.ComplianceState) { + checkCompliance := func( + polName string, + ns string, + timeoutSeconds int, + comp policyv1.ComplianceState, + consistencyArgs ...interface{}, + ) { + GinkgoHelper() + var debugMessage string DeferCleanup(func() { @@ -44,7 +53,7 @@ var _ = Describe("Testing OperatorPolicy", Ordered, Label("supports-hosted"), fu } }) - EventuallyWithOffset(1, func(g Gomega) { + compCheck := func(g Gomega) { unstructPolicy, err := clientManagedDynamic.Resource(gvrOperatorPolicy).Namespace(ns). Get(context.TODO(), polName, metav1.GetOptions{}) g.Expect(err).NotTo(HaveOccurred()) @@ -61,7 +70,13 @@ var _ = Describe("Testing OperatorPolicy", Ordered, Label("supports-hosted"), fu g.Expect(err).NotTo(HaveOccurred()) g.Expect(policy.Status.ComplianceState).To(Equal(comp)) - }, timeoutSeconds, 3).Should(Succeed()) + } + + Eventually(compCheck, timeoutSeconds, 3).Should(Succeed()) + + if len(consistencyArgs) > 0 { + Consistently(compCheck, consistencyArgs...).Should(Succeed()) + } } // checks that the policy has the proper compliance, that the relatedObjects of a given @@ -3239,4 +3254,82 @@ var _ = Describe("Testing OperatorPolicy", Ordered, Label("supports-hosted"), fu ) }) }) + Describe("Testing recovery of sub-csv connection", Ordered, func() { + const ( + opPolYAML = "../resources/case38_operator_install/operator-policy-no-group-enforce.yaml" + opPolName = "oppol-no-group-enforce" + subName = "project-quay" + ) + + scenarioTriggered := true + + BeforeAll(func() { + preFunc() + + createObjWithParent(parentPolicyYAML, parentPolicyName, + opPolYAML, testNamespace, gvrPolicy, gvrOperatorPolicy) + }) + + BeforeEach(func() { + if !scenarioTriggered { + Skip("test scenario was unable to be triggered") + } + }) + + It("should get the 'csv exists and is not referenced' condition", func(ctx SpecContext) { + scenarioTriggered = false + + By("Verifying the policy starts compliant") + checkCompliance(opPolName, testNamespace, olmWaitTimeout*2, policyv1.Compliant) + + By("Periodically deleting the subscription and checking the status") + scenarioDeadline := time.Now().Add(40 * time.Second) + + scenarioTriggerLoop: + for scenarioDeadline.After(time.Now()) { + KubectlTarget("delete", "subscription", subName, "-n", opPolTestNS) + time.Sleep(time.Second) + + sub, err := targetK8sDynamic.Resource(gvrSubscription).Namespace(opPolTestNS). + Get(ctx, subName, metav1.GetOptions{}) + if err != nil || sub == nil { + continue + } + + subConds, _, _ := unstructured.NestedSlice(sub.Object, "status", "conditions") + for _, cond := range subConds { + condMap, ok := cond.(map[string]interface{}) + if !ok { + continue + } + + if condType, _, _ := unstructured.NestedString(condMap, "type"); condType != "ResolutionFailed" { + continue + } + + if condStatus, _, _ := unstructured.NestedString(condMap, "status"); condStatus != "True" { + continue + } + + condMessage, _, _ := unstructured.NestedString(condMap, "message") + notRefRgx := regexp.MustCompile(`clusterserviceversion (\S*) exists and is not referenced`) + if notRefRgx.MatchString(condMessage) { + scenarioTriggered = true + + break scenarioTriggerLoop + } + } + + time.Sleep(5 * time.Second) + } + }) + + It("Verifies the policy eventually fixes the 'not referenced' condition", func() { + By("Sleeping 25s, since OperatorPolicy should wait a while before intervening") + time.Sleep(25 * time.Second) + + By("Verifying the policy becomes compliant") + checkCompliance(opPolName, testNamespace, 2*olmWaitTimeout, policyv1.Compliant, 30, 3) + }) + }) }) diff --git a/test/resources/case38_operator_install/operator-policy-no-group-enforce.yaml b/test/resources/case38_operator_install/operator-policy-no-group-enforce.yaml index 604f4077..400da462 100644 --- a/test/resources/case38_operator_install/operator-policy-no-group-enforce.yaml +++ b/test/resources/case38_operator_install/operator-policy-no-group-enforce.yaml @@ -20,5 +20,4 @@ spec: namespace: operator-policy-testns source: operatorhubio-catalog sourceNamespace: olm - startingCSV: quay-operator.v3.10.0 upgradeApproval: Automatic