diff --git a/api/v1beta1/operatorpolicy_types.go b/api/v1beta1/operatorpolicy_types.go index cf1a9a14..50ee7386 100644 --- a/api/v1beta1/operatorpolicy_types.go +++ b/api/v1beta1/operatorpolicy_types.go @@ -5,6 +5,7 @@ package v1beta1 import ( "strings" + "time" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" @@ -209,6 +210,12 @@ type OperatorPolicyStatus struct { // The list of overlapping OperatorPolicies (as name.namespace) which all manage the same // subscription, including this policy. When no overlapping is detected, this list will be empty. OverlappingPolicies []string `json:"overlappingPolicies,omitempty"` + + // Timestamp for a possible intervention to help a Subscription stuck with a + // ConstraintsNotSatisfiable condition. Can be in the future, indicating the + // policy is waiting for OLM to resolve the situation. If in the recent past, + // the policy may update the status of the Subscription. + SubscriptionInterventionTime *metav1.Time `json:"subscriptionInterventionTime,omitempty"` } // RelatedObjsOfKind iterates over the related objects in the status and returns a map of the index @@ -238,6 +245,25 @@ func (status OperatorPolicyStatus) GetCondition(condType string) (int, metav1.Co return -1, metav1.Condition{} } +// Returns true if the SubscriptionInterventionTime is far enough in the past +// to be considered expired, and therefore should be removed from the status. +func (status OperatorPolicyStatus) SubscriptionInterventionExpired() bool { + if status.SubscriptionInterventionTime == nil { + return false + } + + return status.SubscriptionInterventionTime.Time.Before(time.Now().Add(-10 * time.Second)) +} + +// Returns true if the SubscriptionInterventionTime is in the future. +func (status OperatorPolicyStatus) SubscriptionInterventionWaiting() bool { + if status.SubscriptionInterventionTime == nil { + return false + } + + return status.SubscriptionInterventionTime.Time.After(time.Now()) +} + // OperatorPolicy is the schema for the operatorpolicies API. You can use the operator policy to // manage operators by providing automation for their management and reporting on the status across // the various operator objects. diff --git a/api/v1beta1/zz_generated.deepcopy.go b/api/v1beta1/zz_generated.deepcopy.go index 76c5d43c..61572441 100644 --- a/api/v1beta1/zz_generated.deepcopy.go +++ b/api/v1beta1/zz_generated.deepcopy.go @@ -137,6 +137,10 @@ func (in *OperatorPolicyStatus) DeepCopyInto(out *OperatorPolicyStatus) { *out = make([]string, len(*in)) copy(*out, *in) } + if in.SubscriptionInterventionTime != nil { + in, out := &in.SubscriptionInterventionTime, &out.SubscriptionInterventionTime + *out = (*in).DeepCopy() + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new OperatorPolicyStatus. diff --git a/controllers/operatorpolicy_controller.go b/controllers/operatorpolicy_controller.go index 58a12c75..8e70a064 100644 --- a/controllers/operatorpolicy_controller.go +++ b/controllers/operatorpolicy_controller.go @@ -14,6 +14,7 @@ import ( "slices" "strconv" "strings" + "time" operatorv1 "github.com/operator-framework/api/pkg/operators/v1" operatorv1alpha1 "github.com/operator-framework/api/pkg/operators/v1alpha1" @@ -46,6 +47,7 @@ import ( const ( OperatorControllerName string = "operator-policy-controller" CatalogSourceReady string = "READY" + olmGracePeriod = 30 * time.Second ) var ( @@ -94,7 +96,8 @@ var ( Version: "v1", Resource: "packagemanifests", } - ErrPackageManifest = errors.New("") + ErrPackageManifest = errors.New("") + unreferencedCSVRegex = regexp.MustCompile(`clusterserviceversion (\S*) exists and is not referenced`) ) // OperatorPolicyReconciler reconciles a OperatorPolicy object @@ -168,7 +171,7 @@ var _ reconcile.Reconciler = &OperatorPolicyReconciler{} // For more details, check Reconcile and its Result here: // - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.14.4/pkg/reconcile func (r *OperatorPolicyReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { - OpLog := ctrl.LoggerFrom(ctx) + opLog := ctrl.LoggerFrom(ctx) policy := &policyv1beta1.OperatorPolicy{} watcher := opPolIdentifier(req.Namespace, req.Name) @@ -176,25 +179,27 @@ func (r *OperatorPolicyReconciler) Reconcile(ctx context.Context, req ctrl.Reque err := r.Get(ctx, req.NamespacedName, policy) if err != nil { if k8serrors.IsNotFound(err) { - OpLog.Info("Operator policy could not be found") + opLog.Info("Operator policy could not be found") err = r.DynamicWatcher.RemoveWatcher(watcher) if err != nil { - OpLog.Error(err, "Error updating dependency watcher. Ignoring the failure.") + opLog.Error(err, "Error updating dependency watcher. Ignoring the failure.") } return reconcile.Result{}, nil } - OpLog.Error(err, "Failed to get operator policy") + opLog.Error(err, "Failed to get operator policy") return reconcile.Result{}, err } + originalStatus := *policy.Status.DeepCopy() + // Start query batch for caching and watching related objects err = r.DynamicWatcher.StartQueryBatch(watcher) if err != nil { - OpLog.Error(err, "Could not start query batch for the watcher") + opLog.Error(err, "Could not start query batch for the watcher") return reconcile.Result{}, err } @@ -202,12 +207,12 @@ func (r *OperatorPolicyReconciler) Reconcile(ctx context.Context, req ctrl.Reque defer func() { err := r.DynamicWatcher.EndQueryBatch(watcher) if err != nil { - OpLog.Error(err, "Could not end query batch for the watcher") + opLog.Error(err, "Could not end query batch for the watcher") } }() // handle the policy - OpLog.Info("Reconciling OperatorPolicy") + opLog.Info("Reconciling OperatorPolicy") errs := make([]error, 0) @@ -216,14 +221,16 @@ func (r *OperatorPolicyReconciler) Reconcile(ctx context.Context, req ctrl.Reque errs = append(errs, err) } + if conditionChanged || !reflect.DeepEqual(policy.Status, originalStatus) { + if err := r.Status().Update(ctx, policy); err != nil { + errs = append(errs, err) + } + } + if conditionChanged { // Add an event for the "final" state of the policy, otherwise this only has the // "early" events (and possibly has zero events). conditionsToEmit = append(conditionsToEmit, calculateComplianceCondition(policy)) - - if err := r.Status().Update(ctx, policy); err != nil { - errs = append(errs, err) - } } for _, cond := range conditionsToEmit { @@ -232,7 +239,21 @@ func (r *OperatorPolicyReconciler) Reconcile(ctx context.Context, req ctrl.Reque } } - return reconcile.Result{}, utilerrors.NewAggregate(errs) + result := reconcile.Result{} + finalErr := utilerrors.NewAggregate(errs) + + if len(errs) == 0 { + // Schedule a requeue for the intervention. + // Note: this requeue will be superseded if the Subscription's status is flapping. + if policy.Status.SubscriptionInterventionWaiting() { + result.RequeueAfter = time.Until(policy.Status.SubscriptionInterventionTime.Add(time.Second)) + } + } + + opLog.Info("Reconciling complete", "finalErr", finalErr, + "conditionChanged", conditionChanged, "eventCount", len(conditionsToEmit)) + + return result, finalErr } // handleResources determines the current desired state based on the policy, and @@ -247,7 +268,7 @@ func (r *OperatorPolicyReconciler) Reconcile(ctx context.Context, req ctrl.Reque func (r *OperatorPolicyReconciler) handleResources(ctx context.Context, policy *policyv1beta1.OperatorPolicy) ( earlyComplianceEvents []metav1.Condition, condChanged bool, err error, ) { - OpLog := ctrl.LoggerFrom(ctx) + opLog := ctrl.LoggerFrom(ctx) earlyComplianceEvents = make([]metav1.Condition, 0) @@ -255,7 +276,7 @@ func (r *OperatorPolicyReconciler) handleResources(ctx context.Context, policy * condChanged = changed if err != nil { - OpLog.Error(err, "Error building desired resources") + opLog.Error(err, "Error building desired resources") return earlyComplianceEvents, condChanged, err } @@ -270,7 +291,7 @@ func (r *OperatorPolicyReconciler) handleResources(ctx context.Context, policy * condChanged = condChanged || changed if err != nil { - OpLog.Error(err, "Error handling OperatorGroup") + opLog.Error(err, "Error handling OperatorGroup") return earlyComplianceEvents, condChanged, err } @@ -280,7 +301,7 @@ func (r *OperatorPolicyReconciler) handleResources(ctx context.Context, policy * condChanged = condChanged || changed if err != nil { - OpLog.Error(err, "Error handling Subscription") + opLog.Error(err, "Error handling Subscription") return earlyComplianceEvents, condChanged, err } @@ -289,7 +310,7 @@ func (r *OperatorPolicyReconciler) handleResources(ctx context.Context, policy * condChanged = condChanged || changed if err != nil { - OpLog.Error(err, "Error handling InstallPlan") + opLog.Error(err, "Error handling InstallPlan") return earlyComplianceEvents, condChanged, err } @@ -299,7 +320,7 @@ func (r *OperatorPolicyReconciler) handleResources(ctx context.Context, policy * condChanged = condChanged || changed if err != nil { - OpLog.Error(err, "Error handling ClusterServiceVersions") + opLog.Error(err, "Error handling ClusterServiceVersions") return earlyComplianceEvents, condChanged, err } @@ -309,7 +330,7 @@ func (r *OperatorPolicyReconciler) handleResources(ctx context.Context, policy * condChanged = condChanged || changed if err != nil { - OpLog.Error(err, "Error handling CustomResourceDefinitions") + opLog.Error(err, "Error handling CustomResourceDefinitions") return earlyComplianceEvents, condChanged, err } @@ -318,7 +339,7 @@ func (r *OperatorPolicyReconciler) handleResources(ctx context.Context, policy * condChanged = condChanged || changed if err != nil { - OpLog.Error(err, "Error handling Deployments") + opLog.Error(err, "Error handling Deployments") return earlyComplianceEvents, condChanged, err } @@ -327,7 +348,7 @@ func (r *OperatorPolicyReconciler) handleResources(ctx context.Context, policy * condChanged = condChanged || changed if err != nil { - OpLog.Error(err, "Error handling CatalogSource") + opLog.Error(err, "Error handling CatalogSource") return earlyComplianceEvents, condChanged, err } @@ -347,17 +368,16 @@ func (r *OperatorPolicyReconciler) buildResources(ctx context.Context, policy *p *operatorv1alpha1.Subscription, *operatorv1.OperatorGroup, bool, error, ) { var returnedErr error + var tmplResolver *templates.TemplateResolver + opLog := ctrl.LoggerFrom(ctx) validationErrors := make([]error, 0) - disableTemplates := false if disableAnnotation, ok := policy.GetAnnotations()["policy.open-cluster-management.io/disable-templates"]; ok { disableTemplates, _ = strconv.ParseBool(disableAnnotation) // on error, templates will not be disabled } - var tmplResolver *templates.TemplateResolver - if !disableTemplates { var err error @@ -365,6 +385,8 @@ func (r *OperatorPolicyReconciler) buildResources(ctx context.Context, policy *p if err != nil { validationErrors = append(validationErrors, fmt.Errorf("unable to create template resolver: %w", err)) } + } else { + opLog.V(1).Info("Templates disabled by annotation") } sub, subErr := buildSubscription(policy, r.DefaultNamespace, tmplResolver) @@ -431,7 +453,7 @@ func (r *OperatorPolicyReconciler) checkSubOverlap( ) (statusChanged bool, validationErr error, apiErr error) { resolvedSubLabel := "" if sub != nil { - resolvedSubLabel = sub.Name + "." + sub.Namespace + resolvedSubLabel = opLabelName(sub.Name, sub.Namespace) } if policy.Status.ResolvedSubscriptionLabel != resolvedSubLabel { @@ -494,6 +516,7 @@ func (r *OperatorPolicyReconciler) checkSubOverlap( func (r *OperatorPolicyReconciler) applySubscriptionDefaults( ctx context.Context, subscription *operatorv1alpha1.Subscription, ) error { + opLog := ctrl.LoggerFrom(ctx) subSpec := subscription.Spec defaultsNeeded := subSpec.Channel == "" || subSpec.CatalogSource == "" || subSpec.CatalogSourceNamespace == "" @@ -502,6 +525,8 @@ func (r *OperatorPolicyReconciler) applySubscriptionDefaults( return nil } + opLog.V(1).Info("Determining defaults for the subscription based on the PackageManifest") + // PackageManifests come from an API server and not a Kubernetes resource, so the DynamicWatcher can't be used since // it utilizes watches. The namespace doesn't have any meaning but is required. packageManifest, err := r.DynamicClient.Resource(packageManifestGVR).Namespace("default").Get( @@ -761,6 +786,10 @@ func (r *OperatorPolicyReconciler) musthaveOpGroup( desiredOpGroup *operatorv1.OperatorGroup, foundOpGroups []unstructured.Unstructured, ) (bool, []metav1.Condition, bool, error) { + opLog := ctrl.LoggerFrom(ctx) + + opLog.V(2).Info("Entered musthaveOpGroup", "foundOpGroupsLen", len(foundOpGroups)) + switch len(foundOpGroups) { case 0: // Missing OperatorGroup: report NonCompliance @@ -776,7 +805,7 @@ func (r *OperatorPolicyReconciler) musthaveOpGroup( earlyConds = append(earlyConds, calculateComplianceCondition(policy)) } - err := r.createWithNamespace(ctx, policy, desiredOpGroup) + err := r.createWithNamespace(ctx, desiredOpGroup) if err != nil { return false, nil, changed, fmt.Errorf("error creating the OperatorGroup: %w", err) } @@ -861,6 +890,8 @@ func (r *OperatorPolicyReconciler) musthaveOpGroup( desiredOpGroup.ResourceVersion = opGroup.GetResourceVersion() + opLog.Info("Updating OperatorGroup to match desired state", "opGroupName", opGroup.GetName()) + err = r.TargetClient.Update(ctx, &opGroup) if err != nil { return false, nil, changed, fmt.Errorf("error updating the OperatorGroup: %w", err) @@ -879,9 +910,12 @@ func (r *OperatorPolicyReconciler) musthaveOpGroup( } // createWithNamespace will create the input object and the object's namespace if needed. -func (r *OperatorPolicyReconciler) createWithNamespace( - ctx context.Context, policy *policyv1beta1.OperatorPolicy, object client.Object, -) error { +func (r *OperatorPolicyReconciler) createWithNamespace(ctx context.Context, object client.Object) error { + opLog := ctrl.LoggerFrom(ctx) + + opLog.Info("Creating resource", "resourceGVK", object.GetObjectKind().GroupVersionKind(), + "resourceName", object.GetName(), "resourceNamespace", object.GetNamespace()) + err := r.TargetClient.Create(ctx, object) if err == nil { return nil @@ -892,7 +926,7 @@ func (r *OperatorPolicyReconciler) createWithNamespace( return err } - log.Info("Creating the namespace since it didn't exist", "name", object.GetNamespace(), "policy", policy.Name) + opLog.Info("Creating the namespace since it didn't exist", "name", object.GetNamespace()) ns := corev1.Namespace{ ObjectMeta: metav1.ObjectMeta{ @@ -1015,6 +1049,9 @@ func (r *OperatorPolicyReconciler) mustnothaveOpGroup( earlyConds = append(earlyConds, calculateComplianceCondition(policy)) } + opLog := ctrl.LoggerFrom(ctx) + opLog.Info("Deleting OperatorGroup", "opGroupName", desiredOpGroup.Name) + err := r.TargetClient.Delete(ctx, desiredOpGroup) if err != nil { return earlyConds, changed, fmt.Errorf("error deleting the OperatorGroup: %w", err) @@ -1060,6 +1097,8 @@ func (r *OperatorPolicyReconciler) musthaveSubscription( ogCorrect bool, ) (*operatorv1alpha1.Subscription, []metav1.Condition, bool, error) { if foundSub == nil { + policy.Status.SubscriptionInterventionTime = nil + // Missing Subscription: report NonCompliance changed := updateStatus(policy, missingWantedCond("Subscription"), missingWantedObj(desiredSub)) @@ -1074,7 +1113,7 @@ func (r *OperatorPolicyReconciler) musthaveSubscription( earlyConds = append(earlyConds, calculateComplianceCondition(policy)) } - err := r.createWithNamespace(ctx, policy, desiredSub) + err := r.createWithNamespace(ctx, desiredSub) if err != nil { return nil, nil, changed, fmt.Errorf("error creating the Subscription: %w", err) } @@ -1112,38 +1151,19 @@ func (r *OperatorPolicyReconciler) musthaveSubscription( if !updateNeeded { subResFailed := mergedSub.Status.GetCondition(operatorv1alpha1.SubscriptionResolutionFailed) - // OLM includes the status of all subscriptions in the namespace. For example, if you have two subscriptions, - // where one is referencing a valid operator and the other isn't, both will have a failed subscription - // resolution condition. if subResFailed.Status == corev1.ConditionTrue { - includesSubscription, err := messageIncludesSubscription(mergedSub, subResFailed.Message) - if err != nil { - log.Info( - "Failed to determine if the condition applied to this subscription. Assuming it does.", - "error", err.Error(), "subscription", mergedSub.Name, "package", mergedSub.Spec.Package, - "message", subResFailed.Message, - ) - - includesSubscription = true - } - - if includesSubscription { - // a "constraints not satisfiable" message has nondeterministic clauses, and thus - // need to be sorted in order to check that they are not duplicates of the current message. - if constraintMessageMatch(policy, &subResFailed) { - return mergedSub, nil, false, nil - } - - return mergedSub, nil, updateStatus( - policy, subResFailedCond(subResFailed), nonCompObj(foundSub, subResFailed.Reason)), nil - } + return r.considerResolutionFailed(ctx, policy, mergedSub) + } - return mergedSub, nil, false, nil + if policy.Status.SubscriptionInterventionExpired() { + policy.Status.SubscriptionInterventionTime = nil } return mergedSub, nil, updateStatus(policy, matchesCond("Subscription"), matchedObj(foundSub)), nil } + policy.Status.SubscriptionInterventionTime = nil + // Specs don't match. if policy.Spec.RemediationAction.IsEnforce() && skipUpdate { changed := updateStatus(policy, mismatchCondUnfixable("Subscription"), mismatchedObj(foundSub)) @@ -1163,6 +1183,10 @@ func (r *OperatorPolicyReconciler) musthaveSubscription( earlyConds = append(earlyConds, calculateComplianceCondition(policy)) } + opLog := ctrl.LoggerFrom(ctx) + opLog.Info("Updating Subscription to match the desired state", "subName", foundSub.GetName(), + "subNamespace", foundSub.GetNamespace()) + err = r.TargetClient.Update(ctx, foundSub) if err != nil { return mergedSub, nil, changed, fmt.Errorf("error updating the Subscription: %w", err) @@ -1175,12 +1199,135 @@ func (r *OperatorPolicyReconciler) musthaveSubscription( return mergedSub, earlyConds, true, nil } +func (r *OperatorPolicyReconciler) considerResolutionFailed( + ctx context.Context, + policy *policyv1beta1.OperatorPolicy, + mergedSub *operatorv1alpha1.Subscription, +) (*operatorv1alpha1.Subscription, []metav1.Condition, bool, error) { + opLog := ctrl.LoggerFrom(ctx) + subResFailed := mergedSub.Status.GetCondition(operatorv1alpha1.SubscriptionResolutionFailed) + + // The resolution failed, but OLM includes the status of all subscriptions in the namespace. + // For example, if you have two subscriptions, where one is referencing a valid operator and the other isn't, + // both will have a failed subscription resolution condition. So check for 'this' subscription. + includesSubscription, err := messageIncludesSubscription(mergedSub, subResFailed.Message) + if err != nil { + opLog.Info("Failed to determine if the condition applied to this subscription. Assuming it does.", + "error", err.Error(), "subscription", mergedSub.Name, "package", mergedSub.Spec.Package, + "message", subResFailed.Message) + + includesSubscription = true + } + + if !includesSubscription { + if policy.Status.SubscriptionInterventionExpired() { + policy.Status.SubscriptionInterventionTime = nil + } + + return mergedSub, nil, false, nil + } + + // Handle non-ConstraintsNotSatisfiable reasons separately + if !strings.EqualFold(subResFailed.Reason, "ConstraintsNotSatisfiable") { + changed := updateStatus(policy, subResFailedCond(subResFailed), nonCompObj(mergedSub, subResFailed.Reason)) + + if policy.Status.SubscriptionInterventionExpired() { + policy.Status.SubscriptionInterventionTime = nil + } + + return mergedSub, nil, changed, nil + } + + // A "constraints not satisfiable" message has nondeterministic clauses, and can be noisy with a list of versions. + // Just report a generic condition, which will prevent the OperatorPolicy status from constantly updating + // when the details in the Subscription status change. + changed := updateStatus(policy, subConstraintsNotSatisfiableCond, + nonCompObj(mergedSub, "ConstraintsNotSatisfiable")) + + unrefCSVMatches := unreferencedCSVRegex.FindStringSubmatch(subResFailed.Message) + if len(unrefCSVMatches) < 2 { + opLog.V(1).Info("Subscription condition does not match pattern for an unreferenced CSV", + "subscriptionConditionMessage", subResFailed.Message) + + if policy.Status.SubscriptionInterventionExpired() { + policy.Status.SubscriptionInterventionTime = nil + } + + return mergedSub, nil, changed, nil + } + + if policy.Status.SubscriptionInterventionExpired() || policy.Status.SubscriptionInterventionTime == nil { + interventionTime := metav1.Time{Time: time.Now().Add(olmGracePeriod)} + policy.Status.SubscriptionInterventionTime = &interventionTime + + opLog.V(1).Info("Detected ConstraintsNotSatisfiable, setting an intervention time", + "interventionTime", interventionTime, "subscription", mergedSub) + + return mergedSub, nil, changed, nil + } + + if policy.Status.SubscriptionInterventionWaiting() { + opLog.V(1).Info("Detected ConstraintsNotSatisfiable, giving OLM more time before possibly intervening", + "interventionTime", policy.Status.SubscriptionInterventionTime) + + return mergedSub, nil, changed, nil + } + + // Do the "intervention" + + watcher := opPolIdentifier(policy.Namespace, policy.Name) + + existingCSV, err := r.DynamicWatcher.Get(watcher, clusterServiceVersionGVK, mergedSub.Namespace, unrefCSVMatches[1]) + if err != nil { + return mergedSub, nil, changed, fmt.Errorf("error getting the existing CSV in the subscription status: %w", err) + } + + if existingCSV == nil { + opLog.Info("The CSV mentioned in the subscription status could not be found, not intervening", + "subscriptionConditionMessage", subResFailed.Message, "csvName", unrefCSVMatches[1]) + + return mergedSub, nil, changed, nil + } + + // This check is based on the olm check, but does not require fully unmarshalling the csv. + reason, found, err := unstructured.NestedString(existingCSV.Object, "status", "reason") + hasReasonCopied := found && err == nil && reason == string(operatorv1alpha1.CSVReasonCopied) + + if hasReasonCopied || operatorv1alpha1.IsCopied(existingCSV) { + opLog.Info("The CSV mentioned in the subscription status is a copy, not intervening", + "subscriptionConditionMessage", subResFailed.Message, "csvName", unrefCSVMatches[1]) + + return mergedSub, nil, changed, nil + } + + if mergedSub.Status.LastUpdated.IsZero() { + mergedSub.Status.LastUpdated = metav1.Now() + } + + mergedSub.Status.CurrentCSV = existingCSV.GetName() + + opLog.Info("Updating Subscription status to point to CSV", "csvName", existingCSV.GetName()) + + if err := r.TargetClient.Status().Update(ctx, mergedSub); err != nil { + return mergedSub, nil, changed, + fmt.Errorf("error updating the Subscription status to point to the CSV: %w", err) + } + + mergedSub.SetGroupVersionKind(subscriptionGVK) // Update might strip this information + + updateStatus(policy, updatedCond("Subscription"), updatedObj(mergedSub)) + + return mergedSub, nil, true, nil +} + func (r *OperatorPolicyReconciler) mustnothaveSubscription( ctx context.Context, policy *policyv1beta1.OperatorPolicy, desiredSub *operatorv1alpha1.Subscription, foundUnstructSub *unstructured.Unstructured, ) (*operatorv1alpha1.Subscription, []metav1.Condition, bool, error) { + policy.Status.SubscriptionInterventionTime = nil + if foundUnstructSub == nil { // Missing Subscription: report Compliance changed := updateStatus(policy, missingNotWantedCond("Subscription"), missingNotWantedObj(desiredSub)) @@ -1217,6 +1364,10 @@ func (r *OperatorPolicyReconciler) mustnothaveSubscription( earlyConds = append(earlyConds, calculateComplianceCondition(policy)) } + opLog := ctrl.LoggerFrom(ctx) + opLog.Info("Deleting Subscription", "subName", foundUnstructSub.GetName(), + "subNamespace", foundUnstructSub.GetNamespace()) + err := r.TargetClient.Delete(ctx, foundUnstructSub) if err != nil { return foundSub, earlyConds, changed, fmt.Errorf("error deleting the Subscription: %w", err) @@ -1252,37 +1403,6 @@ func messageIncludesSubscription(subscription *operatorv1alpha1.Subscription, me return regexp.MatchString(regex, message) } -// constraintMessageMatch checks if the ConstraintsNotSatisfiable message is actually different -// from the old one by sorting the clauses of the message -func constraintMessageMatch(policy *policyv1beta1.OperatorPolicy, cond *operatorv1alpha1.SubscriptionCondition) bool { - const cnfPrefix = "constraints not satisfiable: " - - var policyMessage, subMessage string - - for _, statusCond := range policy.Status.Conditions { - if strings.Contains(statusCond.Message, cnfPrefix) { - policyMessage = statusCond.Message - } - } - - if policyMessage == "" || !strings.Contains(cond.Message, cnfPrefix) { - return false - } - - policyMessage = strings.TrimPrefix(policyMessage, cnfPrefix) - subMessage = strings.TrimPrefix(cond.Message, cnfPrefix) - - // The ConstraintsNotSatisfiable message is always formatted as follows: - // constraints not satisfiable: clause1, clause2, clause3 ... - policyMessageSlice := strings.Split(policyMessage, ", ") - slices.Sort(policyMessageSlice) - - subMessageSlice := strings.Split(subMessage, ", ") - slices.Sort(subMessageSlice) - - return reflect.DeepEqual(policyMessageSlice, subMessageSlice) -} - func (r *OperatorPolicyReconciler) handleInstallPlan( ctx context.Context, policy *policyv1beta1.OperatorPolicy, sub *operatorv1alpha1.Subscription, ) (bool, error) { @@ -1321,7 +1441,7 @@ func (r *OperatorPolicyReconciler) musthaveInstallPlan( sub *operatorv1alpha1.Subscription, ownedInstallPlans []unstructured.Unstructured, ) (bool, error) { - OpLog := ctrl.LoggerFrom(ctx) + opLog := ctrl.LoggerFrom(ctx) relatedInstallPlans := make([]policyv1.RelatedObject, 0, len(ownedInstallPlans)) ipsRequiringApproval := make([]unstructured.Unstructured, 0) anyInstalling := false @@ -1336,7 +1456,7 @@ func (r *OperatorPolicyReconciler) musthaveInstallPlan( } if err != nil { - OpLog.Error(err, "Unable to determine the phase of the related InstallPlan", + opLog.Error(err, "Unable to determine the phase of the related InstallPlan", "InstallPlan.Name", installPlan.GetName()) // The InstallPlan will be added as unknown @@ -1361,6 +1481,9 @@ func (r *OperatorPolicyReconciler) musthaveInstallPlan( existingInstallPlanObj(&ownedInstallPlans[i], phase, complianceConfig)) } + opLog.V(2).Info("InstallPlans examined", "currentPlanFailed", currentPlanFailed, "anyInstalling", anyInstalling, + "ipsRequiringApprovalLen", len(ipsRequiringApproval)) + if currentPlanFailed { return updateStatus(policy, installPlanFailed, relatedInstallPlans...), nil } @@ -1383,7 +1506,7 @@ func (r *OperatorPolicyReconciler) musthaveInstallPlan( } if err != nil { - OpLog.Error(err, "Unable to determine the csv names of the related InstallPlan", + opLog.Error(err, "Unable to determine the csv names of the related InstallPlan", "InstallPlan.Name", installPlan.GetName()) csvNames = []string{"unknown"} @@ -1413,7 +1536,7 @@ func (r *OperatorPolicyReconciler) musthaveInstallPlan( } if err != nil { - OpLog.Error(err, "Unable to determine the csv names of the related InstallPlan", + opLog.Error(err, "Unable to determine the csv names of the related InstallPlan", "InstallPlan.Name", installPlan.GetName()) continue @@ -1446,6 +1569,8 @@ func (r *OperatorPolicyReconciler) musthaveInstallPlan( } } + opLog.V(2).Info("Determined approvable InstallPlans", "count", len(approvableInstallPlans)) + if len(approvableInstallPlans) != 1 { changed := updateStatus( policy, @@ -1456,6 +1581,9 @@ func (r *OperatorPolicyReconciler) musthaveInstallPlan( return changed, nil } + opLog.Info("Approving InstallPlan", "InstallPlanName", approvableInstallPlans[0].GetName(), + "InstallPlanNamespace", approvableInstallPlans[0].GetNamespace()) + if err := unstructured.SetNestedField(approvableInstallPlans[0].Object, true, "spec", "approved"); err != nil { return false, fmt.Errorf("error approving InstallPlan: %w", err) } @@ -1504,8 +1632,11 @@ func (r *OperatorPolicyReconciler) handleCSV( var foundCSV *operatorv1alpha1.ClusterServiceVersion + relatedCSVs := make([]policyv1.RelatedObject, 0) + for _, csv := range csvList { - if csv.GetName() == sub.Status.InstalledCSV { + // If the subscription does not know about the CSV, this can report multiple CSVs as related + if sub.Status.InstalledCSV == "" || sub.Status.InstalledCSV == csv.GetName() { matchedCSV := operatorv1alpha1.ClusterServiceVersion{} err = runtime.DefaultUnstructuredConverter.FromUnstructured(csv.UnstructuredContent(), &matchedCSV) @@ -1513,9 +1644,11 @@ func (r *OperatorPolicyReconciler) handleCSV( return nil, nil, false, err } - foundCSV = &matchedCSV + relatedCSVs = append(relatedCSVs, existingCSVObj(&matchedCSV)) - break + if sub.Status.InstalledCSV == csv.GetName() { + foundCSV = &matchedCSV + } } } @@ -1527,13 +1660,14 @@ func (r *OperatorPolicyReconciler) handleCSV( // CSV has not yet been created by OLM if foundCSV == nil { - changed := updateStatus(policy, - missingWantedCond("ClusterServiceVersion"), missingCSVObj(sub.Name, sub.Namespace)) + if len(relatedCSVs) == 0 { + relatedCSVs = append(relatedCSVs, missingCSVObj(sub.Name, sub.Namespace)) + } - return foundCSV, nil, changed, nil + return foundCSV, nil, updateStatus(policy, missingWantedCond("ClusterServiceVersion"), relatedCSVs...), nil } - return foundCSV, nil, updateStatus(policy, buildCSVCond(foundCSV), existingCSVObj(foundCSV)), nil + return foundCSV, nil, updateStatus(policy, buildCSVCond(foundCSV), relatedCSVs...), nil } func (r *OperatorPolicyReconciler) mustnothaveCSV( @@ -1542,6 +1676,8 @@ func (r *OperatorPolicyReconciler) mustnothaveCSV( csvList []unstructured.Unstructured, namespace string, ) ([]metav1.Condition, bool, error) { + opLog := ctrl.LoggerFrom(ctx) + if len(csvList) == 0 { changed := updateStatus(policy, missingNotWantedCond("ClusterServiceVersion"), missingNotWantedCSVObj(namespace)) @@ -1581,7 +1717,10 @@ func (r *OperatorPolicyReconciler) mustnothaveCSV( anyAlreadyDeleting := false for i := range csvList { - if csvList[i].GetDeletionTimestamp() != nil { + if deletionTS := csvList[i].GetDeletionTimestamp(); deletionTS != nil { + opLog.V(1).Info("Found DeletionTimestamp on ClusterServiceVersion", "csvName", csvList[i].GetName(), + "csvNamespace", csvList[i].GetNamespace(), "deletionTimestamp", deletionTS) + anyAlreadyDeleting = true relatedCSVs[i] = deletingObj(&csvList[i]) @@ -1598,6 +1737,9 @@ func (r *OperatorPolicyReconciler) mustnothaveCSV( continue } + opLog.Info("Deleting ClusterServiceVersion", "csvName", csvList[i].GetName(), + "csvNamespace", csvList[i].GetNamespace()) + err := r.TargetClient.Delete(ctx, &csvList[i]) if err != nil { changed := updateStatus(policy, foundNotWantedCond("ClusterServiceVersion", csvNames...), relatedCSVs...) @@ -1641,7 +1783,7 @@ func (r *OperatorPolicyReconciler) handleDeployment( return updateStatus(policy, notApplicableCond("Deployment")), nil } - OpLog := ctrl.LoggerFrom(ctx) + opLog := ctrl.LoggerFrom(ctx) watcher := opPolIdentifier(policy.Namespace, policy.Name) @@ -1670,7 +1812,7 @@ func (r *OperatorPolicyReconciler) handleDeployment( err = runtime.DefaultUnstructuredConverter.FromUnstructured(unstructured, &dep) if err != nil { - OpLog.Error(err, "Unable to convert unstructured Deployment to typed", "Deployment.Name", dep.Name) + opLog.Error(err, "Unable to convert unstructured Deployment to typed", "Deployment.Name", dep.Name) continue } @@ -1706,6 +1848,7 @@ func (r *OperatorPolicyReconciler) handleCRDs( return nil, updateStatus(policy, noCRDCond, noExistingCRDObj), nil } + opLog := ctrl.LoggerFrom(ctx) watcher := opPolIdentifier(policy.Namespace, policy.Name) selector := subLabelSelector(sub) @@ -1756,7 +1899,10 @@ func (r *OperatorPolicyReconciler) handleCRDs( anyAlreadyDeleting := false for i := range crdList { - if crdList[i].GetDeletionTimestamp() != nil { + if deletionTS := crdList[i].GetDeletionTimestamp(); deletionTS != nil { + opLog.V(1).Info("Found DeletionTimestamp on CustomResourceDefinition", "crdName", crdList[i].GetName(), + "deletionTimestamp", deletionTS) + anyAlreadyDeleting = true relatedCRDs[i] = deletingObj(&crdList[i]) @@ -1770,6 +1916,8 @@ func (r *OperatorPolicyReconciler) handleCRDs( continue } + opLog.Info("Deleting CustomResourceDefinition", "crdName", crdList[i].GetName()) + err := r.TargetClient.Delete(ctx, &crdList[i]) if err != nil { changed := updateStatus(policy, foundNotWantedCond("CustomResourceDefinition"), relatedCRDs...) @@ -1963,7 +2111,7 @@ func (r *OperatorPolicyReconciler) mergeObjects( func subLabelSelector(sub *operatorv1alpha1.Subscription) labels.Selector { sel, err := metav1.LabelSelectorAsSelector(&metav1.LabelSelector{ MatchExpressions: []metav1.LabelSelectorRequirement{{ - Key: fmt.Sprintf("operators.coreos.com/%v.%v", sub.Name, sub.Namespace), + Key: "operators.coreos.com/" + opLabelName(sub.Name, sub.Namespace), Operator: metav1.LabelSelectorOpExists, }}, }) @@ -1973,3 +2121,30 @@ func subLabelSelector(sub *operatorv1alpha1.Subscription) labels.Selector { return sel } + +// opLabelName returns the 'name' part of label put on operator resources by OLM. This is the part +// of the label after the slash, based on the name and namespace of the operator. It is limited to +// 63 characters in length, and guaranteed to end with an alphanumeric character. See +// https://github.com/operator-framework/operator-lifecycle-manager/blob/556637d4144fb782e93c207f55975b743ec2d419/pkg/controller/operators/decorators/operator.go#L127 +func opLabelName(name, namespace string) string { + labelName := name + "." + namespace + + if len(labelName) > 63 { + // Truncate + labelName = labelName[0:63] + + // Remove trailing illegal characters + idx := len(labelName) - 1 + for ; idx >= 0; idx-- { + lastChar := labelName[idx] + if lastChar != '.' && lastChar != '_' && lastChar != '-' { + break + } + } + + // Update Label + labelName = labelName[0 : idx+1] + } + + return labelName +} diff --git a/controllers/operatorpolicy_controller_test.go b/controllers/operatorpolicy_controller_test.go index d0ec4a81..677bb3ad 100644 --- a/controllers/operatorpolicy_controller_test.go +++ b/controllers/operatorpolicy_controller_test.go @@ -258,61 +258,3 @@ func TestMessageIncludesSubscription(t *testing.T) { ) } } - -func TestMessageContentOrderMatching(t *testing.T) { - t.Parallel() - - testPolicy := &policyv1beta1.OperatorPolicy{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-policy", - Namespace: "default", - }, - Spec: policyv1beta1.OperatorPolicySpec{ - Severity: "low", - RemediationAction: "enforce", - ComplianceType: "musthave", - Subscription: runtime.RawExtension{ - Raw: []byte(`{ - "source": "my-catalog", - "sourceNamespace": "my-ns", - "name": "my-operator", - "channel": "stable", - "startingCSV": "my-operator-v1" - }`), - }, - }, - Status: policyv1beta1.OperatorPolicyStatus{ - ComplianceState: "NonCompliant", - Conditions: []metav1.Condition{ - { - Type: "SubscriptionCompliant", - Status: "False", - ObservedGeneration: 0, - LastTransitionTime: metav1.Now(), - Reason: "ConstraintsNotSatisfiable", - Message: "constraints not satisfiable: " + - "no operators found in package gatekeeper-operator-product " + - "in the catalog referenced by subscription gatekeeper-operator-product, " + - "clusterserviceversion gatekeeper-operator-product.v3.11.1 exists " + - "and is not referenced by a subscription, " + - "subscription gatekeeper-operator-product4 exists", - }, - }, - }, - } - - testCond := &operatorv1alpha1.SubscriptionCondition{ - Type: "ResolutionFailed", - Status: "True", - Reason: "ConstraintsNotSatisfiable", - Message: "constraints not satisfiable: " + - "subscription gatekeeper-operator-product4 exists, " + - "no operators found in package gatekeeper-operator-product " + - "in the catalog referenced by subscription gatekeeper-operator-product, " + - "clusterserviceversion gatekeeper-operator-product.v3.11.1 exists " + - "and is not referenced by a subscription", - } - - ret := constraintMessageMatch(testPolicy, testCond) - assert.Equal(t, true, ret) -} diff --git a/controllers/operatorpolicy_status.go b/controllers/operatorpolicy_status.go index 657eb2f7..ee2b98dd 100644 --- a/controllers/operatorpolicy_status.go +++ b/controllers/operatorpolicy_status.go @@ -590,6 +590,15 @@ func validationCond(validationErrors []error) metav1.Condition { } } +// subConstraintsNotSatisfiableCond is a NonCompliant condition with Reason 'ConstraintsNotSatisfiable', +// and Message 'constraints not satisfiable: refer to the Subscription for more details'. +var subConstraintsNotSatisfiableCond = metav1.Condition{ + Type: subConditionType, + Status: metav1.ConditionFalse, + Reason: "ConstraintsNotSatisfiable", + Message: "constraints not satisfiable: refer to the Subscription for more details", +} + // subResFailedCond takes a failed SubscriptionCondition and converts it to a generic Condition func subResFailedCond(subFailedCond operatorv1alpha1.SubscriptionCondition) metav1.Condition { cond := metav1.Condition{ diff --git a/deploy/crds/kustomize_operatorpolicy/policy.open-cluster-management.io_operatorpolicies.yaml b/deploy/crds/kustomize_operatorpolicy/policy.open-cluster-management.io_operatorpolicies.yaml index bd3c7b5d..431ae2df 100644 --- a/deploy/crds/kustomize_operatorpolicy/policy.open-cluster-management.io_operatorpolicies.yaml +++ b/deploy/crds/kustomize_operatorpolicy/policy.open-cluster-management.io_operatorpolicies.yaml @@ -379,6 +379,14 @@ spec: resolvedSubscriptionLabel: description: The resolved name.namespace of the subscription type: string + subscriptionInterventionTime: + description: |- + Timestamp for a possible intervention to help a Subscription stuck with a + ConstraintsNotSatisfiable condition. Can be in the future, indicating the + policy is waiting for OLM to resolve the situation. If in the recent past, + the policy may update the status of the Subscription. + format: date-time + type: string type: object type: object served: true diff --git a/deploy/crds/policy.open-cluster-management.io_operatorpolicies.yaml b/deploy/crds/policy.open-cluster-management.io_operatorpolicies.yaml index 09c660e9..8c48cc3a 100644 --- a/deploy/crds/policy.open-cluster-management.io_operatorpolicies.yaml +++ b/deploy/crds/policy.open-cluster-management.io_operatorpolicies.yaml @@ -369,6 +369,14 @@ spec: resolvedSubscriptionLabel: description: The resolved name.namespace of the subscription type: string + subscriptionInterventionTime: + description: |- + Timestamp for a possible intervention to help a Subscription stuck with a + ConstraintsNotSatisfiable condition. Can be in the future, indicating the + policy is waiting for OLM to resolve the situation. If in the recent past, + the policy may update the status of the Subscription. + format: date-time + type: string type: object type: object served: true diff --git a/test/e2e/case38_install_operator_test.go b/test/e2e/case38_install_operator_test.go index 6fc6e597..460cec4d 100644 --- a/test/e2e/case38_install_operator_test.go +++ b/test/e2e/case38_install_operator_test.go @@ -9,6 +9,7 @@ import ( "slices" "strconv" "strings" + "time" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" @@ -35,7 +36,15 @@ var _ = Describe("Testing OperatorPolicy", Ordered, Label("supports-hosted"), fu ) // checks that the compliance state eventually matches what is desired - checkCompliance := func(polName string, ns string, timeoutSeconds int, comp policyv1.ComplianceState) { + checkCompliance := func( + polName string, + ns string, + timeoutSeconds int, + comp policyv1.ComplianceState, + consistencyArgs ...interface{}, + ) { + GinkgoHelper() + var debugMessage string DeferCleanup(func() { @@ -44,7 +53,7 @@ var _ = Describe("Testing OperatorPolicy", Ordered, Label("supports-hosted"), fu } }) - EventuallyWithOffset(1, func(g Gomega) { + compCheck := func(g Gomega) { unstructPolicy, err := clientManagedDynamic.Resource(gvrOperatorPolicy).Namespace(ns). Get(context.TODO(), polName, metav1.GetOptions{}) g.Expect(err).NotTo(HaveOccurred()) @@ -61,7 +70,13 @@ var _ = Describe("Testing OperatorPolicy", Ordered, Label("supports-hosted"), fu g.Expect(err).NotTo(HaveOccurred()) g.Expect(policy.Status.ComplianceState).To(Equal(comp)) - }, timeoutSeconds, 3).Should(Succeed()) + } + + Eventually(compCheck, timeoutSeconds, 3).Should(Succeed()) + + if len(consistencyArgs) > 0 { + Consistently(compCheck, consistencyArgs...).Should(Succeed()) + } } // checks that the policy has the proper compliance, that the relatedObjects of a given @@ -752,11 +767,10 @@ var _ = Describe("Testing OperatorPolicy", Ordered, Label("supports-hosted"), fu Reason: "ConstraintsNotSatisfiable", }}, metav1.Condition{ - Type: "SubscriptionCompliant", - Status: metav1.ConditionFalse, - Reason: "ConstraintsNotSatisfiable", - Message: "no operators found from catalog operatorhubio-catalog in namespace fake " + - "referenced by subscription project-quay", + Type: "SubscriptionCompliant", + Status: metav1.ConditionFalse, + Reason: "ConstraintsNotSatisfiable", + Message: "constraints not satisfiable: refer to the Subscription for more details", }, "the Subscription was updated to match the policy", ) @@ -922,12 +936,10 @@ var _ = Describe("Testing OperatorPolicy", Ordered, Label("supports-hosted"), fu Reason: "ConstraintsNotSatisfiable", }}, metav1.Condition{ - Type: "SubscriptionCompliant", - Status: metav1.ConditionFalse, - Reason: "ConstraintsNotSatisfiable", - Message: "constraints not satisfiable: no operators found in package project-quay-does-not-exist" + - " in the catalog referenced by subscription project-quay-does-not-exist, subscription " + - "project-quay-does-not-exist exists", + Type: "SubscriptionCompliant", + Status: metav1.ConditionFalse, + Reason: "ConstraintsNotSatisfiable", + Message: "constraints not satisfiable: refer to the Subscription for more details", }, "constraints not satisfiable", ) @@ -1261,12 +1273,10 @@ var _ = Describe("Testing OperatorPolicy", Ordered, Label("supports-hosted"), fu Reason: "ConstraintsNotSatisfiable", }}, metav1.Condition{ - Type: "SubscriptionCompliant", - Status: metav1.ConditionFalse, - Reason: "ConstraintsNotSatisfiable", - Message: "no operators found with name strimzi-cluster-operator.v0.0.0.1337 in channel " + - "strimzi-0.36.x of package strimzi-kafka-operator in the catalog referenced by " + - "subscription strimzi-kafka-operator", + Type: "SubscriptionCompliant", + Status: metav1.ConditionFalse, + Reason: "ConstraintsNotSatisfiable", + Message: "constraints not satisfiable: refer to the Subscription for more details", }, "constraints not satisfiable", ) @@ -3332,4 +3342,82 @@ var _ = Describe("Testing OperatorPolicy", Ordered, Label("supports-hosted"), fu ) }) }) + Describe("Testing recovery of sub-csv connection", Ordered, func() { + const ( + opPolYAML = "../resources/case38_operator_install/operator-policy-no-group-enforce.yaml" + opPolName = "oppol-no-group-enforce" + subName = "project-quay" + ) + + scenarioTriggered := true + + BeforeAll(func() { + preFunc() + + createObjWithParent(parentPolicyYAML, parentPolicyName, + opPolYAML, testNamespace, gvrPolicy, gvrOperatorPolicy) + }) + + BeforeEach(func() { + if !scenarioTriggered { + Skip("test scenario was unable to be triggered") + } + }) + + It("should get the 'csv exists and is not referenced' condition", func(ctx SpecContext) { + scenarioTriggered = false + + By("Verifying the policy starts compliant") + checkCompliance(opPolName, testNamespace, olmWaitTimeout*2, policyv1.Compliant) + + By("Periodically deleting the subscription and checking the status") + scenarioDeadline := time.Now().Add(40 * time.Second) + + scenarioTriggerLoop: + for scenarioDeadline.After(time.Now()) { + KubectlTarget("delete", "subscription", subName, "-n", opPolTestNS) + time.Sleep(time.Second) + + sub, err := targetK8sDynamic.Resource(gvrSubscription).Namespace(opPolTestNS). + Get(ctx, subName, metav1.GetOptions{}) + if err != nil || sub == nil { + continue + } + + subConds, _, _ := unstructured.NestedSlice(sub.Object, "status", "conditions") + for _, cond := range subConds { + condMap, ok := cond.(map[string]interface{}) + if !ok { + continue + } + + if condType, _, _ := unstructured.NestedString(condMap, "type"); condType != "ResolutionFailed" { + continue + } + + if condStatus, _, _ := unstructured.NestedString(condMap, "status"); condStatus != "True" { + continue + } + + condMessage, _, _ := unstructured.NestedString(condMap, "message") + notRefRgx := regexp.MustCompile(`clusterserviceversion (\S*) exists and is not referenced`) + if notRefRgx.MatchString(condMessage) { + scenarioTriggered = true + + break scenarioTriggerLoop + } + } + + time.Sleep(5 * time.Second) + } + }) + + It("Verifies the policy eventually fixes the 'not referenced' condition", func() { + By("Sleeping 25s, since OperatorPolicy should wait a while before intervening") + time.Sleep(25 * time.Second) + + By("Verifying the policy becomes compliant") + checkCompliance(opPolName, testNamespace, 2*olmWaitTimeout, policyv1.Compliant, 30, 3) + }) + }) }) diff --git a/test/resources/case38_operator_install/operator-policy-no-group-enforce.yaml b/test/resources/case38_operator_install/operator-policy-no-group-enforce.yaml index 604f4077..400da462 100644 --- a/test/resources/case38_operator_install/operator-policy-no-group-enforce.yaml +++ b/test/resources/case38_operator_install/operator-policy-no-group-enforce.yaml @@ -20,5 +20,4 @@ spec: namespace: operator-policy-testns source: operatorhubio-catalog sourceNamespace: olm - startingCSV: quay-operator.v3.10.0 upgradeApproval: Automatic