Skip to content

Commit

Permalink
Patch Subscription status when CSV is unreferenced
Browse files Browse the repository at this point in the history
In some situations when actions are happening quickly, OLM can lose the
connection between the Subscription and the ClusterServiceVersion, which
leads to a "constraints not satisfiable" condition on the Subscription.
It can be hard to reproduce the exact situation we've seen rarely in our
tests, but manually deleting and immediately recreating the subscription
causes a similar situation.

In this change, in those situations, the controller will intervene after
30 seconds by updating the Subscription status directly. The
implementation allows for a 10 second window for the intervention,
during which the controller may update the status multiple times, to
address a case where OLM immediately overwrote the update. If the window
is missed, the controller may schedule another time. This is intended to
give time to OLM to potentially resolve the situation on its own.

Refs:
 - https://issues.redhat.com/browse/ACM-11453

Signed-off-by: Justin Kulikauskas <jkulikau@redhat.com>
  • Loading branch information
JustinKuli committed Jun 3, 2024
1 parent 4d28db3 commit ce197ba
Show file tree
Hide file tree
Showing 7 changed files with 260 additions and 21 deletions.
26 changes: 26 additions & 0 deletions api/v1beta1/operatorpolicy_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ package v1beta1

import (
"strings"
"time"

metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
Expand Down Expand Up @@ -163,6 +164,12 @@ type OperatorPolicyStatus struct {
// The list of overlapping OperatorPolicies (as name.namespace) which all manage the same
// subscription, including this policy. When no overlapping is detected, this list will be empty.
OverlappingPolicies []string `json:"overlappingPolicies,omitempty"`

// Timestamp for a possible intervention to help a Subscription stuck with a
// ConstraintsNotSatisfiable condition. Can be in the future, indicating the
// policy is waiting for OLM to resolve the situation. If in the recent past,
// the policy may update the status of the Subscription.
SubscriptionInterventionTime *metav1.Time `json:"subscriptionInterventionTime,omitempty"`
}

func (status OperatorPolicyStatus) RelatedObjsOfKind(kind string) map[int]policyv1.RelatedObject {
Expand Down Expand Up @@ -190,6 +197,25 @@ func (status OperatorPolicyStatus) GetCondition(condType string) (int, metav1.Co
return -1, metav1.Condition{}
}

// Returns true if the SubscriptionInterventionTime is far enough in the past
// to be considered expired, and therefore should be removed from the status.
func (status OperatorPolicyStatus) SubscriptionInterventionExpired() bool {
if status.SubscriptionInterventionTime == nil {
return false
}

return status.SubscriptionInterventionTime.Time.Before(time.Now().Add(-10 * time.Second))
}

// Returns true if the SubscriptionInterventionTime is in the future.
func (status OperatorPolicyStatus) SubscriptionInterventionWaiting() bool {
if status.SubscriptionInterventionTime == nil {
return false
}

return status.SubscriptionInterventionTime.Time.After(time.Now())
}

//+kubebuilder:object:root=true
//+kubebuilder:subresource:status

Expand Down
4 changes: 4 additions & 0 deletions api/v1beta1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

135 changes: 118 additions & 17 deletions controllers/operatorpolicy_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (
"slices"
"strconv"
"strings"
"time"

operatorv1 "github.com/operator-framework/api/pkg/operators/v1"
operatorv1alpha1 "github.com/operator-framework/api/pkg/operators/v1alpha1"
Expand Down Expand Up @@ -46,6 +47,7 @@ import (
const (
OperatorControllerName string = "operator-policy-controller"
CatalogSourceReady string = "READY"
olmGracePeriod = 30 * time.Second
)

var (
Expand Down Expand Up @@ -94,7 +96,8 @@ var (
Version: "v1",
Resource: "packagemanifests",
}
ErrPackageManifest = errors.New("")
ErrPackageManifest = errors.New("")
unreferencedCSVRegex = regexp.MustCompile(`clusterserviceversion (\S*) exists and is not referenced`)
)

// OperatorPolicyReconciler reconciles a OperatorPolicy object
Expand Down Expand Up @@ -216,14 +219,14 @@ func (r *OperatorPolicyReconciler) Reconcile(ctx context.Context, req ctrl.Reque
errs = append(errs, err)
}

if err := r.Status().Update(ctx, policy); err != nil {
errs = append(errs, err)
}

if conditionChanged {
// Add an event for the "final" state of the policy, otherwise this only has the
// "early" events (and possibly has zero events).
conditionsToEmit = append(conditionsToEmit, calculateComplianceCondition(policy))

if err := r.Status().Update(ctx, policy); err != nil {
errs = append(errs, err)
}
}

for _, cond := range conditionsToEmit {
Expand All @@ -235,6 +238,14 @@ func (r *OperatorPolicyReconciler) Reconcile(ctx context.Context, req ctrl.Reque
result := reconcile.Result{}
finalErr := utilerrors.NewAggregate(errs)

if len(errs) == 0 {
// Schedule a requeue for the intervention.
// Note: this requeue will be superseded if the Subscription's status is flapping.
if policy.Status.SubscriptionInterventionWaiting() {
result.RequeueAfter = time.Until(policy.Status.SubscriptionInterventionTime.Add(time.Second))
}
}

opLog.Info("Reconciling complete", "finalErr", finalErr,
"conditionChanged", conditionChanged, "eventCount", len(conditionsToEmit))

Expand Down Expand Up @@ -1082,6 +1093,8 @@ func (r *OperatorPolicyReconciler) musthaveSubscription(
ogCorrect bool,
) (*operatorv1alpha1.Subscription, []metav1.Condition, bool, error) {
if foundSub == nil {
policy.Status.SubscriptionInterventionTime = nil

// Missing Subscription: report NonCompliance
changed := updateStatus(policy, missingWantedCond("Subscription"), missingWantedObj(desiredSub))

Expand Down Expand Up @@ -1135,12 +1148,18 @@ func (r *OperatorPolicyReconciler) musthaveSubscription(
subResFailed := mergedSub.Status.GetCondition(operatorv1alpha1.SubscriptionResolutionFailed)

if subResFailed.Status == corev1.ConditionTrue {
return r.considerResolutionFailed(ctx, policy, foundSub, mergedSub)
return r.considerResolutionFailed(ctx, policy, mergedSub)
}

if policy.Status.SubscriptionInterventionExpired() {
policy.Status.SubscriptionInterventionTime = nil
}

return mergedSub, nil, updateStatus(policy, matchesCond("Subscription"), matchedObj(foundSub)), nil
}

policy.Status.SubscriptionInterventionTime = nil

// Specs don't match.
if policy.Spec.RemediationAction.IsEnforce() && skipUpdate {
changed := updateStatus(policy, mismatchCondUnfixable("Subscription"), mismatchedObj(foundSub))
Expand Down Expand Up @@ -1179,7 +1198,6 @@ func (r *OperatorPolicyReconciler) musthaveSubscription(
func (r *OperatorPolicyReconciler) considerResolutionFailed(
ctx context.Context,
policy *policyv1beta1.OperatorPolicy,
foundSub *unstructured.Unstructured,
mergedSub *operatorv1alpha1.Subscription,
) (*operatorv1alpha1.Subscription, []metav1.Condition, bool, error) {
opLog := ctrl.LoggerFrom(ctx)
Expand All @@ -1198,22 +1216,97 @@ func (r *OperatorPolicyReconciler) considerResolutionFailed(
}

if !includesSubscription {
if policy.Status.SubscriptionInterventionExpired() {
policy.Status.SubscriptionInterventionTime = nil
}

return mergedSub, nil, false, nil
}

// Handle non-ConstraintsNotSatisfiable reasons separately
if !strings.EqualFold(subResFailed.Reason, "ConstraintsNotSatisfiable") {
changed := updateStatus(policy, subResFailedCond(subResFailed), nonCompObj(foundSub, subResFailed.Reason))
changed := updateStatus(policy, subResFailedCond(subResFailed), nonCompObj(mergedSub, subResFailed.Reason))

if policy.Status.SubscriptionInterventionExpired() {
policy.Status.SubscriptionInterventionTime = nil
}

return mergedSub, nil, changed, nil
}

// A "constraints not satisfiable" message has nondeterministic clauses, and can be noisy with a list of versions.
// Just report a generic condition, which will prevent the OperatorPolicy status from constantly updating
// when the details in the Subscription status change.
changed := updateStatus(policy, subConstraintsNotSatisfiableCond, nonCompObj(foundSub, "ConstraintsNotSatisfiable"))
changed := updateStatus(policy, subConstraintsNotSatisfiableCond,
nonCompObj(mergedSub, "ConstraintsNotSatisfiable"))

if policy.Status.SubscriptionInterventionExpired() || policy.Status.SubscriptionInterventionTime == nil {
interventionTime := metav1.Time{Time: time.Now().Add(olmGracePeriod)}
policy.Status.SubscriptionInterventionTime = &interventionTime

opLog.V(1).Info("Detected ConstraintsNotSatisfiable, setting an intervention time",
"interventionTime", interventionTime, "subscription", mergedSub)

return mergedSub, nil, changed, nil
}

if policy.Status.SubscriptionInterventionWaiting() {
opLog.V(1).Info("Detected ConstraintsNotSatisfiable, giving OLM more time before possibly intervening",
"interventionTime", policy.Status.SubscriptionInterventionTime)

return mergedSub, nil, changed, nil
}

// Do the "intervention"

return mergedSub, nil, changed, nil
relatedCSVs := policy.Status.RelatedObjsOfKind("ClusterServiceVersion")
if len(relatedCSVs) != 1 {
opLog.V(1).Info("No related CSVs to use for an intervention")

return mergedSub, nil, changed, nil
}

csvIdx := 0
for idx := range relatedCSVs {
csvIdx = idx
}

csvName := relatedCSVs[csvIdx].Object.Metadata.Name

matches := unreferencedCSVRegex.FindStringSubmatch(subResFailed.Message)
if len(matches) < 2 {
opLog.V(1).Info("Subscription condition does not match pattern for an unreferenced CSV",
"subscriptionConditionMessage", subResFailed.Message)

return mergedSub, nil, changed, nil
}

if csvName != matches[1] {
opLog.V(1).Info("The CSV found in the Subscription condition does not match the CSV found by label",
"subscriptionConditionMessage", subResFailed.Message, "csvName", csvName, "csvInSub", matches[1])

return mergedSub, nil, changed, nil
}

mergedSub.Status = operatorv1alpha1.SubscriptionStatus{
CurrentCSV: csvName,
InstalledCSV: csvName,
State: operatorv1alpha1.SubscriptionStateAtLatest,
LastUpdated: metav1.Now(),
}

opLog.Info("Updating Subscription status to point to CSV", "csvName", csvName)

if err := r.TargetClient.Status().Update(ctx, mergedSub); err != nil {
return mergedSub, nil, changed,
fmt.Errorf("error updating the Subscription status to point to the CSV: %w", err)
}

mergedSub.SetGroupVersionKind(subscriptionGVK) // Update might strip this information

updateStatus(policy, updatedCond("Subscription"), updatedObj(mergedSub))

return mergedSub, nil, true, nil
}

func (r *OperatorPolicyReconciler) mustnothaveSubscription(
Expand All @@ -1222,6 +1315,8 @@ func (r *OperatorPolicyReconciler) mustnothaveSubscription(
desiredSub *operatorv1alpha1.Subscription,
foundUnstructSub *unstructured.Unstructured,
) (*operatorv1alpha1.Subscription, []metav1.Condition, bool, error) {
policy.Status.SubscriptionInterventionTime = nil

if foundUnstructSub == nil {
// Missing Subscription: report Compliance
changed := updateStatus(policy, missingNotWantedCond("Subscription"), missingNotWantedObj(desiredSub))
Expand Down Expand Up @@ -1522,18 +1617,23 @@ func (r *OperatorPolicyReconciler) handleCSV(

var foundCSV *operatorv1alpha1.ClusterServiceVersion

relatedCSVs := make([]policyv1.RelatedObject, 0)

for _, csv := range csvList {
if csv.GetName() == sub.Status.InstalledCSV {
// If the subscription does not know about the CSV, this can report multiple CSVs as related
if sub.Status.InstalledCSV == "" || sub.Status.InstalledCSV == csv.GetName() {
matchedCSV := operatorv1alpha1.ClusterServiceVersion{}

err = runtime.DefaultUnstructuredConverter.FromUnstructured(csv.UnstructuredContent(), &matchedCSV)
if err != nil {
return nil, nil, false, err
}

foundCSV = &matchedCSV
relatedCSVs = append(relatedCSVs, existingCSVObj(&matchedCSV))

break
if sub.Status.InstalledCSV == csv.GetName() {
foundCSV = &matchedCSV
}
}
}

Expand All @@ -1545,13 +1645,14 @@ func (r *OperatorPolicyReconciler) handleCSV(

// CSV has not yet been created by OLM
if foundCSV == nil {
changed := updateStatus(policy,
missingWantedCond("ClusterServiceVersion"), missingCSVObj(sub.Name, sub.Namespace))
if len(relatedCSVs) == 0 {
relatedCSVs = append(relatedCSVs, missingCSVObj(sub.Name, sub.Namespace))
}

return foundCSV, nil, changed, nil
return foundCSV, nil, updateStatus(policy, missingWantedCond("ClusterServiceVersion"), relatedCSVs...), nil
}

return foundCSV, nil, updateStatus(policy, buildCSVCond(foundCSV), existingCSVObj(foundCSV)), nil
return foundCSV, nil, updateStatus(policy, buildCSVCond(foundCSV), relatedCSVs...), nil
}

func (r *OperatorPolicyReconciler) mustnothaveCSV(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -293,6 +293,14 @@ spec:
resolvedSubscriptionLabel:
description: The resolved name.namespace of the subscription
type: string
subscriptionInterventionTime:
description: |-
Timestamp for a possible intervention to help a Subscription stuck with a
ConstraintsNotSatisfiable condition. Can be in the future, indicating the
policy is waiting for OLM to resolve the situation. If in the recent past,
the policy may update the status of the Subscription.
format: date-time
type: string
type: object
type: object
served: true
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -286,6 +286,14 @@ spec:
resolvedSubscriptionLabel:
description: The resolved name.namespace of the subscription
type: string
subscriptionInterventionTime:
description: |-
Timestamp for a possible intervention to help a Subscription stuck with a
ConstraintsNotSatisfiable condition. Can be in the future, indicating the
policy is waiting for OLM to resolve the situation. If in the recent past,
the policy may update the status of the Subscription.
format: date-time
type: string
type: object
type: object
served: true
Expand Down
Loading

0 comments on commit ce197ba

Please sign in to comment.