Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bug 1995913: Degraded status in the OCM controller #486

Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 16 additions & 5 deletions pkg/controller/operator.go
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,11 @@ func (s *Operator) Run(ctx context.Context, controller *controllercmd.Controller
statusReporter.AddSources(reportGatherer)
go reportGatherer.Run(ctx)

runOCMController(ctx, configClient, kubeClient, configObserver, insightsClient)
ocmController := initiateOCMController(ctx, gatherKubeConfig, kubeClient, configObserver, insightsClient)
if ocmController != nil {
statusReporter.AddSources(ocmController)
go ocmController.Run()
}
klog.Warning("started")

<-ctx.Done()
Expand Down Expand Up @@ -205,20 +209,27 @@ func isRunning(ctx context.Context, kubeConfig *rest.Config) wait.ConditionFunc
}
}

// runOCMController checks the "InsightsOperatorPullingSCA" feature and if it's enabled then run the OCM controller
func runOCMController(ctx context.Context, configClient *configv1client.ConfigV1Client,
kubeClient *kubernetes.Clientset, configObserver *configobserver.Controller, insightsClient *insightsclient.Client) {
// initiateOCMController checks the "InsightsOperatorPullingSCA" feature and if it's enabled then create and retun the OCM controller
func initiateOCMController(ctx context.Context, kubeConfig *rest.Config,
kubeClient *kubernetes.Clientset, configObserver *configobserver.Controller, insightsClient *insightsclient.Client) *ocm.Controller {
configClient, err := configv1client.NewForConfig(kubeConfig)
if err != nil {
klog.Error(err)
return nil
}
ocmEnabled, err := featureEnabled(ctx, configClient, "InsightsOperatorPullingSCA")
if err != nil {
klog.Errorf("Pulling of SCA certs from the OCM is disabled. Unable to get cluster FeatureGate: %v", err)
return nil
}
if ocmEnabled {
klog.Info("Pulling of SCA certs from the OCM is enabled.")
// OMC controller periodically checks and pull data from the OCM API
// the data is exposed in the OpenShift API
ocmController := ocm.New(ctx, kubeClient.CoreV1(), configObserver, insightsClient)
go ocmController.Run()
return ocmController
}
return nil
}

// featureEnabled checks if the feature is enabled in the "cluster" FeatureGate
Expand Down
27 changes: 26 additions & 1 deletion pkg/controller/status/status.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@ const (
uploadFailuresCountThreshold = 5
// GatherFailuresCountThreshold defines how many gatherings can fail in a row before we report Degraded
GatherFailuresCountThreshold = 5
// OCMAPIFailureCountThreshold defines how many unsuccessful responses from the OCM API in a row is tolerated
// before the operator is marked as Degraded
OCMAPIFailureCountThreshold = 5
)

type Reported struct {
Expand Down Expand Up @@ -141,7 +144,8 @@ func (c *Controller) merge(existing *configv1.ClusterOperator) *configv1.Cluster
var last time.Time
var reason string
var errs []string
var uploadErrorReason, uploadErrorMessage, disabledReason, disabledMessage, downloadReason, downloadMessage string
var uploadErrorReason, uploadErrorMessage, disabledReason, disabledMessage, downloadReason, downloadMessage,
ocmErrorReason, ocmErrorMsg string
allReady := true
for i, source := range c.Sources() {
summary, ready := source.CurrentStatus()
Expand Down Expand Up @@ -175,6 +179,12 @@ func (c *Controller) merge(existing *configv1.ClusterOperator) *configv1.Cluster
klog.V(4).Info("Failed to download Insights report")
downloadReason = summary.Reason
downloadMessage = summary.Message
} else if summary.Operation == controllerstatus.PullingSCACerts {
klog.V(4).Infof("Failed to download the SCA certs within the threshold %d with exponential backoff. Marking as degraded.",
OCMAPIFailureCountThreshold)
degradingFailure = true
ocmErrorMsg = summary.Message
ocmErrorReason = summary.Reason
}

if degradingFailure {
Expand Down Expand Up @@ -300,6 +310,21 @@ func (c *Controller) merge(existing *configv1.ClusterOperator) *configv1.Cluster
} else {
removeOperatorStatusCondition(&existing.Status.Conditions, InsightsDownloadDegraded)
}
if len(ocmErrorMsg) > 0 {
setOperatorStatusCondition(&existing.Status.Conditions, configv1.ClusterOperatorStatusCondition{
Type: configv1.OperatorDegraded,
Status: configv1.ConditionTrue,
LastTransitionTime: metav1.Time{Time: last},
Reason: ocmErrorReason,
Message: ocmErrorMsg,
})
} else {
setOperatorStatusCondition(&existing.Status.Conditions, configv1.ClusterOperatorStatusCondition{
Type: configv1.OperatorDegraded,
Status: configv1.ConditionFalse,
Reason: "AsExpected",
})
}
tremes marked this conversation as resolved.
Show resolved Hide resolved
}

// once the operator is running it is always considered available
Expand Down
2 changes: 2 additions & 0 deletions pkg/controllerstatus/controllerstatus.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ const (
Uploading Operation = "Uploading"
// GatheringReport specific for gathering the report from the cluster
GatheringReport Operation = "GatheringReport"
// PullingSCACerts is specific operation for pulling the SCA certs data from the OCM API
PullingSCACerts Operation = "PullingSCACerts"
)

// Summary represents the status summary of an Operation
Expand Down
19 changes: 11 additions & 8 deletions pkg/insights/insightsclient/insightsclient.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,19 +63,19 @@ type Source struct {
Contents io.Reader
}

// InsightsError is helper error type to have HTTP error status code
type InsightsError struct {
// HttpError is helper error type to have HTTP error status code
type HttpError struct {
Err error
StatusCode int
}

func (e InsightsError) Error() string {
func (e HttpError) Error() string {
return e.Err.Error()
}

func IsInsightsError(err error) bool {
func IsHttpError(err error) bool {
switch err.(type) {
case InsightsError:
case HttpError:
return true
default:
return false
Expand Down Expand Up @@ -342,7 +342,7 @@ func (c Client) RecvReport(ctx context.Context, endpoint string) (*io.ReadCloser
if len(body) > 1024 {
body = body[:1024]
}
notFoundErr := InsightsError{
notFoundErr := HttpError{
StatusCode: resp.StatusCode,
Err: fmt.Errorf("not found: %s (request=%s): %s", resp.Request.URL, requestID, string(body)),
}
Expand Down Expand Up @@ -391,7 +391,7 @@ func (c Client) RecvSCACerts(ctx context.Context, endpoint string) ([]byte, erro
return nil, fmt.Errorf("unable to retrieve SCA certs data from %s: %v", endpoint, err)
}

if res.StatusCode >= 300 || res.StatusCode < 200 {
if res.StatusCode > 399 || res.StatusCode < 200 {
return nil, ocmErrorMessage(res.Request.URL, res)
}

Expand All @@ -416,7 +416,10 @@ func ocmErrorMessage(url *url.URL, r *http.Response) error {
if r.StatusCode == http.StatusUnauthorized || r.StatusCode == http.StatusForbidden {
return authorizer.Error{Err: err}
}
return err
return HttpError{
Err: err,
StatusCode: r.StatusCode,
}
}

var (
Expand Down
4 changes: 2 additions & 2 deletions pkg/insights/insightsreport/insightsreport.go
Original file line number Diff line number Diff line change
Expand Up @@ -93,9 +93,9 @@ func (c *Controller) PullSmartProxy() (bool, error) {
} else if err == insightsclient.ErrWaitingForVersion {
klog.Error(err)
return false, err
} else if insightsclient.IsInsightsError(err) {
} else if insightsclient.IsHttpError(err) {

ie := err.(insightsclient.InsightsError)
ie := err.(insightsclient.HttpError)
klog.Errorf("Unexpected error retrieving the report: %s", ie)
// if there's a 404 response then retry
if ie.StatusCode == http.StatusNotFound {
Expand Down
70 changes: 68 additions & 2 deletions pkg/ocm/ocm.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,18 @@ package ocm
import (
"context"
"encoding/json"
"fmt"
"net/http"
"time"

"github.com/openshift/insights-operator/pkg/config"
"github.com/openshift/insights-operator/pkg/controller/status"
"github.com/openshift/insights-operator/pkg/controllerstatus"
"github.com/openshift/insights-operator/pkg/insights/insightsclient"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/wait"
corev1client "k8s.io/client-go/kubernetes/typed/core/v1"
"k8s.io/klog/v2"
)
Expand All @@ -21,6 +26,7 @@ const (

// Controller holds all the required resources to be able to communicate with OCM API
type Controller struct {
controllerstatus.Simple
coreClient corev1client.CoreV1Interface
ctx context.Context
configurator Configurator
Expand All @@ -45,6 +51,7 @@ type ScaResponse struct {
func New(ctx context.Context, coreClient corev1client.CoreV1Interface, configurator Configurator,
insightsClient *insightsclient.Client) *Controller {
return &Controller{
Simple: controllerstatus.Simple{Name: "ocmcontroller"},
coreClient: coreClient,
ctx: ctx,
configurator: configurator,
Expand Down Expand Up @@ -79,9 +86,25 @@ func (c *Controller) Run() {
}

func (c *Controller) requestDataAndCheckSecret(endpoint string) {
data, err := c.client.RecvSCACerts(c.ctx, endpoint)
data, err := c.requestSCAWithExpBackoff(endpoint)
if err != nil {
klog.Errorf("Failed to retrieve data: %v", err)
// in case of any error other than 404 mark the operator as degraded
c.Simple.UpdateStatus(controllerstatus.Summary{
Operation: controllerstatus.PullingSCACerts,
Reason: "FailedToPullSCACerts",
Message: fmt.Sprintf("Failed to pull SCA certs from %s: %v", endpoint, err),
})
return
}
// handle the case with HTTP 404
if len(data) == 0 {
msg := fmt.Sprintf("Received no SCA certs from the %s. Please check if it's enabled for your organization.", endpoint)
klog.Info(msg)
c.Simple.UpdateStatus(controllerstatus.Summary{
Operation: controllerstatus.PullingSCACerts,
Message: msg,
Healthy: true,
})
return
}
var ocmRes ScaResponse
Expand All @@ -98,6 +121,11 @@ func (c *Controller) requestDataAndCheckSecret(endpoint string) {
return
}
klog.Infof("%s secret successfully updated", secretName)
c.Simple.UpdateStatus(controllerstatus.Summary{
Operation: controllerstatus.PullingSCACerts,
Message: fmt.Sprintf("SCA certs successfully updated in the %s secret", secretName),
Healthy: true,
})
}

// checkSecret checks "etc-pki-entitlement" secret in the "openshift-config-managed" namespace.
Expand Down Expand Up @@ -156,3 +184,41 @@ func (c *Controller) updateSecret(s *v1.Secret, ocmData *ScaResponse) (*v1.Secre
}
return s, nil
}

// requestSCAWithExpBackoff queries OCM API with exponential backoff and returns
// an error only in case of an HTTP error other than 404 received from the OCM API.
// Data return value still can be an empty array in case of HTTP 404 error.
func (c *Controller) requestSCAWithExpBackoff(endpoint string) ([]byte, error) {
bo := wait.Backoff{
Duration: 10 * time.Minute,
Factor: 2,
Jitter: 0,
Steps: status.OCMAPIFailureCountThreshold,
Cap: c.configurator.Config().OCMConfig.Interval,
}
var data []byte
err := wait.ExponentialBackoff(bo, func() (bool, error) {
var err error
data, err = c.client.RecvSCACerts(c.ctx, endpoint)
if err != nil {
// don't try again in case it's not an HTTP error - it could mean we're in disconnected env
if !insightsclient.IsHttpError(err) {
klog.Errorf("Failed to request the SCA certs: %v", err)
return true, nil
}
httpErr := err.(insightsclient.HttpError)
// don't try again in case of 404
if httpErr.StatusCode == http.StatusNotFound {
return true, nil
}
klog.Errorf("%v. Trying again in %s", httpErr, bo.Step())
return false, nil
}
return true, nil
})
// exp. backoff timeouted -> error
if err != nil {
return nil, fmt.Errorf("timed out waiting for the successful response from %s", endpoint)
}
return data, nil
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This method is little bit complicated (especially due the fact that 404 and offline env. are not considered as errors from the "degraded" perspective) compared to counting the threshold in the controllerstatus and the checking it in the status.go (similarly as here)

The approach with exp. backoff is more "agressive":

  • if HTTP error other than 404 then try again in 5 min (if this is still true then try again in 10 min, 20min, 40min => mark as degraded)

Compared to the other approach:

  • if HTTP error other than 404 then only increase the count and do nothing
  • try again in default period (which is 8h in this case) if there's still an error then increase the count again
  • if count higher than some threshold (let's say 2 or 3) then mark IO as degraded

WIth the first approach we can go degraded in ~1h15min, with the second one it would be after 16h-24h. I am not sure what is better TBH. The certs are valid for a long time (1 year by default), but on the other hand, if the content changes then you may want to update the certs quickly.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I updated the initial backoff step to c.configurator.Config().OCMConfig.Interval / 32 (that is 15 minutes by default) so the times are bit longer than mentioned above.

}