Skip to content

Commit

Permalink
Bug 1995913: Degraded status in the OCM controller (#486)
Browse files Browse the repository at this point in the history
* Mark as degraded when the pulling of SCA certs failed

* Degraded with exponential backoff

* Update the error cases and add some logging

* Update the backoff a little

* Fix

* Remove unneccesary block
  • Loading branch information
tremes authored Aug 31, 2021
1 parent db300fd commit d97e3c8
Show file tree
Hide file tree
Showing 6 changed files with 106 additions and 17 deletions.
21 changes: 16 additions & 5 deletions pkg/controller/operator.go
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,11 @@ func (s *Operator) Run(ctx context.Context, controller *controllercmd.Controller
statusReporter.AddSources(reportGatherer)
go reportGatherer.Run(ctx)

runOCMController(ctx, configClient, kubeClient, configObserver, insightsClient)
ocmController := initiateOCMController(ctx, gatherKubeConfig, kubeClient, configObserver, insightsClient)
if ocmController != nil {
statusReporter.AddSources(ocmController)
go ocmController.Run()
}
klog.Warning("started")

<-ctx.Done()
Expand Down Expand Up @@ -205,20 +209,27 @@ func isRunning(ctx context.Context, kubeConfig *rest.Config) wait.ConditionFunc
}
}

// runOCMController checks the "InsightsOperatorPullingSCA" feature and if it's enabled then run the OCM controller
func runOCMController(ctx context.Context, configClient *configv1client.ConfigV1Client,
kubeClient *kubernetes.Clientset, configObserver *configobserver.Controller, insightsClient *insightsclient.Client) {
// initiateOCMController checks the "InsightsOperatorPullingSCA" feature and if it's enabled then create and retun the OCM controller
func initiateOCMController(ctx context.Context, kubeConfig *rest.Config,
kubeClient *kubernetes.Clientset, configObserver *configobserver.Controller, insightsClient *insightsclient.Client) *ocm.Controller {
configClient, err := configv1client.NewForConfig(kubeConfig)
if err != nil {
klog.Error(err)
return nil
}
ocmEnabled, err := featureEnabled(ctx, configClient, "InsightsOperatorPullingSCA")
if err != nil {
klog.Errorf("Pulling of SCA certs from the OCM is disabled. Unable to get cluster FeatureGate: %v", err)
return nil
}
if ocmEnabled {
klog.Info("Pulling of SCA certs from the OCM is enabled.")
// OMC controller periodically checks and pull data from the OCM API
// the data is exposed in the OpenShift API
ocmController := ocm.New(ctx, kubeClient.CoreV1(), configObserver, insightsClient)
go ocmController.Run()
return ocmController
}
return nil
}

// featureEnabled checks if the feature is enabled in the "cluster" FeatureGate
Expand Down
7 changes: 7 additions & 0 deletions pkg/controller/status/status.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@ const (
uploadFailuresCountThreshold = 5
// GatherFailuresCountThreshold defines how many gatherings can fail in a row before we report Degraded
GatherFailuresCountThreshold = 5
// OCMAPIFailureCountThreshold defines how many unsuccessful responses from the OCM API in a row is tolerated
// before the operator is marked as Degraded
OCMAPIFailureCountThreshold = 5
)

type Reported struct {
Expand Down Expand Up @@ -175,6 +178,10 @@ func (c *Controller) merge(existing *configv1.ClusterOperator) *configv1.Cluster
klog.V(4).Info("Failed to download Insights report")
downloadReason = summary.Reason
downloadMessage = summary.Message
} else if summary.Operation == controllerstatus.PullingSCACerts {
klog.V(4).Infof("Failed to download the SCA certs within the threshold %d with exponential backoff. Marking as degraded.",
OCMAPIFailureCountThreshold)
degradingFailure = true
}

if degradingFailure {
Expand Down
2 changes: 2 additions & 0 deletions pkg/controllerstatus/controllerstatus.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ const (
Uploading Operation = "Uploading"
// GatheringReport specific for gathering the report from the cluster
GatheringReport Operation = "GatheringReport"
// PullingSCACerts is specific operation for pulling the SCA certs data from the OCM API
PullingSCACerts Operation = "PullingSCACerts"
)

// Summary represents the status summary of an Operation
Expand Down
19 changes: 11 additions & 8 deletions pkg/insights/insightsclient/insightsclient.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,19 +63,19 @@ type Source struct {
Contents io.Reader
}

// InsightsError is helper error type to have HTTP error status code
type InsightsError struct {
// HttpError is helper error type to have HTTP error status code
type HttpError struct {
Err error
StatusCode int
}

func (e InsightsError) Error() string {
func (e HttpError) Error() string {
return e.Err.Error()
}

func IsInsightsError(err error) bool {
func IsHttpError(err error) bool {
switch err.(type) {
case InsightsError:
case HttpError:
return true
default:
return false
Expand Down Expand Up @@ -342,7 +342,7 @@ func (c Client) RecvReport(ctx context.Context, endpoint string) (*io.ReadCloser
if len(body) > 1024 {
body = body[:1024]
}
notFoundErr := InsightsError{
notFoundErr := HttpError{
StatusCode: resp.StatusCode,
Err: fmt.Errorf("not found: %s (request=%s): %s", resp.Request.URL, requestID, string(body)),
}
Expand Down Expand Up @@ -391,7 +391,7 @@ func (c Client) RecvSCACerts(ctx context.Context, endpoint string) ([]byte, erro
return nil, fmt.Errorf("unable to retrieve SCA certs data from %s: %v", endpoint, err)
}

if res.StatusCode >= 300 || res.StatusCode < 200 {
if res.StatusCode > 399 || res.StatusCode < 200 {
return nil, ocmErrorMessage(res.Request.URL, res)
}

Expand All @@ -416,7 +416,10 @@ func ocmErrorMessage(url *url.URL, r *http.Response) error {
if r.StatusCode == http.StatusUnauthorized || r.StatusCode == http.StatusForbidden {
return authorizer.Error{Err: err}
}
return err
return HttpError{
Err: err,
StatusCode: r.StatusCode,
}
}

var (
Expand Down
4 changes: 2 additions & 2 deletions pkg/insights/insightsreport/insightsreport.go
Original file line number Diff line number Diff line change
Expand Up @@ -93,9 +93,9 @@ func (c *Controller) PullSmartProxy() (bool, error) {
} else if err == insightsclient.ErrWaitingForVersion {
klog.Error(err)
return false, err
} else if insightsclient.IsInsightsError(err) {
} else if insightsclient.IsHttpError(err) {

ie := err.(insightsclient.InsightsError)
ie := err.(insightsclient.HttpError)
klog.Errorf("Unexpected error retrieving the report: %s", ie)
// if there's a 404 response then retry
if ie.StatusCode == http.StatusNotFound {
Expand Down
70 changes: 68 additions & 2 deletions pkg/ocm/ocm.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,18 @@ package ocm
import (
"context"
"encoding/json"
"fmt"
"net/http"
"time"

"github.com/openshift/insights-operator/pkg/config"
"github.com/openshift/insights-operator/pkg/controller/status"
"github.com/openshift/insights-operator/pkg/controllerstatus"
"github.com/openshift/insights-operator/pkg/insights/insightsclient"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/wait"
corev1client "k8s.io/client-go/kubernetes/typed/core/v1"
"k8s.io/klog/v2"
)
Expand All @@ -21,6 +26,7 @@ const (

// Controller holds all the required resources to be able to communicate with OCM API
type Controller struct {
controllerstatus.Simple
coreClient corev1client.CoreV1Interface
ctx context.Context
configurator Configurator
Expand All @@ -45,6 +51,7 @@ type ScaResponse struct {
func New(ctx context.Context, coreClient corev1client.CoreV1Interface, configurator Configurator,
insightsClient *insightsclient.Client) *Controller {
return &Controller{
Simple: controllerstatus.Simple{Name: "ocmcontroller"},
coreClient: coreClient,
ctx: ctx,
configurator: configurator,
Expand Down Expand Up @@ -79,9 +86,25 @@ func (c *Controller) Run() {
}

func (c *Controller) requestDataAndCheckSecret(endpoint string) {
data, err := c.client.RecvSCACerts(c.ctx, endpoint)
data, err := c.requestSCAWithExpBackoff(endpoint)
if err != nil {
klog.Errorf("Failed to retrieve data: %v", err)
// in case of any error other than 404 mark the operator as degraded
c.Simple.UpdateStatus(controllerstatus.Summary{
Operation: controllerstatus.PullingSCACerts,
Reason: "FailedToPullSCACerts",
Message: fmt.Sprintf("Failed to pull SCA certs from %s: %v", endpoint, err),
})
return
}
// handle the case with HTTP 404
if len(data) == 0 {
msg := fmt.Sprintf("Received no SCA certs from the %s. Please check if it's enabled for your organization.", endpoint)
klog.Info(msg)
c.Simple.UpdateStatus(controllerstatus.Summary{
Operation: controllerstatus.PullingSCACerts,
Message: msg,
Healthy: true,
})
return
}
var ocmRes ScaResponse
Expand All @@ -98,6 +121,11 @@ func (c *Controller) requestDataAndCheckSecret(endpoint string) {
return
}
klog.Infof("%s secret successfully updated", secretName)
c.Simple.UpdateStatus(controllerstatus.Summary{
Operation: controllerstatus.PullingSCACerts,
Message: fmt.Sprintf("SCA certs successfully updated in the %s secret", secretName),
Healthy: true,
})
}

// checkSecret checks "etc-pki-entitlement" secret in the "openshift-config-managed" namespace.
Expand Down Expand Up @@ -156,3 +184,41 @@ func (c *Controller) updateSecret(s *v1.Secret, ocmData *ScaResponse) (*v1.Secre
}
return s, nil
}

// requestSCAWithExpBackoff queries OCM API with exponential backoff and returns
// an error only in case of an HTTP error other than 404 received from the OCM API.
// Data return value still can be an empty array in case of HTTP 404 error.
func (c *Controller) requestSCAWithExpBackoff(endpoint string) ([]byte, error) {
bo := wait.Backoff{
Duration: c.configurator.Config().OCMConfig.Interval / 32, // 15 min by default
Factor: 2,
Jitter: 0,
Steps: status.OCMAPIFailureCountThreshold,
Cap: c.configurator.Config().OCMConfig.Interval,
}
var data []byte
err := wait.ExponentialBackoff(bo, func() (bool, error) {
var err error
data, err = c.client.RecvSCACerts(c.ctx, endpoint)
if err != nil {
// don't try again in case it's not an HTTP error - it could mean we're in disconnected env
if !insightsclient.IsHttpError(err) {
klog.Errorf("Failed to request the SCA certs: %v", err)
return true, nil
}
httpErr := err.(insightsclient.HttpError)
// don't try again in case of 404
if httpErr.StatusCode == http.StatusNotFound {
return true, nil
}
klog.Errorf("%v. Trying again in %s", httpErr, bo.Step())
return false, nil
}
return true, nil
})
// exp. backoff timeouted -> error
if err != nil {
return nil, fmt.Errorf("timed out waiting for the successful response from %s", endpoint)
}
return data, nil
}

0 comments on commit d97e3c8

Please sign in to comment.