Skip to content

Commit

Permalink
Add metric for user and system errors
Browse files Browse the repository at this point in the history
This PR creates new metrics for user and system errors and sends them when a config policy is invalid (no spec or remediationAction) [user], when the object template CRD cannot be found [user] and when the API call to update status fails [system]. It also adds the check for config policies not having a spec, which didn't exist before and caused some weird undefined behavior in policies without specs.

- ref: https://issues.redhat.com/browse/ACM-2216?filter=-1

Signed-off-by: Will Kutler <wkutler@redhat.com>
  • Loading branch information
JustinKuli authored and willkutler committed Dec 6, 2022
1 parent a1554d5 commit 49723a1
Show file tree
Hide file tree
Showing 9 changed files with 205 additions and 26 deletions.
2 changes: 1 addition & 1 deletion api/v1/configurationpolicy_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ type ConfigurationPolicy struct {
metav1.TypeMeta `json:",inline"`
metav1.ObjectMeta `json:"metadata,omitempty"`

Spec ConfigurationPolicySpec `json:"spec,omitempty"`
Spec *ConfigurationPolicySpec `json:"spec,omitempty"`
Status ConfigurationPolicyStatus `json:"status,omitempty"`
}

Expand Down
35 changes: 33 additions & 2 deletions api/v1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

71 changes: 53 additions & 18 deletions controllers/configurationpolicy_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,8 @@ func (r *ConfigurationPolicyReconciler) Reconcile(ctx context.Context, request c
_ = compareObjSecondsCounter.DeletePartialMatch(prometheus.Labels{"config_policy_name": request.Name})
_ = policyRelatedObjectGauge.DeletePartialMatch(
prometheus.Labels{"policy": fmt.Sprintf("%s/%s", request.Namespace, request.Name)})
_ = policyUserErrorsCounter.DeletePartialMatch(prometheus.Labels{"template": request.Name})
_ = policySystemErrorsCounter.DeletePartialMatch(prometheus.Labels{"template": request.Name})
}

return reconcile.Result{}, nil
Expand Down Expand Up @@ -313,9 +315,9 @@ func shouldEvaluatePolicy(policy *policyv1.ConfigurationPolicy) bool {

var interval time.Duration

if policy.Status.ComplianceState == policyv1.Compliant {
if policy.Status.ComplianceState == policyv1.Compliant && policy.Spec != nil {
interval, err = policy.Spec.EvaluationInterval.GetCompliantInterval()
} else if policy.Status.ComplianceState == policyv1.NonCompliant {
} else if policy.Status.ComplianceState == policyv1.NonCompliant && policy.Spec != nil {
interval, err = policy.Spec.EvaluationInterval.GetNonCompliantInterval()
} else {
log.V(2).Info("The policy has an unknown compliance. Will evaluate it now.")
Expand Down Expand Up @@ -563,11 +565,18 @@ func (r *ConfigurationPolicyReconciler) handleObjectTemplates(plc policyv1.Confi
relatedObjects := []policyv1.RelatedObject{}
parentStatusUpdateNeeded := false

// error if no remediationAction is specified
if plc.Spec.RemediationAction == "" {
message := "Policy does not have a RemediationAction specified"
validationErr := ""
if plc.Spec == nil {
validationErr = "Policy does not have a Spec specified"
} else if plc.Spec.RemediationAction == "" {
validationErr = "Policy does not have a RemediationAction specified"
}

// error if no spec or remediationAction is specified
if validationErr != "" {
message := validationErr
log.Info(message)
statusChanged := addConditionToStatus(&plc, 0, false, "No RemediationAction", message)
statusChanged := addConditionToStatus(&plc, 0, false, "Invalid spec", message)

if statusChanged {
r.Recorder.Event(&plc, eventWarning,
Expand All @@ -576,6 +585,13 @@ func (r *ConfigurationPolicyReconciler) handleObjectTemplates(plc policyv1.Confi

r.checkRelatedAndUpdate(plc, relatedObjects, oldRelated, statusChanged)

parent := ""
if len(plc.OwnerReferences) > 0 {
parent = plc.OwnerReferences[0].Name
}

policyUserErrorsCounter.WithLabelValues(parent, plc.GetName(), "invalid-template").Add(1)

return
}

Expand Down Expand Up @@ -1051,11 +1067,11 @@ func addConditionToStatus(

log := log.WithValues("policy", plc.GetName(), "complianceState", complianceState)

if compliant && plc.Spec.EvaluationInterval.Compliant == "never" {
if compliant && plc.Spec != nil && plc.Spec.EvaluationInterval.Compliant == "never" {
msg := `This policy will not be evaluated again due to spec.evaluationInterval.compliant being set to "never"`
log.Info(msg)
cond.Message += fmt.Sprintf(". %s.", msg)
} else if !compliant && plc.Spec.EvaluationInterval.NonCompliant == "never" {
} else if !compliant && plc.Spec != nil && plc.Spec.EvaluationInterval.NonCompliant == "never" {
msg := "This policy will not be evaluated again due to spec.evaluationInterval.noncompliant " +
`being set to "never"`
log.Info(msg)
Expand Down Expand Up @@ -1616,7 +1632,15 @@ func (r *ConfigurationPolicyReconciler) getMapping(
kind := afterPrefix[0:(strings.Index(afterPrefix, "\" "))]
mappingErrMsg = "couldn't find mapping resource with kind " + kind +
", please check if you have CRD deployed"

log.Error(err, "Could not map resource, do you have the CRD deployed?", "kind", kind)

parent := ""
if len(policy.OwnerReferences) > 0 {
parent = policy.OwnerReferences[0].Name
}

policyUserErrorsCounter.WithLabelValues(parent, policy.GetName(), "no-object-CRD").Add(1)
}

errMsg := err.Error()
Expand Down Expand Up @@ -2392,12 +2416,16 @@ func IsSimilarToLastCondition(oldCond policyv1.Condition, newCond policyv1.Condi
func (r *ConfigurationPolicyReconciler) addForUpdate(policy *policyv1.ConfigurationPolicy, sendEvent bool) {
compliant := true

for index := range policy.Spec.ObjectTemplates {
if index < len(policy.Status.CompliancyDetails) {
if policy.Status.CompliancyDetails[index].ComplianceState == policyv1.NonCompliant {
compliant = false
if policy.Spec == nil {
compliant = false
} else {
for index := range policy.Spec.ObjectTemplates {
if index < len(policy.Status.CompliancyDetails) {
if policy.Status.CompliancyDetails[index].ComplianceState == policyv1.NonCompliant {
compliant = false

break
break
}
}
}
}
Expand All @@ -2407,7 +2435,7 @@ func (r *ConfigurationPolicyReconciler) addForUpdate(policy *policyv1.Configurat
if policy.ObjectMeta.DeletionTimestamp != nil {
policy.Status.ComplianceState = policyv1.Terminating
} else if len(policy.Status.CompliancyDetails) == 0 {
policy.Status.ComplianceState = "Undetermined"
policy.Status.ComplianceState = policyv1.UnknownCompliancy
} else if compliant {
policy.Status.ComplianceState = policyv1.Compliant
} else {
Expand All @@ -2434,6 +2462,13 @@ func (r *ConfigurationPolicyReconciler) addForUpdate(policy *policyv1.Configurat
policyLog.Error(err, "Tried to re-update status before previous update could be applied, retrying next loop")
} else if err != nil {
policyLog.Error(err, "Could not update status, retrying next loop")

parent := ""
if len(policy.OwnerReferences) > 0 {
parent = policy.OwnerReferences[0].Name
}

policySystemErrorsCounter.WithLabelValues(parent, policy.GetName(), "status-update-failed").Add(1)
}
}

Expand All @@ -2443,7 +2478,7 @@ func (r *ConfigurationPolicyReconciler) updatePolicyStatus(
policy *policyv1.ConfigurationPolicy,
sendEvent bool,
) (*policyv1.ConfigurationPolicy, error) {
if policy.Status.ComplianceState != "Undetermined" && sendEvent {
if sendEvent {
log.V(1).Info("Sending parent policy compliance event")

// If the compliance event can't be created, then don't update the ConfigurationPolicy
Expand Down Expand Up @@ -2516,7 +2551,7 @@ func (r *ConfigurationPolicyReconciler) sendComplianceEvent(instance *policyv1.C
ReportingInstance: r.InstanceName,
}

if instance.Status.ComplianceState == policyv1.NonCompliant {
if instance.Status.ComplianceState != policyv1.Compliant {
event.Type = "Warning"
}

Expand All @@ -2525,8 +2560,8 @@ func (r *ConfigurationPolicyReconciler) sendComplianceEvent(instance *policyv1.C

// convertPolicyStatusToString to be able to pass the status as event
func convertPolicyStatusToString(plc *policyv1.ConfigurationPolicy) (results string) {
if plc.Status.ComplianceState == "" {
return "ComplianceState is still undetermined"
if plc.Status.ComplianceState == "" || plc.Status.ComplianceState == policyv1.UnknownCompliancy {
return "ComplianceState is still unknown"
}

result := string(plc.Status.ComplianceState)
Expand Down
2 changes: 1 addition & 1 deletion controllers/configurationpolicy_controller_suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ func getSamplePolicy() policyv1.ConfigurationPolicy {
Name: "foo",
Namespace: "default",
},
Spec: policyv1.ConfigurationPolicySpec{
Spec: &policyv1.ConfigurationPolicySpec{
Severity: "low",
NamespaceSelector: policyv1.Target{
Include: []policyv1.NonEmptyString{"default", "kube-*"},
Expand Down
7 changes: 4 additions & 3 deletions controllers/configurationpolicy_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ func TestReconcile(t *testing.T) {
Name: "foo",
Namespace: "default",
},
Spec: policyv1.ConfigurationPolicySpec{
Spec: &policyv1.ConfigurationPolicySpec{
Severity: "low",
NamespaceSelector: policyv1.Target{
Include: []policyv1.NonEmptyString{"default", "kube-*"},
Expand Down Expand Up @@ -386,7 +386,7 @@ func TestSortRelatedObjectsAndUpdate(t *testing.T) {
Name: "foo",
Namespace: "default",
},
Spec: policyv1.ConfigurationPolicySpec{
Spec: &policyv1.ConfigurationPolicySpec{
Severity: "low",
NamespaceSelector: policyv1.Target{
Include: []policyv1.NonEmptyString{"default", "kube-*"},
Expand Down Expand Up @@ -440,7 +440,7 @@ func TestCreateInformStatus(t *testing.T) {
Name: "foo",
Namespace: "default",
},
Spec: policyv1.ConfigurationPolicySpec{
Spec: &policyv1.ConfigurationPolicySpec{
Severity: "low",
NamespaceSelector: policyv1.Target{
Include: []policyv1.NonEmptyString{"test1", "test2"},
Expand Down Expand Up @@ -536,6 +536,7 @@ func TestShouldEvaluatePolicy(t *testing.T) {
Namespace: "managed",
Generation: 2,
},
Spec: &policyv1.ConfigurationPolicySpec{},
}

// Add a 60 second buffer to avoid race conditions
Expand Down
2 changes: 1 addition & 1 deletion controllers/configurationpolicy_utils_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ func TestAddConditionToStatusNeverEvalInterval(t *testing.T) {
t.Parallel()

policy := &policyv1.ConfigurationPolicy{
Spec: policyv1.ConfigurationPolicySpec{
Spec: &policyv1.ConfigurationPolicySpec{
EvaluationInterval: test.evaluationInterval,
},
}
Expand Down
35 changes: 35 additions & 0 deletions controllers/metric.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package controllers

import (
"errors"
"fmt"
"sync"

Expand Down Expand Up @@ -64,6 +65,28 @@ var (
"policy",
},
)
policyUserErrorsCounter = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "policy_user_errors_total",
Help: "The number of user errors encountered while processing policies",
},
[]string{
"policy",
"template",
"type",
},
)
policySystemErrorsCounter = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "policy_system_errors_total",
Help: "The number of system errors encountered while processing policies",
},
[]string{
"policy",
"template",
"type",
},
)
)

func init() {
Expand All @@ -74,6 +97,18 @@ func init() {
metrics.Registry.MustRegister(compareObjSecondsCounter)
metrics.Registry.MustRegister(compareObjEvalCounter)
metrics.Registry.MustRegister(policyRelatedObjectGauge)
// Error metrics may already be registered by template sync
alreadyReg := &prometheus.AlreadyRegisteredError{}

regErr := metrics.Registry.Register(policySystemErrorsCounter)
if regErr != nil && !errors.As(regErr, alreadyReg) {
panic(regErr)
}

regErr = metrics.Registry.Register(policyUserErrorsCounter)
if regErr != nil && !errors.As(regErr, alreadyReg) {
panic(regErr)
}
}

// updateRelatedObjectMetric iterates through the collected related object map, deletes any metrics
Expand Down
Loading

0 comments on commit 49723a1

Please sign in to comment.