diff --git a/controllers/configurationpolicy_controller.go b/controllers/configurationpolicy_controller.go index a2e98308..287f981e 100644 --- a/controllers/configurationpolicy_controller.go +++ b/controllers/configurationpolicy_controller.go @@ -106,6 +106,8 @@ type ConfigurationPolicyReconciler struct { // where the controller is running. TargetK8sClient kubernetes.Interface TargetK8sConfig *rest.Config + // Whether custom metrics collection is enabled + EnableMetrics bool discoveryInfo // This is used to fetch and parse OpenAPI documents to perform client-side validation of object definitions. openAPIParser *openapi.CachedOpenAPIParser @@ -127,6 +129,8 @@ func (r *ConfigurationPolicyReconciler) Reconcile(ctx context.Context, request c _ = policyEvalCounter.DeleteLabelValues(request.Name) _ = compareObjEvalCounter.DeletePartialMatch(prometheus.Labels{"config_policy_name": request.Name}) _ = compareObjSecondsCounter.DeletePartialMatch(prometheus.Labels{"config_policy_name": request.Name}) + _ = policyRelatedObjectGauge.DeletePartialMatch( + prometheus.Labels{"policy": fmt.Sprintf("%s/%s", request.Namespace, request.Name)}) } return reconcile.Result{}, nil @@ -180,6 +184,9 @@ func (r *ConfigurationPolicyReconciler) PeriodicallyExecConfigPolicies(freq uint if !skipLoop { log.Info("Processing the policies", "count", len(policiesList.Items)) + // Initialize the related object map + policyRelatedObjectMap = map[string][]string{} + for i := 0; i < int(r.EvaluationConcurrency); i++ { wg.Add(1) @@ -200,6 +207,11 @@ func (r *ConfigurationPolicyReconciler) PeriodicallyExecConfigPolicies(freq uint close(policyQueue) wg.Wait() + // Update the related object metric after policy processing + if r.EnableMetrics { + updateRelatedObjectMetric() + } + // Update the evaluation histogram with the elapsed time elapsed := time.Since(start).Seconds() evalLoopHistogram.Observe(elapsed) // making sure that if processing is > freq we don't sleep @@ -919,14 +931,14 @@ func (r *ConfigurationPolicyReconciler) handleObjectTemplates(plc policyv1.Confi func (r *ConfigurationPolicyReconciler) checkRelatedAndUpdate( plc policyv1.ConfigurationPolicy, related, oldRelated []policyv1.RelatedObject, sendEvent bool, ) { - sortRelatedObjectsAndUpdate(&plc, related, oldRelated) + sortRelatedObjectsAndUpdate(&plc, related, oldRelated, r.EnableMetrics) // An update always occurs to account for the lastEvaluated status field r.addForUpdate(&plc, sendEvent) } // helper function to check whether related objects has changed func sortRelatedObjectsAndUpdate( - plc *policyv1.ConfigurationPolicy, related, oldRelated []policyv1.RelatedObject, + plc *policyv1.ConfigurationPolicy, related, oldRelated []policyv1.RelatedObject, collectMetrics bool, ) { sort.SliceStable(related, func(i, j int) bool { if related[i].Object.Kind != related[j].Object.Kind { @@ -941,7 +953,26 @@ func sortRelatedObjectsAndUpdate( update := false + // Instantiate found objects for the related object metric + found := map[string]bool{} + + if collectMetrics { + for _, obj := range oldRelated { + found[getObjectString(obj)] = false + } + } + + // Format policy for related object metric + policyVal := fmt.Sprintf("%s/%s", plc.Namespace, plc.Name) + for i, newEntry := range related { + var objKey string + // Collect the policy and related object for related object metric + if collectMetrics { + objKey = getObjectString(newEntry) + policyRelatedObjectMap[objKey] = append(policyRelatedObjectMap[objKey], policyVal) + } + for _, oldEntry := range oldRelated { // Get matching objects if gocmp.Equal(newEntry.Object, oldEntry.Object) { @@ -952,12 +983,26 @@ func sortRelatedObjectsAndUpdate( // Use the old properties if they existed and this is not a newly created resource related[i].Properties = oldEntry.Properties + if collectMetrics { + found[objKey] = true + } + break } } } } + // Clean up old related object metrics if the related object list changed + if collectMetrics { + for _, obj := range oldRelated { + objString := getObjectString(obj) + if !found[objString] { + _ = policyRelatedObjectGauge.DeleteLabelValues(objString, policyVal) + } + } + } + if len(oldRelated) == len(related) { for i, entry := range oldRelated { if !gocmp.Equal(entry, related[i]) { diff --git a/controllers/configurationpolicy_controller_test.go b/controllers/configurationpolicy_controller_test.go index bdd5c55d..399affba 100644 --- a/controllers/configurationpolicy_controller_test.go +++ b/controllers/configurationpolicy_controller_test.go @@ -387,14 +387,14 @@ func TestSortRelatedObjectsAndUpdate(t *testing.T) { empty := []policyv1.RelatedObject{} - sortRelatedObjectsAndUpdate(policy, relatedList, empty) + sortRelatedObjectsAndUpdate(policy, relatedList, empty, false) assert.True(t, relatedList[0].Object.Metadata.Name == "bar") // append another object named bar but also with namespace bar relatedList = append(relatedList, addRelatedObjects(true, rsrc, "ConfigurationPolicy", "bar", true, []string{name}, "reason", nil)...) - sortRelatedObjectsAndUpdate(policy, relatedList, empty) + sortRelatedObjectsAndUpdate(policy, relatedList, empty, false) assert.True(t, relatedList[0].Object.Metadata.Namespace == "bar") // clear related objects and test sorting with no namespace @@ -405,7 +405,7 @@ func TestSortRelatedObjectsAndUpdate(t *testing.T) { relatedList = append(relatedList, addRelatedObjects(true, rsrc, "ConfigurationPolicy", "", false, []string{name}, "reason", nil)...) - sortRelatedObjectsAndUpdate(policy, relatedList, empty) + sortRelatedObjectsAndUpdate(policy, relatedList, empty, false) assert.True(t, relatedList[0].Object.Metadata.Name == "bar") } diff --git a/controllers/metric.go b/controllers/metric.go index fdcebb38..8aba1f4c 100644 --- a/controllers/metric.go +++ b/controllers/metric.go @@ -5,6 +5,8 @@ import ( "github.com/prometheus/client_golang/prometheus" "sigs.k8s.io/controller-runtime/pkg/metrics" + + policyv1 "open-cluster-management.io/config-policy-controller/api/v1" ) var ( @@ -47,6 +49,20 @@ var ( }, []string{"config_policy_name", "namespace", "object"}, ) + // The policyRelatedObjectMap collects a map of related objects to policies + // in order to populate the gauge: + // : [] + policyRelatedObjectMap map[string][]string + policyRelatedObjectGauge = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "common_related_objects", + Help: "A gauge vector of related objects managed by multiple policies.", + }, + []string{ + "relatedObject", + "policy", + }, + ) ) func init() { @@ -58,3 +74,37 @@ func init() { metrics.Registry.MustRegister(compareObjEvalCounter) metrics.Registry.MustRegister(policyRelatedObjectGauge) } + +// updateRelatedObjectMetric iterates through the collected related object map, deletes any metrics +// that aren't duplications, and sets a metric for any related object that is handled by multiple +// policies to the number of policies that currently handles it. +func updateRelatedObjectMetric() { + log.V(3).Info("Updating common_related_objects metric ...") + + for relatedObj, policies := range policyRelatedObjectMap { + for _, policy := range policies { + if len(policies) == 1 { + policyRelatedObjectGauge.DeleteLabelValues(relatedObj, policy) + + continue + } + + gaugeInstance, err := policyRelatedObjectGauge.GetMetricWithLabelValues(relatedObj, policy) + if err != nil { + log.V(3).Error(err, "Failed to retrieve related object gauge") + + continue + } + + gaugeInstance.Set(float64(len(policies))) + } + } +} + +// getObjectString returns a string formatted as: +// .// +func getObjectString(obj policyv1.RelatedObject) string { + return fmt.Sprintf("%s.%s/%s/%s", + obj.Object.Kind, obj.Object.APIVersion, + obj.Object.Metadata.Namespace, obj.Object.Metadata.Name) +} diff --git a/main.go b/main.go index 106cc043..132f595f 100644 --- a/main.go +++ b/main.go @@ -72,7 +72,7 @@ func main() { var clusterName, hubConfigPath, targetKubeConfig, metricsAddr, probeAddr string var frequency uint var decryptionConcurrency, evaluationConcurrency uint8 - var enableLease, enableLeaderElection, legacyLeaderElection bool + var enableLease, enableLeaderElection, legacyLeaderElection, enableMetrics bool pflag.UintVar(&frequency, "update-frequency", 10, "The status update frequency (in seconds) of a mutation policy") @@ -109,6 +109,7 @@ func main() { 2, "The max number of concurrent configuration policy evaluations", ) + pflag.BoolVar(&enableMetrics, "enable-metrics", true, "Disable custom metrics collection") pflag.Parse() @@ -252,6 +253,7 @@ func main() { InstanceName: instanceName, TargetK8sClient: targetK8sClient, TargetK8sConfig: targetK8sConfig, + EnableMetrics: enableMetrics, } if err = reconciler.SetupWithManager(mgr); err != nil { log.Error(err, "Unable to create controller", "controller", "ConfigurationPolicy") diff --git a/test/e2e/case25_related_object_metric_test.go b/test/e2e/case25_related_object_metric_test.go new file mode 100644 index 00000000..b455eed1 --- /dev/null +++ b/test/e2e/case25_related_object_metric_test.go @@ -0,0 +1,78 @@ +package e2e + +import ( + "fmt" + "os/exec" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + "open-cluster-management.io/config-policy-controller/test/utils" +) + +var _ = Describe("Test related object metrics", Ordered, func() { + const ( + policy1Name = "case25-test-policy-1" + policy2Name = "case25-test-policy-2" + relatedObject = "case25-configmap" + policyYaml = "../resources/case25_related_object_metric/case25-test-policy.yaml" + ) + It("should create policies and related objects", func() { + By("Creating " + policyYaml) + utils.Kubectl("apply", + "-f", policyYaml, + "-n", testNamespace) + By("Verifying the policies were created") + plc1 := utils.GetWithTimeout( + clientManagedDynamic, gvrConfigPolicy, policy1Name, testNamespace, true, defaultTimeoutSeconds, + ) + Expect(plc1).NotTo(BeNil()) + plc2 := utils.GetWithTimeout( + clientManagedDynamic, gvrConfigPolicy, policy2Name, testNamespace, true, defaultTimeoutSeconds, + ) + By("Verifying the related object was created") + Expect(plc2).NotTo(BeNil()) + obj := utils.GetWithTimeout( + clientManagedDynamic, gvrConfigMap, relatedObject, "default", true, defaultTimeoutSeconds, + ) + Expect(obj).NotTo(BeNil()) + }) + + It("should correctly report common related objects", func() { + By("Checking metric endpoint for relate object gauge for policy " + policy1Name) + Eventually(func() interface{} { + return utils.GetMetrics( + "common_related_objects", fmt.Sprintf(`policy=\"%s/%s\"`, testNamespace, policy1Name)) + }, defaultTimeoutSeconds, 1).Should(Equal([]string{"2"})) + By("Checking metric endpoint for relate object gauge for policy " + policy2Name) + Eventually(func() interface{} { + return utils.GetMetrics( + "common_related_objects", fmt.Sprintf(`policy=\"%s/%s\"`, testNamespace, policy2Name)) + }, defaultTimeoutSeconds, 1).Should(Equal([]string{"2"})) + }) + + cleanup := func() { + // Delete the policies and ignore any errors (in case it was deleted previously) + cmd := exec.Command("kubectl", "delete", + "-f", policyYaml, + "-n", testNamespace) + _, _ = cmd.CombinedOutput() + opt := metav1.ListOptions{} + utils.ListWithTimeout( + clientManagedDynamic, gvrConfigPolicy, opt, 0, false, defaultTimeoutSeconds) + utils.GetWithTimeout( + clientManagedDynamic, gvrConfigMap, relatedObject, "default", false, defaultTimeoutSeconds) + } + + It("should clean up", cleanup) + + It("should have no common related object metrics after clean up", func() { + By("Checking metric endpoint for related object gauges") + Eventually(func() interface{} { + return utils.GetMetrics("common_related_objects") + }, defaultTimeoutSeconds, 1).Should(Equal([]string{})) + }) + + AfterAll(cleanup) +}) diff --git a/test/resources/case25_related_object_metric/case25-test-policy.yaml b/test/resources/case25_related_object_metric/case25-test-policy.yaml new file mode 100644 index 00000000..bfe5d152 --- /dev/null +++ b/test/resources/case25_related_object_metric/case25-test-policy.yaml @@ -0,0 +1,37 @@ +--- +apiVersion: policy.open-cluster-management.io/v1 +kind: ConfigurationPolicy +metadata: + name: case25-test-policy-1 +spec: + remediationAction: enforce + pruneObjectBehavior: DeleteAll + object-templates: + - complianceType: musthave + objectDefinition: + apiVersion: v1 + kind: ConfigMap + metadata: + name: case25-configmap + namespace: default + data: + name: testvalue +--- +apiVersion: policy.open-cluster-management.io/v1 +kind: ConfigurationPolicy +metadata: + name: case25-test-policy-2 +spec: + remediationAction: enforce + pruneObjectBehavior: DeleteAll + namespaceSelector: + include: ["default"] + object-templates: + - complianceType: musthave + objectDefinition: + apiVersion: v1 + kind: ConfigMap + metadata: + name: case25-configmap + data: + name: testvalue diff --git a/test/utils/utils.go b/test/utils/utils.go index dd6be517..22b2cf20 100644 --- a/test/utils/utils.go +++ b/test/utils/utils.go @@ -9,6 +9,7 @@ import ( "os" "os/exec" "regexp" + "strings" "time" "github.com/ghodss/yaml" @@ -242,3 +243,52 @@ func GetLastEvaluated(configPolicy *unstructured.Unstructured) (string, int64) { return lastEvaluated, lastEvaluatedGeneration } + +// GetMetrics execs into the propagator pod and curls the metrics endpoint, filters +// the response with the given patterns, and returns the value(s) for the matching +// metric(s). +func GetMetrics(metricPatterns ...string) []string { + podCmd := exec.Command("kubectl", "get", "pod", "-n=open-cluster-management-agent-addon", + "-l=name=config-policy-controller", "--no-headers") + + propPodInfo, err := podCmd.Output() + if err != nil { + return []string{err.Error()} + } + + var cmd *exec.Cmd + + metricFilter := " | grep " + strings.Join(metricPatterns, " | grep ") + metricsCmd := `curl localhost:8383/metrics` + metricFilter + + // The pod name is "No" when the response is "No resources found" + propPodName := strings.Split(string(propPodInfo), " ")[0] + if propPodName == "No" { + // A missing pod could mean the controller is running locally + cmd = exec.Command("bash", "-c", metricsCmd) + } else { + cmd = exec.Command("kubectl", "exec", "-n=open-cluster-management-agent-addon", propPodName, "-c", + "config-policy-controller", "--", "bash", "-c", metricsCmd) + } + + matchingMetricsRaw, err := cmd.Output() + if err != nil { + if err.Error() == "exit status 1" { + return []string{} // exit 1 indicates that grep couldn't find a match. + } + + return []string{err.Error()} + } + + matchingMetrics := strings.Split(strings.TrimSpace(string(matchingMetricsRaw)), "\n") + values := make([]string, len(matchingMetrics)) + + for i, metric := range matchingMetrics { + fields := strings.Fields(metric) + if len(fields) > 0 { + values[i] = fields[len(fields)-1] + } + } + + return values +}