diff --git a/controllers/ssp_controller.go b/controllers/ssp_controller.go index 916bc9d8b..ed759ffa5 100644 --- a/controllers/ssp_controller.go +++ b/controllers/ssp_controller.go @@ -132,7 +132,7 @@ func (r *sspReconciler) setupController(mgr ctrl.Manager) error { func (r *sspReconciler) Reconcile(ctx context.Context, req ctrl.Request) (res ctrl.Result, err error) { defer func() { if err != nil { - common.SSPOperatorReconcilingProperly.Set(0) + common.SSPOperatorReconcileSucceeded.Set(0) } }() reqLogger := r.log.WithValues("ssp", req.NamespacedName) @@ -230,9 +230,9 @@ func (r *sspReconciler) Reconcile(ctx context.Context, req ctrl.Request) (res ct sspRequest.Logger.Info("CR status updated") if sspRequest.Instance.Status.Phase == lifecycleapi.PhaseDeployed { - common.SSPOperatorReconcilingProperly.Set(1) + common.SSPOperatorReconcileSucceeded.Set(1) } else { - common.SSPOperatorReconcilingProperly.Set(0) + common.SSPOperatorReconcileSucceeded.Set(0) } return ctrl.Result{}, nil diff --git a/docs/metrics.md b/docs/metrics.md index 6c5cbab8d..52d4919e6 100644 --- a/docs/metrics.md +++ b/docs/metrics.md @@ -3,15 +3,21 @@ This document aims to help users that are not familiar with metrics exposed by t All metrics documented here are auto-generated by the utility tool `tools/metricsdocs` and reflects exactly what is being exposed. ## SSP Operator Metrics List +### kubevirt_ssp_common_templates_restored_increase +The increase in the number of common templates restored by the operator back to their original state, over the last hour. Type: Gauge. ### kubevirt_ssp_common_templates_restored_total The total number of common templates restored by the operator back to their original state. Type: Counter. -### kubevirt_ssp_num_of_operator_reconciling_properly +### kubevirt_ssp_operator_reconcile_succeeded +Set to 1 if the reconcile process of all operands completes with no errors, and to 0 otherwise. Type: Gauge. 
+### kubevirt_ssp_operator_reconcile_succeeded_aggregated The total number of ssp-operator pods reconciling with no errors. Type: Gauge. -### kubevirt_ssp_operator_up_total +### kubevirt_ssp_operator_up The total number of running ssp-operator pods. Type: Gauge. -### kubevirt_ssp_rejected_vms_total -The total number of vms rejected by virt-template-validator. Type: Counter. -### kubevirt_ssp_template_validator_up_total +### kubevirt_ssp_template_validator_rejected_increase +The increase in the number of rejected template validators, over the last hour. Type: Gauge. +### kubevirt_ssp_template_validator_rejected_total +The total number of rejected template validators. Type: Counter. +### kubevirt_ssp_template_validator_up The total number of running virt-template-validator pods. Type: Gauge. ## Developing new metrics After developing new metrics or changing old ones, please run `make generate-doc` to regenerate this document. diff --git a/internal/common/resource.go b/internal/common/resource.go index 3960758fd..c5ca94857 100644 --- a/internal/common/resource.go +++ b/internal/common/resource.go @@ -115,8 +115,8 @@ type reconcileBuilder struct { var _ ReconcileBuilder = &reconcileBuilder{} var ( - SSPOperatorReconcilingProperly = prometheus.NewGauge(prometheus.GaugeOpts{ - Name: "ssp_operator_reconciling_properly", + SSPOperatorReconcileSucceeded = prometheus.NewGauge(prometheus.GaugeOpts{ + Name: "kubevirt_ssp_operator_reconcile_succeeded", Help: "Set to 1 if the reconcile process of all operands completes with no errors, and to 0 otherwise", }) ) diff --git a/internal/operands/common-templates/reconcile.go b/internal/operands/common-templates/reconcile.go index 34852838e..fcef6ad33 100644 --- a/internal/operands/common-templates/reconcile.go +++ b/internal/operands/common-templates/reconcile.go @@ -22,7 +22,7 @@ import ( var ( CommonTemplatesRestored = prometheus.NewCounter(prometheus.CounterOpts{ - Name: "total_restored_common_templates", + Name: 
"kubevirt_ssp_common_templates_restored_total", Help: "The total number of common templates restored by the operator back to their original state", }) ) diff --git a/internal/operands/common-templates/reconcile_test.go b/internal/operands/common-templates/reconcile_test.go index 5ee6634ed..2d76e914d 100644 --- a/internal/operands/common-templates/reconcile_test.go +++ b/internal/operands/common-templates/reconcile_test.go @@ -99,7 +99,7 @@ var _ = Describe("Common-Templates operand", func() { } desc, value := getCommonTemplatesRestoredMetric() - Expect(desc).To(ContainSubstring("total_restored_common_templates")) + Expect(desc).To(ContainSubstring("kubevirt_ssp_common_templates_restored_total")) Expect(value).To(BeZero()) }) @@ -132,7 +132,7 @@ var _ = Describe("Common-Templates operand", func() { } desc, value := getCommonTemplatesRestoredMetric() - Expect(desc).To(ContainSubstring("total_restored_common_templates")) + Expect(desc).To(ContainSubstring("kubevirt_ssp_common_templates_restored_total")) Expect(value).To(Equal(float64(len(testTemplates)))) }) @@ -278,7 +278,7 @@ var _ = Describe("Common-Templates operand", func() { }) }) - Context("total_restored_common_templates metric", func() { + Context("kubevirt_ssp_common_templates_restored_total metric", func() { var template *templatev1.Template var initialMetricValue float64 @@ -290,7 +290,7 @@ var _ = Describe("Common-Templates operand", func() { template.Namespace = namespace desc, value := getCommonTemplatesRestoredMetric() - Expect(desc).To(ContainSubstring("total_restored_common_templates")) + Expect(desc).To(ContainSubstring("kubevirt_ssp_common_templates_restored_total")) initialMetricValue = value }) @@ -306,7 +306,7 @@ var _ = Describe("Common-Templates operand", func() { Expect(updatedTpl.Labels[TemplateTypeLabel]).To(Equal(testTemplates[0].Labels[TemplateTypeLabel])) desc, value := getCommonTemplatesRestoredMetric() - Expect(desc).To(ContainSubstring("total_restored_common_templates")) + 
Expect(desc).To(ContainSubstring("kubevirt_ssp_common_templates_restored_total")) Expect(value).To(Equal(initialMetricValue + 1)) }) @@ -323,7 +323,7 @@ var _ = Describe("Common-Templates operand", func() { Expect(updatedTpl.Labels[TemplateTypeLabel]).To(Equal(testTemplates[0].Labels[TemplateTypeLabel])) desc, value := getCommonTemplatesRestoredMetric() - Expect(desc).To(ContainSubstring("total_restored_common_templates")) + Expect(desc).To(ContainSubstring("kubevirt_ssp_common_templates_restored_total")) Expect(value).To(Equal(initialMetricValue)) }) }) diff --git a/internal/operands/metrics/resources.go b/internal/operands/metrics/resources.go index 6ffa37797..e8752daa3 100644 --- a/internal/operands/metrics/resources.go +++ b/internal/operands/metrics/resources.go @@ -32,8 +32,8 @@ const ( ) const ( - Total_restored_common_templates_increase_query = "sum(increase(total_restored_common_templates{pod=~'ssp-operator.*'}[1h]))" - Total_rejected_vms_increase_query = "sum(increase(total_rejected_vms{pod=~'virt-template-validator.*'}[1h]))" + CommonTemplatesRestoredIncreaseQuery = "sum(increase(kubevirt_ssp_common_templates_restored_total{pod=~'ssp-operator.*'}[1h]))" + TemplateValidatorRejectedIncreaseQuery = "sum(increase(kubevirt_ssp_template_validator_rejected_total{pod=~'virt-template-validator.*'}[1h]))" ) // RecordRulesDesc represent SSP Operator Prometheus Record Rules @@ -47,34 +47,34 @@ type RecordRulesDesc struct { // RecordRulesDescList lists all SSP Operator Prometheus Record Rules var RecordRulesDescList = []RecordRulesDesc{ { - Name: "kubevirt_ssp_operator_up_total", + Name: "kubevirt_ssp_operator_up", Expr: intstr.FromString("sum(up{pod=~'ssp-operator.*'}) OR on() vector(0)"), Description: "The total number of running ssp-operator pods", Type: "Gauge", }, { - Name: "kubevirt_ssp_template_validator_up_total", + Name: "kubevirt_ssp_template_validator_up", Expr: intstr.FromString("sum(up{pod=~'virt-template-validator.*'}) OR on() vector(0)"), Description: 
"The total number of running virt-template-validator pods", Type: "Gauge", }, { - Name: "kubevirt_ssp_num_of_operator_reconciling_properly", - Expr: intstr.FromString("sum(ssp_operator_reconciling_properly)"), + Name: "kubevirt_ssp_operator_reconcile_succeeded_aggregated", + Expr: intstr.FromString("sum(kubevirt_ssp_operator_reconcile_succeeded)"), Description: "The total number of ssp-operator pods reconciling with no errors", Type: "Gauge", }, { - Name: "kubevirt_ssp_rejected_vms_total", - Expr: intstr.FromString(Total_rejected_vms_increase_query + " OR on() vector(0)"), - Description: "The total number of vms rejected by virt-template-validator", - Type: "Counter", + Name: "kubevirt_ssp_template_validator_rejected_increase", + Expr: intstr.FromString(TemplateValidatorRejectedIncreaseQuery + " OR on() vector(0)"), + Description: "The increase in the number of rejected template validators, over the last hour", + Type: "Gauge", }, { - Name: "kubevirt_ssp_common_templates_restored_total", - Expr: intstr.FromString(Total_restored_common_templates_increase_query + " OR on() vector(0)"), - Description: "The total number of common templates restored by the operator back to their original state", - Type: "Counter", + Name: "kubevirt_ssp_common_templates_restored_increase", + Expr: intstr.FromString(CommonTemplatesRestoredIncreaseQuery + " OR on() vector(0)"), + Description: "The increase in the number of common templates restored by the operator back to their original state, over the last hour", + Type: "Gauge", }, } @@ -91,7 +91,7 @@ func getAlertRules() ([]promv1.Rule, error) { }, { Alert: "SSPDown", - Expr: intstr.FromString("kubevirt_ssp_operator_up_total == 0"), + Expr: intstr.FromString("kubevirt_ssp_operator_up == 0"), For: "5m", Annotations: map[string]string{ "summary": "All SSP operator pods are down.", @@ -106,7 +106,7 @@ func getAlertRules() ([]promv1.Rule, error) { }, { Alert: "SSPTemplateValidatorDown", - Expr: 
intstr.FromString("kubevirt_ssp_template_validator_up_total == 0"), + Expr: intstr.FromString("kubevirt_ssp_template_validator_up == 0"), For: "5m", Annotations: map[string]string{ "summary": "All Template Validator pods are down.", @@ -121,7 +121,7 @@ func getAlertRules() ([]promv1.Rule, error) { }, { Alert: "SSPFailingToReconcile", - Expr: intstr.FromString("(kubevirt_ssp_num_of_operator_reconciling_properly == 0) and (kubevirt_ssp_operator_up_total > 0)"), + Expr: intstr.FromString("(kubevirt_ssp_operator_reconcile_succeeded_aggregated == 0) and (kubevirt_ssp_operator_up > 0)"), For: "5m", Annotations: map[string]string{ "summary": "The ssp-operator pod is up but failing to reconcile", @@ -136,7 +136,7 @@ func getAlertRules() ([]promv1.Rule, error) { }, { Alert: "SSPHighRateRejectedVms", - Expr: intstr.FromString("kubevirt_ssp_rejected_vms_total > 5"), + Expr: intstr.FromString("kubevirt_ssp_template_validator_rejected_increase > 5"), For: "5m", Annotations: map[string]string{ "summary": "High rate of rejected Vms", @@ -151,7 +151,7 @@ func getAlertRules() ([]promv1.Rule, error) { }, { Alert: "SSPCommonTemplatesModificationReverted", - Expr: intstr.FromString("kubevirt_ssp_common_templates_restored_total > 0"), + Expr: intstr.FromString("kubevirt_ssp_common_templates_restored_increase > 0"), For: "0m", Annotations: map[string]string{ "summary": "Common Templates manual modifications were reverted by the operator", diff --git a/internal/template-validator/webhooks/hook.go b/internal/template-validator/webhooks/hook.go index 18fe6145b..e6bced859 100644 --- a/internal/template-validator/webhooks/hook.go +++ b/internal/template-validator/webhooks/hook.go @@ -37,9 +37,9 @@ import ( ) var ( - vmsRejected = promauto.NewCounter(prometheus.CounterOpts{ - Name: "total_rejected_vms", - Help: "The total number of rejected vms", + templateValidatorRejected = promauto.NewCounter(prometheus.CounterOpts{ + Name: "kubevirt_ssp_template_validator_rejected_total", + Help: "The 
total number of rejected template validators", }) ) @@ -102,7 +102,7 @@ func (w *webhooks) admitVm(ar *admissionv1.AdmissionReview) *admissionv1.Admissi causes := ValidateVm(rules, vm) if len(causes) > 0 { - vmsRejected.Inc() + templateValidatorRejected.Inc() return ToAdmissionResponse(causes) } diff --git a/main.go b/main.go index 22bbc40e1..cff2ea6b4 100644 --- a/main.go +++ b/main.go @@ -69,7 +69,7 @@ const ( func runPrometheusServer(metricsAddr string, tlsOptions common.SSPTLSOptions) error { setupLog.Info("Starting Prometheus metrics endpoint server with TLS") metrics.Registry.MustRegister(common_templates.CommonTemplatesRestored) - metrics.Registry.MustRegister(common.SSPOperatorReconcilingProperly) + metrics.Registry.MustRegister(common.SSPOperatorReconcileSucceeded) handler := promhttp.HandlerFor(metrics.Registry, promhttp.HandlerOpts{}) mux := http.NewServeMux() mux.Handle("/metrics", handler) diff --git a/tests/metrics_test.go b/tests/metrics_test.go index c4ffb4276..cb13e3ec4 100644 --- a/tests/metrics_test.go +++ b/tests/metrics_test.go @@ -166,7 +166,7 @@ var _ = Describe("Metrics", func() { template = &getTemplates().Items[0] }) - It("[test_id:TODO]should increment total_restored_common_templates during normal reconcile", func() { + It("[test_id:TODO]should increment kubevirt_ssp_common_templates_restored_total during normal reconcile", func() { skipIfUpgradeLane() restoredCount := totalRestoredTemplatesCount() @@ -179,7 +179,7 @@ var _ = Describe("Metrics", func() { }, 5*time.Minute, 10*time.Second).Should(Equal(restoredCount + 1)) }) - It("[test_id:TODO]should not increment total_restored_common_templates during upgrades", func() { + It("[test_id:TODO]should not increment kubevirt_ssp_common_templates_restored_total during upgrades", func() { restoredCount := totalRestoredTemplatesCount() template.Labels[common_templates.TemplateTypeLabel] = "test" diff --git a/tests/metrics_test_utils.go b/tests/metrics_test_utils.go index 8b990ef08..5739a6ac9 
100644 --- a/tests/metrics_test_utils.go +++ b/tests/metrics_test_utils.go @@ -15,9 +15,9 @@ import ( ) var regexpForMetrics = map[string]*regexp.Regexp{ - "total_rejected_vms": regexp.MustCompile(`total_rejected_vms ([0-9]+)`), - "total_restored_common_templates": regexp.MustCompile(`total_restored_common_templates ([0-9]+)`), - "ssp_operator_reconciling_properly": regexp.MustCompile(`ssp_operator_reconciling_properly ([0-9]+)`), + "kubevirt_ssp_template_validator_rejected_total": regexp.MustCompile(`kubevirt_ssp_template_validator_rejected_total ([0-9]+)`), + "kubevirt_ssp_common_templates_restored_total": regexp.MustCompile(`kubevirt_ssp_common_templates_restored_total ([0-9]+)`), + "kubevirt_ssp_operator_reconcile_succeeded": regexp.MustCompile(`kubevirt_ssp_operator_reconcile_succeeded ([0-9]+)`), } func intMetricValue(metricName string, metricsPort uint16, pod *v1.Pod) int { diff --git a/tests/misc_test.go b/tests/misc_test.go index 3f8af1ccb..72537aa94 100644 --- a/tests/misc_test.go +++ b/tests/misc_test.go @@ -98,7 +98,7 @@ var _ = Describe("Observed generation", func() { }) }) -var _ = Describe("SSPOperatorReconcilingProperly metric", func() { +var _ = Describe("SSPOperatorReconcileSucceeded metric", func() { var ( deploymentRes testResource finalizerName = "ssp.kubernetes.io/temp-protection" @@ -117,7 +117,7 @@ var _ = Describe("SSPOperatorReconcilingProperly metric", func() { waitUntilDeployed() }) - It("[test_id:7369] should set SSPOperatorReconcilingProperly metrics to 0 on failing to reconcile", func() { + It("[test_id:7369] should set SSPOperatorReconcileSucceeded metrics to 0 on failing to reconcile", func() { // add a finalizer to the validator deployment, so that it can't be deleted addFinalizer(deploymentRes, finalizerName) // send a request to delete the validator deployment @@ -168,7 +168,7 @@ func validateSspIsFailingToReconcileMetric() { }) // the reconcile cycle should now be failing, so the kubevirt_ssp_operator_reconcile_succeeded metric should 
be 0 Eventually(func() int { - return sspOperatorReconcilingProperlyCount() + return sspOperatorReconcileSucceededCount() }, env.ShortTimeout(), time.Second).Should(Equal(0)) } diff --git a/tests/monitoring_test.go b/tests/monitoring_test.go index 056269f10..fa7053607 100644 --- a/tests/monitoring_test.go +++ b/tests/monitoring_test.go @@ -43,7 +43,7 @@ var _ = Describe("Prometheus Alerts", func() { }) It("[test_id:8363] Should fire SSPCommonTemplatesModificationReverted", func() { // we have to wait for prometheus to pick up the series before we increase it. - waitForSeriesToBeDetected(metrics.Total_restored_common_templates_increase_query) + waitForSeriesToBeDetected(metrics.CommonTemplatesRestoredIncreaseQuery) expectTemplateUpdateToIncreaseTotalRestoredTemplatesCount(testTemplate) waitForAlertToActivate("SSPCommonTemplatesModificationReverted") }) @@ -66,7 +66,7 @@ var _ = Describe("Prometheus Alerts", func() { deploymentRes = testDeploymentResource() }) - It("[test_id:8364] should set SSPOperatorReconcilingProperly metrics to 0 on failing to reconcile", func() { + It("[test_id:8364] should set SSPOperatorReconcileSucceeded metrics to 0 on failing to reconcile", func() { // add a finalizer to the validator deployment, so that it can't be deleted addFinalizer(deploymentRes, finalizerName) // send a request to delete the validator deployment @@ -108,7 +108,7 @@ var _ = Describe("Prometheus Alerts", func() { }) It("[test_id:8377] Should fire SSPHighRateRejectedVms", func() { - waitForSeriesToBeDetected(metrics.Total_rejected_vms_increase_query) + waitForSeriesToBeDetected(metrics.TemplateValidatorRejectedIncreaseQuery) Expect(apiClient.Create(ctx, template)).ToNot(HaveOccurred(), "Failed to create template: %s", template.Name) for range [6]int{} { time.Sleep(time.Second * 5) diff --git a/tests/tests_common_test.go b/tests/tests_common_test.go index 80674f819..0c1570c96 100644 --- a/tests/tests_common_test.go +++ b/tests/tests_common_test.go @@ -113,10 +113,10 @@ 
func expectRecreateAfterDelete(res *testResource) { Expect(err).ToNot(HaveOccurred()) } -func sspOperatorReconcilingProperlyCount() (sum int) { +func sspOperatorReconcileSucceededCount() (sum int) { operatorPods, operatorMetricsPort := operatorPodsWithMetricsPort() for _, sspOperator := range operatorPods { - sum += intMetricValue("ssp_operator_reconciling_properly", operatorMetricsPort, &sspOperator) + sum += intMetricValue("kubevirt_ssp_operator_reconcile_succeeded", operatorMetricsPort, &sspOperator) } return } @@ -124,7 +124,7 @@ func sspOperatorReconcilingProperlyCount() (sum int) { func totalRestoredTemplatesCount() (sum int) { operatorPods, operatorMetricsPort := operatorPodsWithMetricsPort() for _, sspOperator := range operatorPods { - sum += intMetricValue("total_restored_common_templates", operatorMetricsPort, &sspOperator) + sum += intMetricValue("kubevirt_ssp_common_templates_restored_total", operatorMetricsPort, &sspOperator) } return } diff --git a/tests/validator_test.go b/tests/validator_test.go index f220294e5..c0579ea60 100644 --- a/tests/validator_test.go +++ b/tests/validator_test.go @@ -943,7 +943,7 @@ func totalRejectedVmsMetricsValue() (sum int) { Expect(err).ToNot(HaveOccurred(), "Could not find the validator pods") Expect(pods.Items).ToNot(BeEmpty()) for _, validatorPod := range pods.Items { - sum += intMetricValue("total_rejected_vms", validator.MetricsPort, &validatorPod) + sum += intMetricValue("kubevirt_ssp_template_validator_rejected_total", validator.MetricsPort, &validatorPod) } return } diff --git a/tools/metricsdocs/metricsdocs.go b/tools/metricsdocs/metricsdocs.go index 4a821392b..236701c0d 100644 --- a/tools/metricsdocs/metricsdocs.go +++ b/tools/metricsdocs/metricsdocs.go @@ -25,7 +25,8 @@ const ( ) func main() { - metricsList := recordRulesDescToMetricList(metrics.RecordRulesDescList) + metricsList := getMetricsList() + metricsList = append(metricsList, recordRulesDescToMetricList(metrics.RecordRulesDescList)...) 
sort.Sort(metricsList) printMetrics(metricsList) } @@ -59,6 +60,28 @@ func metricDescriptionToMetric(rrd metrics.RecordRulesDesc) metric { } } +func getMetricsList() metricList { + metrics := metricList{ + { + name: "kubevirt_ssp_template_validator_rejected_total", + description: "The total number of rejected template validators", + mtype: "Counter", + }, + { + name: "kubevirt_ssp_operator_reconcile_succeeded", + description: "Set to 1 if the reconcile process of all operands completes with no errors, and to 0 otherwise", + mtype: "Gauge", + }, + { + name: "kubevirt_ssp_common_templates_restored_total", + description: "The total number of common templates restored by the operator back to their original state", + mtype: "Counter", + }, + } + + return metrics +} + func (m metric) writeOut() { fmt.Println("###", m.name) fmt.Println(m.description + ". Type: " + m.mtype + ".")