From 82528f6c24d381613e6a3d319657509dba77c727 Mon Sep 17 00:00:00 2001 From: dsionov Date: Wed, 25 Sep 2024 16:16:21 +0300 Subject: [PATCH] pkg/monitoring/metrics: add alert for VMs using outdated machine type - Introduce new alert for VMs using an outdated machine type. - Machine types are considered outdated if they are no longer compatible due to changes in the virt-launcher OS version. These VMs must be updated with supported machine types to ensure compatibility and avoid potential issues. - Add a functional test to verify the alert is triggered when VMs with outdated machine types are detected. Signed-off-by: Daniel Sionov --- pkg/components/components.go | 49 +++++----- .../rules/alerts/operator_alerts.go | 53 ++++++++-- tests/func-tests/monitoring_test.go | 77 +++++++++++++++ tools/csv-merger/csv-merger.go | 44 +++++---- .../manifest-templator/manifest-templator.go | 96 ++++++++++--------- 5 files changed, 219 insertions(+), 100 deletions(-) diff --git a/pkg/components/components.go b/pkg/components/components.go index 8d0661df6..e200adb44 100644 --- a/pkg/components/components.go +++ b/pkg/components/components.go @@ -62,30 +62,31 @@ var deploymentType = metav1.TypeMeta{ } type DeploymentOperatorParams struct { - Namespace string - Image string - WebhookImage string - CliDownloadsImage string - KVUIPluginImage string - KVUIProxyImage string - ImagePullPolicy string - ConversionContainer string - VmwareContainer string - VirtIOWinContainer string - Smbios string - Machinetype string - Amd64MachineType string - Arm64MachineType string - HcoKvIoVersion string - KubevirtVersion string - CdiVersion string - CnaoVersion string - SspVersion string - HppoVersion string - MtqVersion string - AaqVersion string - PrimaryUDNImage string - Env []corev1.EnvVar + Namespace string + Image string + WebhookImage string + CliDownloadsImage string + KVUIPluginImage string + KVUIProxyImage string + ImagePullPolicy string + ConversionContainer string + VmwareContainer string + VirtIOWinContainer string + Smbios string + Machinetype string + Amd64MachineType string + Arm64MachineType string + HcoKvIoVersion string + KubevirtVersion string + KubevirtVirtLancherOsVersion string + CdiVersion string + CnaoVersion string + SspVersion string + HppoVersion string + MtqVersion string + AaqVersion string + PrimaryUDNImage string + Env []corev1.EnvVar } func GetDeploymentOperator(params *DeploymentOperatorParams) appsv1.Deployment { diff --git a/pkg/monitoring/rules/alerts/operator_alerts.go b/pkg/monitoring/rules/alerts/operator_alerts.go index 30102412f..d6673da99 100644 --- a/pkg/monitoring/rules/alerts/operator_alerts.go +++ b/pkg/monitoring/rules/alerts/operator_alerts.go @@ -1,23 +1,31 @@ package alerts import ( + "os" + "strconv" + promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" "k8s.io/apimachinery/pkg/util/intstr" "k8s.io/utils/ptr" + logf "sigs.k8s.io/controller-runtime/pkg/log" ) const ( - outOfBandUpdateAlert = "KubeVirtCRModified" - unsafeModificationAlert = "UnsupportedHCOModification" - installationNotCompletedAlert = "HCOInstallationIncomplete" - singleStackIPv6Alert = "SingleStackIPv6Unsupported" - MisconfiguredDeschedulerAlert = "HCOMisconfiguredDescheduler" - severityAlertLabelKey = "severity" - healthImpactAlertLabelKey = "operator_health_impact" + outOfBandUpdateAlert = "KubeVirtCRModified" + unsafeModificationAlert = "UnsupportedHCOModification" + installationNotCompletedAlert = "HCOInstallationIncomplete" + singleStackIPv6Alert = "SingleStackIPv6Unsupported" + MisconfiguredDeschedulerAlert = "HCOMisconfiguredDescheduler" + VMOutdatedMachineTypeAlert = "VMHasOutdatedMachineType" + minSupportedVirtLauncherOSVersion = 8 + severityAlertLabelKey = "severity" + healthImpactAlertLabelKey = "operator_health_impact" ) func operatorAlerts() []promv1.Rule { - return []promv1.Rule{ + logger := logf.Log.WithName("operator-alerts") + + rules := []promv1.Rule{ { Alert: outOfBandUpdateAlert, Expr: intstr.FromString("sum by(component_name) ((round(increase(kubevirt_hco_out_of_band_modifications_total[10m]))>0 and kubevirt_hco_out_of_band_modifications_total offset 10m) or (kubevirt_hco_out_of_band_modifications_total != 0 unless kubevirt_hco_out_of_band_modifications_total offset 10m))"), @@ -80,4 +88,33 @@ func operatorAlerts() []promv1.Rule { }, }, } + + rhelVersion, exists := os.LookupEnv("VIRT_LAUNCHER_OS_VERSION") + if !exists { + return rules + } + + virtLauncherOSVersion, err := strconv.Atoi(rhelVersion) + if err != nil { + logger.Error(err, "Error parsing VIRT_LAUNCHER_OS_VERSION") + return rules + } + + if virtLauncherOSVersion > minSupportedVirtLauncherOSVersion { + rules = append(rules, promv1.Rule{ + Alert: VMOutdatedMachineTypeAlert, + Expr: intstr.FromString(`count(kubevirt_vmi_info{guest_os_machine=~".*rhel8.*"} + and on(name, namespace) kubevirt_vm_info{status=~"Running|Stopped"}) > 0`), + Annotations: map[string]string{ + "description": "There are virtual machines using an outdated machine type that need to be patched.", + "summary": "{{ $value }} virtual machines are using an outdated machine type.", + }, + Labels: map[string]string{ + severityAlertLabelKey: "warning", + healthImpactAlertLabelKey: "none", + }, + }) + } + + return rules } diff --git a/tests/func-tests/monitoring_test.go b/tests/func-tests/monitoring_test.go index 246846026..4cc14b24c 100644 --- a/tests/func-tests/monitoring_test.go +++ b/tests/func-tests/monitoring_test.go @@ -5,6 +5,7 @@ import ( "crypto/tls" "flag" "fmt" + "k8s.io/utils/ptr" "math" "net/http" "strconv" @@ -20,8 +21,10 @@ import ( promConfig "github.com/prometheus/common/config" promModel "github.com/prometheus/common/model" authenticationv1 "k8s.io/api/authentication/v1" + corev1 "k8s.io/api/core/v1" apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/kubernetes" @@ -192,6 +195,80 @@ var _ = Describe("[crit:high][vendor:cnv-qe@redhat.com][level:system]Monitoring" verifyOperatorHealthMetricValue(ctx, promClient, hcoClient, initialOperatorHealthMetricValue, warningImpact) }) + Context("VMHasOutdatedMachineType alert", func() { + const ( + query = `kubevirt_vmi_info{guest_os_machine=pc-q35-rhel8.4.0"}` + vmName = "test-vm-outdated-machine-type" + ) + + var ruleExists bool + + BeforeEach(func(ctx context.Context) { + By("Checking if the VMHasOutdatedMachineType rule is registered in Prometheus") + ruleExists = Eventually(func(ctx context.Context) (bool, error) { + rulesResult, err := promClient.Rules(ctx) + if err != nil { + return false, err + } + + for _, group := range rulesResult.Groups { + for _, rule := range group.Rules { + if alertingRule, ok := rule.(promApiv1.AlertingRule); ok { + if alertingRule.Name == hcoalerts.VMOutdatedMachineTypeAlert { + return true, nil + } + } + } + } + return false, nil + }).WithTimeout(60 * time.Second).WithPolling(time.Second).WithContext(ctx).Should(BeTrue()) + }) + + It("should fire the VMHasOutdatedMachineType alert when a VM is using an outdated machine type", func(ctx context.Context) { + if !ruleExists { + Skip("Skipping test because the VMHasOutdatedMachineType rule is not registered") + } + + By("Ensuring the VMHasOutdatedMachineType alert doesnt exist before creating the VM") + Consistently(func(ctx context.Context) *promApiv1.Alert { + alerts, err := promClient.Alerts(ctx) + Expect(err).ToNot(HaveOccurred()) + alert := getAlertByName(alerts, hcoalerts.VMOutdatedMachineTypeAlert) + return alert + }).WithPolling(time.Second).WithTimeout(15 * time.Second).WithContext(ctx).Should(BeNil()) + + By("Creating a VM with an outdated machine type") + vm := &kubevirtcorev1.VirtualMachine{ + ObjectMeta: metav1.ObjectMeta{ + Name: vmName, + Namespace: tests.TestNamespace, + }, + } + vm.Spec.Template.Spec.Domain.Resources.Requests = corev1.ResourceList{corev1.ResourceMemory: resource.MustParse("128Mi")} + vm.Spec.RunStrategy = ptr.To(kubevirtcorev1.RunStrategyOnce) + vm.Spec.Template.Spec.Domain.Machine = &kubevirtcorev1.Machine{Type: "pc-q35-rhel8.4.0"} + Expect(cli.Create(ctx, vm)).To(Succeed()) + + By("Checking that the metric for outdated machine types is set to 1.0") + Eventually(func(g Gomega, ctx context.Context) float64 { + valueAfter, err := hcoClient.GetHCOMetric(ctx, query) + g.Expect(err).NotTo(HaveOccurred()) + return valueAfter + }).WithTimeout(60*time.Second).WithPolling(time.Second).WithContext(ctx).Should( + Equal(float64(1)), + "expected outdated machine type metric to be 1.0", + ) + + By("Checking the VMHasOutdatedMachineType alert") + Eventually(func(ctx context.Context) *promApiv1.Alert { + alerts, err := promClient.Alerts(ctx) + Expect(err).ToNot(HaveOccurred()) + alert := getAlertByName(alerts, hcoalerts.VMOutdatedMachineTypeAlert) + return alert + }).WithTimeout(60 * time.Second).WithPolling(time.Second).WithContext(ctx).ShouldNot(BeNil()) + }) + }) + Describe("KubeDescheduler", Serial, Ordered, Label(tests.OpenshiftLabel, "monitoring"), func() { var ( diff --git a/tools/csv-merger/csv-merger.go b/tools/csv-merger/csv-merger.go index 74b6adb37..d9776c6d0 100644 --- a/tools/csv-merger/csv-merger.go +++ b/tools/csv-merger/csv-merger.go @@ -118,6 +118,7 @@ var ( crdDir = flag.String("crds-dir", "", "the directory containing the CRDs for apigroup validation. The validation will be performed if and only if the value is non-empty.") hcoKvIoVersion = flag.String("hco-kv-io-version", "", "KubeVirt version") kubevirtVersion = flag.String("kubevirt-version", "", "Kubevirt operator version") + kubevirtVirtLauncherOSVersion = flag.String("kubevirt-virt-launcher-os-version", "", "Kubevirt Virt launcher OS version") cdiVersion = flag.String("cdi-version", "", "CDI operator version") cnaoVersion = flag.String("cnao-version", "", "CNA operator version") sspVersion = flag.String("ssp-version", "", "SSP operator version") @@ -514,27 +515,28 @@ func getCsvBaseParams(replaces string, version semver.Version) *components.CSVBa func getDeploymentParams() *components.DeploymentOperatorParams { return &components.DeploymentOperatorParams{ - Namespace: *namespace, - Image: *operatorImage, - WebhookImage: *webhookImage, - CliDownloadsImage: *cliDownloadsImage, - KVUIPluginImage: *kvUIPluginImage, - KVUIProxyImage: *kvUIProxyImage, - ImagePullPolicy: "IfNotPresent", - VirtIOWinContainer: *kvVirtIOWinImage, - Smbios: *smbios, - Machinetype: *machinetype, - Amd64MachineType: *amd64MachineType, - Arm64MachineType: *arm64MachineType, - HcoKvIoVersion: *hcoKvIoVersion, - KubevirtVersion: *kubevirtVersion, - CdiVersion: *cdiVersion, - CnaoVersion: *cnaoVersion, - SspVersion: *sspVersion, - HppoVersion: *hppoVersion, - AaqVersion: *aaqVersion, - PrimaryUDNImage: *primaryUDNImage, - Env: envVars, + Namespace: *namespace, + Image: *operatorImage, + WebhookImage: *webhookImage, + CliDownloadsImage: *cliDownloadsImage, + KVUIPluginImage: *kvUIPluginImage, + KVUIProxyImage: *kvUIProxyImage, + ImagePullPolicy: "IfNotPresent", + VirtIOWinContainer: *kvVirtIOWinImage, + Smbios: *smbios, + Machinetype: *machinetype, + Amd64MachineType: *amd64MachineType, + Arm64MachineType: *arm64MachineType, + HcoKvIoVersion: *hcoKvIoVersion, + KubevirtVersion: *kubevirtVersion, + KubevirtVirtLancherOsVersion: *kubevirtVirtLauncherOSVersion, + CdiVersion: *cdiVersion, + CnaoVersion: *cnaoVersion, + SspVersion: *sspVersion, + HppoVersion: *hppoVersion, + AaqVersion: *aaqVersion, + PrimaryUDNImage: *primaryUDNImage, + Env: envVars, } } diff --git a/tools/manifest-templator/manifest-templator.go b/tools/manifest-templator/manifest-templator.go index 13a23dc6e..e43de1908 100644 --- a/tools/manifest-templator/manifest-templator.go +++ b/tools/manifest-templator/manifest-templator.go @@ -50,34 +50,35 @@ var ( // flags for the command line arguments we accept var ( - cwd, _ = os.Getwd() - deployDir = flag.String("deploy-dir", "deploy", "Directory where manifests should be written") - cnaCsv = flag.String("cna-csv", "", "Cluster Network Addons CSV string") - virtCsv = flag.String("virt-csv", "", "KubeVirt CSV string") - sspCsv = flag.String("ssp-csv", "", "Scheduling Scale Performance CSV string") - cdiCsv = flag.String("cdi-csv", "", "Containerized Data Importer CSV String") - hppCsv = flag.String("hpp-csv", "", "HostPath Provisioner Operator CSV String") - _ = flag.String("mtq-csv", "", "deprecated. This flag is ignored") - aaqCsv = flag.String("aaq-csv", "", "Applications Aware Quota Operator CSV String") - operatorNamespace = flag.String("operator-namespace", "kubevirt-hyperconverged", "Name of the Operator") - operatorImage = flag.String("operator-image", "", "HyperConverged Cluster Operator image") - webhookImage = flag.String("webhook-image", "", "HyperConverged Cluster Webhook image") - cliDownloadsImage = flag.String("cli-downloads-image", "", "Downloads Server image") - kvVirtIOWinImage = flag.String("kv-virtiowin-image-name", "", "KubeVirt VirtIO Win image") - smbios = flag.String("smbios", "", "Custom SMBIOS string for KubeVirt ConfigMap") - machinetype = flag.String("machinetype", "", "Custom MACHINETYPE string for KubeVirt ConfigMap (Deprecated, use amd64-machinetype)") - amd64MachineType = flag.String("amd64-machinetype", "", "Custom AMD64_MACHINETYPE string for KubeVirt ConfigMap") - arm64MachineType = flag.String("arm64-machinetype", "", "Custom ARM64_MACHINETYPE string for KubeVirt ConfigMap") - hcoKvIoVersion = flag.String("hco-kv-io-version", "", "KubeVirt version") - kubevirtVersion = flag.String("kubevirt-version", "", "Kubevirt operator version") - cdiVersion = flag.String("cdi-version", "", "CDI operator version") - cnaoVersion = flag.String("cnao-version", "", "CNA operator version") - sspVersion = flag.String("ssp-version", "", "SSP operator version") - hppoVersion = flag.String("hppo-version", "", "HPP operator version") - _ = flag.String("mtq-version", "", "deprecated. This flag is ignored") - aaqVersion = flag.String("aaq-version", "", "AAQ operator version") - primaryUDNImage = flag.String("primary-udn-binding-image-name", "", "Primary UDN binding image") - apiSources = flag.String("api-sources", cwd+"/...", "Project sources") + cwd, _ = os.Getwd() + deployDir = flag.String("deploy-dir", "deploy", "Directory where manifests should be written") + cnaCsv = flag.String("cna-csv", "", "Cluster Network Addons CSV string") + virtCsv = flag.String("virt-csv", "", "KubeVirt CSV string") + sspCsv = flag.String("ssp-csv", "", "Scheduling Scale Performance CSV string") + cdiCsv = flag.String("cdi-csv", "", "Containerized Data Importer CSV String") + hppCsv = flag.String("hpp-csv", "", "HostPath Provisioner Operator CSV String") + _ = flag.String("mtq-csv", "", "deprecated. This flag is ignored") + aaqCsv = flag.String("aaq-csv", "", "Applications Aware Quota Operator CSV String") + operatorNamespace = flag.String("operator-namespace", "kubevirt-hyperconverged", "Name of the Operator") + operatorImage = flag.String("operator-image", "", "HyperConverged Cluster Operator image") + webhookImage = flag.String("webhook-image", "", "HyperConverged Cluster Webhook image") + cliDownloadsImage = flag.String("cli-downloads-image", "", "Downloads Server image") + kvVirtIOWinImage = flag.String("kv-virtiowin-image-name", "", "KubeVirt VirtIO Win image") + smbios = flag.String("smbios", "", "Custom SMBIOS string for KubeVirt ConfigMap") + machinetype = flag.String("machinetype", "", "Custom MACHINETYPE string for KubeVirt ConfigMap (Deprecated, use amd64-machinetype)") + amd64MachineType = flag.String("amd64-machinetype", "", "Custom AMD64_MACHINETYPE string for KubeVirt ConfigMap") + arm64MachineType = flag.String("arm64-machinetype", "", "Custom ARM64_MACHINETYPE string for KubeVirt ConfigMap") + hcoKvIoVersion = flag.String("hco-kv-io-version", "", "KubeVirt version") + kubevirtVersion = flag.String("kubevirt-version", "", "Kubevirt operator version") + kubevirtVirtLauncherOSVersion = flag.String("kubevirt-virt-launcher-os-version", "", "Kubevirt Virt launcher OS version") + cdiVersion = flag.String("cdi-version", "", "CDI operator version") + cnaoVersion = flag.String("cnao-version", "", "CNA operator version") + sspVersion = flag.String("ssp-version", "", "SSP operator version") + hppoVersion = flag.String("hppo-version", "", "HPP operator version") + _ = flag.String("mtq-version", "", "deprecated. This flag is ignored") + aaqVersion = flag.String("aaq-version", "", "AAQ operator version") + primaryUDNImage = flag.String("primary-udn-binding-image-name", "", "Primary UDN binding image") + apiSources = flag.String("api-sources", cwd+"/...", "Project sources") ) // check handles errors @@ -412,25 +413,26 @@ func getCsvWithComponent() []util.CsvWithComponent { func getOperatorParameters() *components.DeploymentOperatorParams { params := &components.DeploymentOperatorParams{ - Namespace: *operatorNamespace, - Image: *operatorImage, - WebhookImage: *webhookImage, - CliDownloadsImage: *cliDownloadsImage, - ImagePullPolicy: "IfNotPresent", - VirtIOWinContainer: *kvVirtIOWinImage, - Smbios: *smbios, - Machinetype: *machinetype, - Amd64MachineType: *amd64MachineType, - Arm64MachineType: *arm64MachineType, - HcoKvIoVersion: *hcoKvIoVersion, - KubevirtVersion: *kubevirtVersion, - CdiVersion: *cdiVersion, - CnaoVersion: *cnaoVersion, - SspVersion: *sspVersion, - HppoVersion: *hppoVersion, - AaqVersion: *aaqVersion, - PrimaryUDNImage: *primaryUDNImage, - Env: []corev1.EnvVar{}, + Namespace: *operatorNamespace, + Image: *operatorImage, + WebhookImage: *webhookImage, + CliDownloadsImage: *cliDownloadsImage, + ImagePullPolicy: "IfNotPresent", + VirtIOWinContainer: *kvVirtIOWinImage, + Smbios: *smbios, + Machinetype: *machinetype, + Amd64MachineType: *amd64MachineType, + Arm64MachineType: *arm64MachineType, + HcoKvIoVersion: *hcoKvIoVersion, + KubevirtVersion: *kubevirtVersion, + KubevirtVirtLancherOsVersion: *kubevirtVirtLauncherOSVersion, + CdiVersion: *cdiVersion, + CnaoVersion: *cnaoVersion, + SspVersion: *sspVersion, + HppoVersion: *hppoVersion, + AaqVersion: *aaqVersion, + PrimaryUDNImage: *primaryUDNImage, + Env: []corev1.EnvVar{}, } return params }