From f376283aeffe485bb7a8d49e75966256b7ee41ab Mon Sep 17 00:00:00 2001 From: machadovilaca Date: Mon, 8 Jul 2024 12:43:33 +0100 Subject: [PATCH] Ensure PodDisruptionBudgetAtLimit alert is silenced Signed-off-by: machadovilaca --- cmd/hyperconverged-cluster-operator/main.go | 18 +-- .../observability/observability_controller.go | 53 +++++--- .../observability_controller_test.go | 32 ----- .../pod_disruption_budget_at_limit.go | 107 ++++++++++++++++ deploy/cluster_role.yaml | 10 ++ ...perator.v1.13.0.clusterserviceversion.yaml | 10 ++ ...perator.v1.13.0.clusterserviceversion.yaml | 12 +- .../alertmanager/alertmanager_suite_test.go | 6 +- pkg/alertmanager/silences.go | 119 ++++++++++++++++++ pkg/alertmanager/silences_test.go | 69 ++++++++++ pkg/components/components.go | 5 + .../observability_controller_test.go | 57 +++++++++ 12 files changed, 438 insertions(+), 60 deletions(-) delete mode 100644 controllers/observability/observability_controller_test.go create mode 100644 controllers/observability/pod_disruption_budget_at_limit.go rename controllers/observability/observability_suite_test.go => pkg/alertmanager/alertmanager_suite_test.go (50%) create mode 100644 pkg/alertmanager/silences.go create mode 100644 pkg/alertmanager/silences_test.go create mode 100644 tests/func-tests/observability_controller_test.go diff --git a/cmd/hyperconverged-cluster-operator/main.go b/cmd/hyperconverged-cluster-operator/main.go index e143d6547..24bcd6bc7 100644 --- a/cmd/hyperconverged-cluster-operator/main.go +++ b/cmd/hyperconverged-cluster-operator/main.go @@ -58,6 +58,8 @@ import ( hcoutil "github.com/kubevirt/hyperconverged-cluster-operator/pkg/util" ) +const openshiftMonitoringNamespace = "openshift-monitoring" + // Change below variables to serve metrics on different host or port. var ( logger = logf.Log.WithName("hyperconverged-operator-cmd") @@ -172,12 +174,11 @@ func main() { os.Exit(1) } - if err = (&observability.Reconciler{ - Client: mgr.GetClient(), - Scheme: mgr.GetScheme(), - }).SetupWithManager(mgr); err != nil { - logger.Error(err, "unable to create controller", "controller", "Observability") - os.Exit(1) + if ci.IsOpenshift() { + if err = observability.SetupWithManager(mgr); err != nil { + logger.Error(err, "unable to create controller", "controller", "Observability") + os.Exit(1) + } } err = createPriorityClass(ctx, mgr) @@ -258,7 +259,10 @@ func getCacheOption(operatorNamespace string, isMonitoringAvailable, isOpenshift cacheOptionsByObjectForOpenshift := map[client.Object]cache.ByObject{ &openshiftroutev1.Route{}: { - Field: namespaceSelector, + Namespaces: map[string]cache.Config{ + operatorNamespace: {}, + openshiftMonitoringNamespace: {}, + }, }, &imagev1.ImageStream{}: { Label: labelSelector, diff --git a/controllers/observability/observability_controller.go b/controllers/observability/observability_controller.go index afc84f9be..229f35141 100644 --- a/controllers/observability/observability_controller.go +++ b/controllers/observability/observability_controller.go @@ -5,47 +5,50 @@ import ( "time" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/rest" ctrl "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/event" "sigs.k8s.io/controller-runtime/pkg/handler" logf "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/source" + + "github.com/kubevirt/hyperconverged-cluster-operator/pkg/alertmanager" ) var ( log = logf.Log.WithName("controller_observability") - periodicity = 5 * time.Second + periodicity = 1 * time.Hour ) type Reconciler struct { - client.Client - Scheme *runtime.Scheme - + config *rest.Config events chan event.GenericEvent + + amApi *alertmanager.Api } func (r *Reconciler) Reconcile(_ context.Context, _ ctrl.Request) (ctrl.Result, error) { log.Info("Reconciling Observability") - // TODO(user): your logic here + if err := r.ensurePodDisruptionBudgetAtLimitIsSilenced(); err != nil { + return ctrl.Result{}, err + } return ctrl.Result{}, nil } -func (r *Reconciler) SetupWithManager(mgr ctrl.Manager) error { +func NewReconciler(config *rest.Config) *Reconciler { + return &Reconciler{ + config: config, + events: make(chan event.GenericEvent, 1), + } +} + +func SetupWithManager(mgr ctrl.Manager) error { log.Info("Setting up controller") - r.events = make(chan event.GenericEvent) - go func() { - for { - r.events <- event.GenericEvent{ - Object: &metav1.PartialObjectMetadata{}, - } - time.Sleep(periodicity) - } - }() + r := NewReconciler(mgr.GetConfig()) + r.startEventLoop() return ctrl.NewControllerManagedBy(mgr). Named("observability"). @@ -55,3 +58,19 @@ func (r *Reconciler) SetupWithManager(mgr ctrl.Manager) error { )). Complete(r) } + +func (r *Reconciler) startEventLoop() { + ticker := time.NewTicker(periodicity) + + go func() { + r.events <- event.GenericEvent{ + Object: &metav1.PartialObjectMetadata{}, + } + + for range ticker.C { + r.events <- event.GenericEvent{ + Object: &metav1.PartialObjectMetadata{}, + } + } + }() +} diff --git a/controllers/observability/observability_controller_test.go b/controllers/observability/observability_controller_test.go deleted file mode 100644 index 28cbcc9c5..000000000 --- a/controllers/observability/observability_controller_test.go +++ /dev/null @@ -1,32 +0,0 @@ -package observability_test - -import ( - "context" - - . "github.com/onsi/ginkgo/v2" - . "github.com/onsi/gomega" - - "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/controller-runtime/pkg/reconcile" - - "github.com/kubevirt/hyperconverged-cluster-operator/controllers/commontestutils" - "github.com/kubevirt/hyperconverged-cluster-operator/controllers/observability" -) - -var _ = Describe("Observability Controller", func() { - Context("When reconciling a resource", func() { - ctx := context.Background() - - It("should successfully reconcile the resource", func() { - By("Reconciling the created resource") - k8sClient := commontestutils.InitClient([]client.Object{}) - controllerReconciler := &observability.Reconciler{ - Client: k8sClient, - Scheme: k8sClient.Scheme(), - } - - _, err := controllerReconciler.Reconcile(ctx, reconcile.Request{}) - Expect(err).NotTo(HaveOccurred()) - }) - }) -}) diff --git a/controllers/observability/pod_disruption_budget_at_limit.go b/controllers/observability/pod_disruption_budget_at_limit.go new file mode 100644 index 000000000..356113864 --- /dev/null +++ b/controllers/observability/pod_disruption_budget_at_limit.go @@ -0,0 +1,107 @@ +package observability + +import ( + "crypto/tls" + "crypto/x509" + "fmt" + "net/http" + "os" + "time" + + "github.com/kubevirt/hyperconverged-cluster-operator/pkg/alertmanager" +) + +const ( + alertmanagerSvcHost = "https://alertmanager-main.openshift-monitoring.svc.cluster.local:9094" + tlsCertPath = "/var/run/secrets/kubernetes.io/serviceaccount/service-ca.crt" +) + +func (r *Reconciler) ensurePodDisruptionBudgetAtLimitIsSilenced() error { + if r.amApi == nil { + var err error + r.amApi, err = r.NewAlertmanagerApi() + if err != nil { + return fmt.Errorf("failed to initialize alertmanager api: %w", err) + } + } + + amSilences, err := r.amApi.ListSilences() + if err != nil { + return fmt.Errorf("failed to list alertmanager silences: %w", err) + } + + if FindPodDisruptionBudgetAtLimitSilence(amSilences) != nil { + log.Info("KubeVirt PodDisruptionBudgetAtLimit alerts are already silenced") + return nil + } + + silence := alertmanager.Silence{ + Comment: "Silence KubeVirt PodDisruptionBudgetAtLimit alerts", + CreatedBy: "hyperconverged-cluster-operator", + EndsAt: "3000-01-01T00:00:00Z", + Matchers: []alertmanager.Matcher{ + { + IsEqual: true, + Name: "alertname", + Value: "PodDisruptionBudgetAtLimit", + }, + { + IsRegex: true, + Name: "poddisruptionbudget", + Value: "kubevirt-disruption-budget-.*", + }, + }, + StartsAt: time.Now().Format(time.RFC3339), + } + + if err := r.amApi.CreateSilence(silence); err != nil { + return fmt.Errorf("failed to create alertmanager silence: %w", err) + } + log.Info("Silenced PodDisruptionBudgetAtLimit alerts") + + return nil +} + +func (r *Reconciler) NewAlertmanagerApi() (*alertmanager.Api, error) { + caCert, err := os.ReadFile(tlsCertPath) + if err != nil { + return nil, fmt.Errorf("failed to read ca cert: %w", err) + } + + caCertPool := x509.NewCertPool() + caCertPool.AppendCertsFromPEM(caCert) + + httpClient := http.Client{} + httpClient.Transport = &http.Transport{ + TLSClientConfig: &tls.Config{RootCAs: caCertPool}, + } + + return alertmanager.NewAPI(httpClient, alertmanagerSvcHost, r.config.BearerToken), nil +} + +func FindPodDisruptionBudgetAtLimitSilence(amSilences []alertmanager.Silence) *alertmanager.Silence { + for _, silence := range amSilences { + if silence.Status.State != "active" { + continue + } + + var isPDBSilence bool + var isKubeVirtPDBSilence bool + + for _, matcher := range silence.Matchers { + if matcher.Name == "alertname" && matcher.Value == "PodDisruptionBudgetAtLimit" && matcher.IsEqual { + isPDBSilence = true + } + + if matcher.Name == "poddisruptionbudget" && matcher.IsRegex && matcher.Value == "kubevirt-disruption-budget-.*" { + isKubeVirtPDBSilence = true + } + } + + if isPDBSilence && isKubeVirtPDBSilence { + return &silence + } + } + + return nil +} diff --git a/deploy/cluster_role.yaml b/deploy/cluster_role.yaml index 3dda21682..41e13f233 100644 --- a/deploy/cluster_role.yaml +++ b/deploy/cluster_role.yaml @@ -1145,6 +1145,16 @@ rules: - list - watch - update +- apiGroups: + - monitoring.coreos.com + resources: + - alertmanagers + - alertmanagers/api + verbs: + - get + - list + - create + - delete --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole diff --git a/deploy/index-image/community-kubevirt-hyperconverged/1.13.0/manifests/kubevirt-hyperconverged-operator.v1.13.0.clusterserviceversion.yaml b/deploy/index-image/community-kubevirt-hyperconverged/1.13.0/manifests/kubevirt-hyperconverged-operator.v1.13.0.clusterserviceversion.yaml index 9c51ed2e9..30f99d297 100644 --- a/deploy/index-image/community-kubevirt-hyperconverged/1.13.0/manifests/kubevirt-hyperconverged-operator.v1.13.0.clusterserviceversion.yaml +++ b/deploy/index-image/community-kubevirt-hyperconverged/1.13.0/manifests/kubevirt-hyperconverged-operator.v1.13.0.clusterserviceversion.yaml @@ -539,6 +539,16 @@ spec: - list - watch - update + - apiGroups: + - monitoring.coreos.com + resources: + - alertmanagers + - alertmanagers/api + verbs: + - get + - list + - create + - delete serviceAccountName: hyperconverged-cluster-operator - rules: - apiGroups: diff --git a/deploy/olm-catalog/community-kubevirt-hyperconverged/1.13.0/manifests/kubevirt-hyperconverged-operator.v1.13.0.clusterserviceversion.yaml b/deploy/olm-catalog/community-kubevirt-hyperconverged/1.13.0/manifests/kubevirt-hyperconverged-operator.v1.13.0.clusterserviceversion.yaml index 997e95ad0..cc3e65165 100644 --- a/deploy/olm-catalog/community-kubevirt-hyperconverged/1.13.0/manifests/kubevirt-hyperconverged-operator.v1.13.0.clusterserviceversion.yaml +++ b/deploy/olm-catalog/community-kubevirt-hyperconverged/1.13.0/manifests/kubevirt-hyperconverged-operator.v1.13.0.clusterserviceversion.yaml @@ -9,7 +9,7 @@ metadata: certified: "false" console.openshift.io/disable-operand-delete: "true" containerImage: quay.io/kubevirt/hyperconverged-cluster-operator:1.13.0-unstable - createdAt: "2024-07-17 05:05:02" + createdAt: "2024-07-17 12:30:51" description: A unified operator deploying and controlling KubeVirt and its supporting operators with opinionated defaults features.operators.openshift.io/cnf: "false" @@ -539,6 +539,16 @@ spec: - list - watch - update + - apiGroups: + - monitoring.coreos.com + resources: + - alertmanagers + - alertmanagers/api + verbs: + - get + - list + - create + - delete serviceAccountName: hyperconverged-cluster-operator - rules: - apiGroups: diff --git a/controllers/observability/observability_suite_test.go b/pkg/alertmanager/alertmanager_suite_test.go similarity index 50% rename from controllers/observability/observability_suite_test.go rename to pkg/alertmanager/alertmanager_suite_test.go index 3c473db5c..7e84c72b9 100644 --- a/controllers/observability/observability_suite_test.go +++ b/pkg/alertmanager/alertmanager_suite_test.go @@ -1,4 +1,4 @@ -package observability_test +package alertmanager_test import ( "testing" @@ -7,7 +7,7 @@ import ( . "github.com/onsi/gomega" ) -func TestObservability(t *testing.T) { +func TestAlertmanager(t *testing.T) { RegisterFailHandler(Fail) - RunSpecs(t, "Observability Controller Suite") + RunSpecs(t, "Alertmanager Suite") } diff --git a/pkg/alertmanager/silences.go b/pkg/alertmanager/silences.go new file mode 100644 index 000000000..c1e61ada4 --- /dev/null +++ b/pkg/alertmanager/silences.go @@ -0,0 +1,119 @@ +package alertmanager + +import ( + "bytes" + "encoding/json" + "fmt" + "net/http" + + logf "sigs.k8s.io/controller-runtime/pkg/log" +) + +var log = logf.Log.WithName("alertmanager") + +type Api struct { + httpClient http.Client + host string + token string +} + +type Silence struct { + ID string `json:"id"` + Comment string `json:"comment"` + CreatedBy string `json:"createdBy"` + EndsAt string `json:"endsAt"` + Matchers []Matcher `json:"matchers"` + StartsAt string `json:"startsAt"` + Status Status `json:"status"` +} + +type Status struct { + State string `json:"state"` +} + +type Matcher struct { + IsEqual bool `json:"isEqual"` + IsRegex bool `json:"isRegex"` + Name string `json:"name"` + Value string `json:"value"` +} + +func NewAPI(httpClient http.Client, host string, token string) *Api { + return &Api{ + httpClient: httpClient, + host: host, + token: token, + } +} + +func (api *Api) ListSilences() ([]Silence, error) { + req, err := http.NewRequest(http.MethodGet, fmt.Sprintf("%s/api/v2/silences", api.host), nil) + if err != nil { + return nil, fmt.Errorf("failed to create request: %w", err) + } + + req.Header.Add("Authorization", "Bearer "+api.token) + + resp, err := api.httpClient.Do(req) + if err != nil { + return nil, fmt.Errorf("failed to list silences: %w", err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + log.V(1).Info("list silences http request", "req", req, "resp", resp) + return nil, fmt.Errorf("failed to list silences: %s", resp.Status) + } + + var amSilences []Silence + err = json.NewDecoder(resp.Body).Decode(&amSilences) + if err != nil { + return nil, fmt.Errorf("failed to decode response: %w", err) + } + + return amSilences, nil +} + +func (api *Api) CreateSilence(s Silence) error { + body, err := json.Marshal(s) + if err != nil { + return fmt.Errorf("failed to marshal silence: %w", err) + } + + req, err := http.NewRequest(http.MethodPost, fmt.Sprintf("%s/api/v2/silences", api.host), bytes.NewReader(body)) + if err != nil { + return fmt.Errorf("failed to create request: %w", err) + } + + req.Header.Add("Authorization", "Bearer "+api.token) + req.Header.Add("Content-Type", "application/json") + + resp, err := api.httpClient.Do(req) + if err != nil { + return fmt.Errorf("failed to create silence: %w", err) + } + if resp.StatusCode != http.StatusOK { + return fmt.Errorf("failed to create silence: %s", resp.Status) + } + + return nil +} + +func (api *Api) DeleteSilence(id string) error { + req, err := http.NewRequest(http.MethodDelete, fmt.Sprintf("%s/api/v2/silence/%s", api.host, id), nil) + if err != nil { + return fmt.Errorf("failed to create request: %w", err) + } + + req.Header.Add("Authorization", "Bearer "+api.token) + + resp, err := api.httpClient.Do(req) + if err != nil { + return fmt.Errorf("failed to delete silence: %w", err) + } + if resp.StatusCode != http.StatusOK { + return fmt.Errorf("failed to delete silence: %s", resp.Status) + } + + return nil +} diff --git a/pkg/alertmanager/silences_test.go b/pkg/alertmanager/silences_test.go new file mode 100644 index 000000000..779a04521 --- /dev/null +++ b/pkg/alertmanager/silences_test.go @@ -0,0 +1,69 @@ +package alertmanager_test + +import ( + "fmt" + "net/http" + "net/http/httptest" + "strings" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + "github.com/kubevirt/hyperconverged-cluster-operator/pkg/alertmanager" +) + +const listResp = `[{"id":"bb881d7f-3278-46fd-a638-d42c57f235b6","status":{"state":"active"},"updatedAt":"2024-07-16T11:46:30.653Z","comment":"test purposes","createdBy":"test_user","endsAt":"3000-01-01T00:00:00.000Z","matchers":[{"isEqual":true,"isRegex":false,"name":"alertname","value":"TestAlert"}],"startsAt":"2024-07-16T11:46:30.653Z"}]` + +var _ = Describe("Silences", func() { + var ( + ts *httptest.Server + api *alertmanager.Api + ) + + BeforeEach(func() { + ts = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if !strings.HasPrefix(r.URL.Path, "/api/v2/silence") { + w.WriteHeader(http.StatusNotFound) + return + } + + switch r.Method { + case http.MethodGet: + fmt.Fprintln(w, listResp) + case http.MethodPost: + w.WriteHeader(http.StatusOK) + case http.MethodDelete: + w.WriteHeader(http.StatusOK) + } + })) + + api = alertmanager.NewAPI(http.Client{}, ts.URL, "token") + }) + + AfterEach(func() { + ts.Close() + }) + + It("should successfully GET /api/v2/silences", func() { + silences, err := api.ListSilences() + Expect(err).ToNot(HaveOccurred()) + Expect(silences).To(HaveLen(1)) + + Expect(silences[0].Status.State).To(Equal("active")) + Expect(silences[0].EndsAt).To(Equal("3000-01-01T00:00:00.000Z")) + + Expect(silences[0].Matchers).To(HaveLen(1)) + Expect(silences[0].Matchers[0].Name).To(Equal("alertname")) + Expect(silences[0].Matchers[0].Value).To(Equal("TestAlert")) + }) + + It("should successfully POST /api/v2/silences", func() { + err := api.CreateSilence(alertmanager.Silence{}) + Expect(err).ToNot(HaveOccurred()) + }) + + It("should successfully DELETE /api/v2/silences/{id}", func() { + err := api.DeleteSilence("bb881d7f-3278-46fd-a638-d42c57f235b6") + Expect(err).ToNot(HaveOccurred()) + }) +}) diff --git a/pkg/components/components.go b/pkg/components/components.go index 6c136eff6..bf1d6c72b 100644 --- a/pkg/components/components.go +++ b/pkg/components/components.go @@ -575,6 +575,11 @@ func GetClusterPermissions() []rbacv1.PolicyRule { Resources: stringListToSlice("consoles"), Verbs: stringListToSlice("get", "list", "watch", "update"), }, + { + APIGroups: stringListToSlice("monitoring.coreos.com"), + Resources: stringListToSlice("alertmanagers", "alertmanagers/api"), + Verbs: stringListToSlice("get", "list", "create", "delete"), + }, } } diff --git a/tests/func-tests/observability_controller_test.go b/tests/func-tests/observability_controller_test.go new file mode 100644 index 000000000..029da8301 --- /dev/null +++ b/tests/func-tests/observability_controller_test.go @@ -0,0 +1,57 @@ +package tests_test + +import ( + "context" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + v1 "k8s.io/api/core/v1" + "sigs.k8s.io/controller-runtime/pkg/client" + + "github.com/kubevirt/hyperconverged-cluster-operator/controllers/observability" + tests "github.com/kubevirt/hyperconverged-cluster-operator/tests/func-tests" +) + +var _ = Describe("Observability Controller", Label(tests.OpenshiftLabel, "observability_controller"), func() { + Context("PodDisruptionBudgetAtLimit", func() { + It("should be silenced", func() { + r := observability.NewReconciler(tests.GetClientConfig()) + + amApi, err := r.NewAlertmanagerApi() + Expect(err).ToNot(HaveOccurred()) + + amSilences, err := amApi.ListSilences() + Expect(err).ToNot(HaveOccurred()) + + // PodDisruptionBudgetAtLimit silence should have been created by the controller + podDisruptionBudgetAtLimitSilence := observability.FindPodDisruptionBudgetAtLimitSilence(amSilences) + Expect(podDisruptionBudgetAtLimitSilence).ToNot(BeNil()) + + err = amApi.DeleteSilence(podDisruptionBudgetAtLimitSilence.ID) + Expect(err).ToNot(HaveOccurred()) + + // Restart pod to force reconcile (reconcile periodicity is 1h) + cli := tests.GetControllerRuntimeClient() + var hcoPods v1.PodList + err = cli.List(context.Background(), &hcoPods, &client.MatchingLabels{ + "name": "hyperconverged-cluster-operator", + }) + Expect(err).ToNot(HaveOccurred()) + Expect(hcoPods.Items).ToNot(BeEmpty()) + + for _, pod := range hcoPods.Items { + err = cli.Delete(context.Background(), &pod) + Expect(err).ToNot(HaveOccurred()) + } + + // Wait for the controller to recreate the silence + Eventually(func() bool { + amSilences, err := amApi.ListSilences() + Expect(err).ToNot(HaveOccurred()) + + return observability.FindPodDisruptionBudgetAtLimitSilence(amSilences) != nil + }, "5m", "10s").Should(BeTrue()) + }) + }) +})