Skip to content

Commit

Permalink
Ensure PodDisruptionBudgetAtLimit alert is silenced (#3020)
Browse files Browse the repository at this point in the history
* Add observability controller

Signed-off-by: machadovilaca <machadovilaca@gmail.com>

* Ensure PodDisruptionBudgetAtLimit alert is silenced

Signed-off-by: machadovilaca <machadovilaca@gmail.com>

---------

Signed-off-by: machadovilaca <machadovilaca@gmail.com>
  • Loading branch information
machadovilaca authored Jul 17, 2024
1 parent b45fc6a commit 2628100
Show file tree
Hide file tree
Showing 11 changed files with 491 additions and 2 deletions.
15 changes: 14 additions & 1 deletion cmd/hyperconverged-cluster-operator/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,11 +52,14 @@ import (
hcov1beta1 "github.com/kubevirt/hyperconverged-cluster-operator/api/v1beta1"
"github.com/kubevirt/hyperconverged-cluster-operator/cmd/cmdcommon"
"github.com/kubevirt/hyperconverged-cluster-operator/controllers/hyperconverged"
"github.com/kubevirt/hyperconverged-cluster-operator/controllers/observability"
"github.com/kubevirt/hyperconverged-cluster-operator/controllers/operands"
"github.com/kubevirt/hyperconverged-cluster-operator/pkg/monitoring/metrics"
hcoutil "github.com/kubevirt/hyperconverged-cluster-operator/pkg/util"
)

// openshiftMonitoringNamespace is the namespace that is added, next to the
// operator namespace, to the Route cache configuration in getCacheOption.
const openshiftMonitoringNamespace = "openshift-monitoring"

// Change below variables to serve metrics on different host or port.
var (
logger = logf.Log.WithName("hyperconverged-operator-cmd")
Expand Down Expand Up @@ -171,6 +174,13 @@ func main() {
os.Exit(1)
}

if ci.IsOpenshift() {
if err = observability.SetupWithManager(mgr); err != nil {
logger.Error(err, "unable to create controller", "controller", "Observability")
os.Exit(1)
}
}

err = createPriorityClass(ctx, mgr)
cmdHelper.ExitOnError(err, "Failed creating PriorityClass")

Expand Down Expand Up @@ -249,7 +259,10 @@ func getCacheOption(operatorNamespace string, isMonitoringAvailable, isOpenshift

cacheOptionsByObjectForOpenshift := map[client.Object]cache.ByObject{
&openshiftroutev1.Route{}: {
Field: namespaceSelector,
Namespaces: map[string]cache.Config{
operatorNamespace: {},
openshiftMonitoringNamespace: {},
},
},
&imagev1.ImageStream{}: {
Label: labelSelector,
Expand Down
76 changes: 76 additions & 0 deletions controllers/observability/observability_controller.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
package observability

import (
"context"
"time"

metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/rest"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/event"
"sigs.k8s.io/controller-runtime/pkg/handler"
logf "sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/source"

"github.com/kubevirt/hyperconverged-cluster-operator/pkg/alertmanager"
)

// Package-level settings shared by the observability controller.
var (
// log is the controller-runtime logger named "controller_observability".
log = logf.Log.WithName("controller_observability")
// periodicity is the interval of the ticker created in startEventLoop,
// i.e. the time between two periodic reconcile triggers.
periodicity = 1 * time.Hour
)

// Reconciler is the reconciler of the observability controller. Its Reconcile
// method ensures the PodDisruptionBudgetAtLimit silence exists.
type Reconciler struct {
// config is the rest config given to NewReconciler; its BearerToken is
// handed to the Alertmanager API client in NewAlertmanagerApi.
config *rest.Config
// events carries the generic events sent by startEventLoop; SetupWithManager
// registers it as a channel source for the controller.
events chan event.GenericEvent

// amApi is the Alertmanager API client. It is nil until
// ensurePodDisruptionBudgetAtLimitIsSilenced creates it on first use.
amApi *alertmanager.Api
}

// Reconcile runs one observability pass. Both the context and the request are
// ignored: every invocation performs the same check, making sure the KubeVirt
// PodDisruptionBudgetAtLimit alerts are silenced.
//
// It always returns an empty ctrl.Result; the error, if any, is the one
// reported by ensurePodDisruptionBudgetAtLimitIsSilenced.
func (r *Reconciler) Reconcile(_ context.Context, _ ctrl.Request) (ctrl.Result, error) {
	log.Info("Reconciling Observability")

	// The result is empty on both the success and the failure path, so the
	// error can be forwarded directly.
	return ctrl.Result{}, r.ensurePodDisruptionBudgetAtLimitIsSilenced()
}

// NewReconciler creates a Reconciler that keeps the given rest config and owns
// a freshly created events channel with a buffer of one event. The
// Alertmanager client is left unset.
func NewReconciler(config *rest.Config) *Reconciler {
	reconciler := new(Reconciler)
	reconciler.config = config
	reconciler.events = make(chan event.GenericEvent, 1)

	return reconciler
}

// SetupWithManager creates the observability Reconciler from the manager's
// rest config, starts its event loop, and registers it with the manager as a
// controller named "observability" that is fed by the reconciler's events
// channel.
//
// It returns the error reported when completing the controller registration.
func SetupWithManager(mgr ctrl.Manager) error {
	log.Info("Setting up controller")

	reconciler := NewReconciler(mgr.GetConfig())
	reconciler.startEventLoop()

	bldr := ctrl.NewControllerManagedBy(mgr).Named("observability")

	// The controller watches the reconciler's own channel rather than a
	// Kubernetes resource type.
	eventSource := source.Channel(
		reconciler.events,
		&handler.EnqueueRequestForObject{},
	)

	return bldr.WatchesRawSource(eventSource).Complete(reconciler)
}

// startEventLoop launches a goroutine that sends one generic event on
// r.events immediately and then one more on every tick of a ticker running at
// the package-level periodicity.
//
// Each event carries an empty PartialObjectMetadata object. The ticker is
// never stopped and the goroutine has no exit path.
func (r *Reconciler) startEventLoop() {
	ticker := time.NewTicker(periodicity)

	// trigger sends a single event; the send blocks while the channel buffer
	// is full.
	trigger := func() {
		r.events <- event.GenericEvent{
			Object: &metav1.PartialObjectMetadata{},
		}
	}

	go func() {
		// Fire once up front so the first event does not wait for a full period.
		trigger()

		for range ticker.C {
			trigger()
		}
	}()
}
107 changes: 107 additions & 0 deletions controllers/observability/pod_disruption_budget_at_limit.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
package observability

import (
"crypto/tls"
"crypto/x509"
"fmt"
"net/http"
"os"
"time"

"github.com/kubevirt/hyperconverged-cluster-operator/pkg/alertmanager"
)

// Connection settings used by NewAlertmanagerApi.
const (
// alertmanagerSvcHost is the base URL handed to alertmanager.NewAPI.
alertmanagerSvcHost = "https://alertmanager-main.openshift-monitoring.svc.cluster.local:9094"
// tlsCertPath is the file whose PEM content is loaded as the root CA pool
// for the TLS connection to Alertmanager.
tlsCertPath = "/var/run/secrets/kubernetes.io/serviceaccount/service-ca.crt"
)

// ensurePodDisruptionBudgetAtLimitIsSilenced makes sure Alertmanager holds a
// silence for the KubeVirt PodDisruptionBudgetAtLimit alerts.
//
// The Alertmanager client is created on first use and kept on the reconciler.
// The existing silences are listed; if FindPodDisruptionBudgetAtLimitSilence
// finds a matching one nothing else is done, otherwise a new silence starting
// now and ending at 3000-01-01T00:00:00Z is created.
//
// It returns a wrapped error when the client cannot be initialized, or when
// listing or creating silences fails.
func (r *Reconciler) ensurePodDisruptionBudgetAtLimitIsSilenced() error {
	if r.amApi == nil {
		api, err := r.NewAlertmanagerApi()
		r.amApi = api
		if err != nil {
			return fmt.Errorf("failed to initialize alertmanager api: %w", err)
		}
	}

	existing, err := r.amApi.ListSilences()
	if err != nil {
		return fmt.Errorf("failed to list alertmanager silences: %w", err)
	}

	if found := FindPodDisruptionBudgetAtLimitSilence(existing); found != nil {
		log.Info("KubeVirt PodDisruptionBudgetAtLimit alerts are already silenced")
		return nil
	}

	// NOTE(review): the second matcher leaves IsEqual at its zero value
	// (false). Confirm how alertmanager.Matcher serializes that field, so the
	// regex is not sent to Alertmanager as a negative match.
	matchers := []alertmanager.Matcher{
		{
			IsEqual: true,
			Name:    "alertname",
			Value:   "PodDisruptionBudgetAtLimit",
		},
		{
			IsRegex: true,
			Name:    "poddisruptionbudget",
			Value:   "kubevirt-disruption-budget-.*",
		},
	}

	newSilence := alertmanager.Silence{
		Comment:   "Silence KubeVirt PodDisruptionBudgetAtLimit alerts",
		CreatedBy: "hyperconverged-cluster-operator",
		EndsAt:    "3000-01-01T00:00:00Z",
		Matchers:  matchers,
		StartsAt:  time.Now().Format(time.RFC3339),
	}

	if createErr := r.amApi.CreateSilence(newSilence); createErr != nil {
		return fmt.Errorf("failed to create alertmanager silence: %w", createErr)
	}
	log.Info("Silenced PodDisruptionBudgetAtLimit alerts")

	return nil
}

// NewAlertmanagerApi builds a client for the Alertmanager API at
// alertmanagerSvcHost.
//
// The PEM bundle at tlsCertPath is loaded as the only set of trusted root CAs
// for the TLS connection, and the bearer token of the reconciler's rest config
// is passed to the client for authentication.
//
// It returns an error when the CA bundle cannot be read, or when it does not
// contain at least one parsable certificate.
//
// NOTE(review): the token is read from r.config.BearerToken once, when the
// client is built. If the token is rotated on disk, confirm that callers
// rebuild the client.
func (r *Reconciler) NewAlertmanagerApi() (*alertmanager.Api, error) {
	// requestTimeout bounds every Alertmanager call so that an unresponsive
	// service cannot block the reconcile loop forever.
	const requestTimeout = 30 * time.Second

	caCert, err := os.ReadFile(tlsCertPath)
	if err != nil {
		return nil, fmt.Errorf("failed to read ca cert: %w", err)
	}

	caCertPool := x509.NewCertPool()
	// AppendCertsFromPEM reports whether any certificate was added; an empty
	// pool would only surface later as an opaque TLS verification failure.
	if !caCertPool.AppendCertsFromPEM(caCert) {
		return nil, fmt.Errorf("failed to parse any ca certificate from %s", tlsCertPath)
	}

	httpClient := http.Client{
		Timeout: requestTimeout,
		Transport: &http.Transport{
			TLSClientConfig: &tls.Config{
				RootCAs:    caCertPool,
				MinVersion: tls.VersionTLS12,
			},
		},
	}

	return alertmanager.NewAPI(httpClient, alertmanagerSvcHost, r.config.BearerToken), nil
}

// FindPodDisruptionBudgetAtLimitSilence returns the first active silence in
// amSilences that targets the KubeVirt PodDisruptionBudgetAtLimit alerts, or
// nil when there is none.
//
// A silence qualifies when its state is "active" and its matchers include
// both an equality matcher alertname = "PodDisruptionBudgetAtLimit" and a
// regex matcher on poddisruptionbudget with the value
// "kubevirt-disruption-budget-.*". The returned pointer refers to a copy of
// the slice element.
func FindPodDisruptionBudgetAtLimitSilence(amSilences []alertmanager.Silence) *alertmanager.Silence {
	for i := range amSilences {
		candidate := amSilences[i]
		if candidate.Status.State != "active" {
			continue
		}

		matchesAlertName, matchesKubeVirtPDB := false, false

		for _, m := range candidate.Matchers {
			switch m.Name {
			case "alertname":
				if m.Value == "PodDisruptionBudgetAtLimit" && m.IsEqual {
					matchesAlertName = true
				}
			case "poddisruptionbudget":
				if m.IsRegex && m.Value == "kubevirt-disruption-budget-.*" {
					matchesKubeVirtPDB = true
				}
			}
		}

		if matchesAlertName && matchesKubeVirtPDB {
			return &candidate
		}
	}

	return nil
}
10 changes: 10 additions & 0 deletions deploy/cluster_role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1145,6 +1145,16 @@ rules:
- list
- watch
- update
- apiGroups:
- monitoring.coreos.com
resources:
- alertmanagers
- alertmanagers/api
verbs:
- get
- list
- create
- delete
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -539,6 +539,16 @@ spec:
- list
- watch
- update
- apiGroups:
- monitoring.coreos.com
resources:
- alertmanagers
- alertmanagers/api
verbs:
- get
- list
- create
- delete
serviceAccountName: hyperconverged-cluster-operator
- rules:
- apiGroups:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ metadata:
certified: "false"
console.openshift.io/disable-operand-delete: "true"
containerImage: quay.io/kubevirt/hyperconverged-cluster-operator:1.13.0-unstable
createdAt: "2024-07-17 05:05:02"
createdAt: "2024-07-17 12:30:51"
description: A unified operator deploying and controlling KubeVirt and its supporting
operators with opinionated defaults
features.operators.openshift.io/cnf: "false"
Expand Down Expand Up @@ -539,6 +539,16 @@ spec:
- list
- watch
- update
- apiGroups:
- monitoring.coreos.com
resources:
- alertmanagers
- alertmanagers/api
verbs:
- get
- list
- create
- delete
serviceAccountName: hyperconverged-cluster-operator
- rules:
- apiGroups:
Expand Down
13 changes: 13 additions & 0 deletions pkg/alertmanager/alertmanager_suite_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
package alertmanager_test

import (
"testing"

. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
)

// TestAlertmanager is the go test entry point for the alertmanager package's
// Ginkgo specs: it registers Ginkgo's Fail as the Gomega failure handler and
// runs the suite under the name "Alertmanager Suite".
func TestAlertmanager(t *testing.T) {
RegisterFailHandler(Fail)
RunSpecs(t, "Alertmanager Suite")
}
Loading

0 comments on commit 2628100

Please sign in to comment.