From 805e505288ce82c3e2b625a3ca63aaf2b0aa9cea Mon Sep 17 00:00:00 2001 From: gotjosh Date: Thu, 16 Jun 2022 11:16:06 +0100 Subject: [PATCH] Alert metric reports different results to what the user sees via API (#2943) * Alert metric reports different results to what the user sees via API Fixes #1439 and #2619. The previous metric is not _technically_ reporting incorrect results as the alerts _are_ still around and will be re-used if that same alert (equal fingerprint) is received before it is GCed. Therefore, I have kept the old metric under a new name `alertmanager_marked_alerts` and repurpose the current metric to match what the user sees in the UI. Signed-off-by: gotjosh --- cmd/alertmanager/main.go | 2 +- dispatch/dispatch_test.go | 8 ++-- provider/mem/mem.go | 51 ++++++++++++++++++++++++- provider/mem/mem_test.go | 80 ++++++++++++++++++++++++++++++++++++--- types/types.go | 12 +++--- types/types_test.go | 52 +++++++++++++++++++++++++ 6 files changed, 187 insertions(+), 18 deletions(-) diff --git a/cmd/alertmanager/main.go b/cmd/alertmanager/main.go index 3d9add04bc..d8d3ddfe76 100644 --- a/cmd/alertmanager/main.go +++ b/cmd/alertmanager/main.go @@ -339,7 +339,7 @@ func run() int { go peer.Settle(ctx, *gossipInterval*10) } - alerts, err := mem.NewAlerts(context.Background(), marker, *alertGCInterval, nil, logger) + alerts, err := mem.NewAlerts(context.Background(), marker, *alertGCInterval, nil, logger, prometheus.DefaultRegisterer) if err != nil { level.Error(logger).Log("err", err) return 1 diff --git a/dispatch/dispatch_test.go b/dispatch/dispatch_test.go index f4f08c2af5..17ffe85a9a 100644 --- a/dispatch/dispatch_test.go +++ b/dispatch/dispatch_test.go @@ -366,7 +366,7 @@ route: logger := log.NewNopLogger() route := NewRoute(conf.Route, nil) marker := types.NewMarker(prometheus.NewRegistry()) - alerts, err := mem.NewAlerts(context.Background(), marker, time.Hour, nil, logger) + alerts, err := mem.NewAlerts(context.Background(), marker, time.Hour, nil, logger, nil) if err != nil { t.Fatal(err) } @@ -504,7 +504,7 @@ route: logger := log.NewNopLogger() route := NewRoute(conf.Route, nil) marker := types.NewMarker(prometheus.NewRegistry()) - alerts, err := mem.NewAlerts(context.Background(), marker, time.Hour, nil, logger) + alerts, err := mem.NewAlerts(context.Background(), marker, time.Hour, nil, logger, nil) if err != nil { t.Fatal(err) } @@ -625,7 +625,7 @@ func newAlert(labels model.LabelSet) *types.Alert { func TestDispatcherRace(t *testing.T) { logger := log.NewNopLogger() marker := types.NewMarker(prometheus.NewRegistry()) - alerts, err := mem.NewAlerts(context.Background(), marker, time.Hour, nil, logger) + alerts, err := mem.NewAlerts(context.Background(), marker, time.Hour, nil, logger, nil) if err != nil { t.Fatal(err) } @@ -642,7 +642,7 @@ func TestDispatcherRaceOnFirstAlertNotDeliveredWhenGroupWaitIsZero(t *testing.T) logger := log.NewNopLogger() marker := types.NewMarker(prometheus.NewRegistry()) - alerts, err := mem.NewAlerts(context.Background(), marker, time.Hour, nil, logger) + alerts, err := mem.NewAlerts(context.Background(), marker, time.Hour, nil, logger, nil) if err != nil { t.Fatal(err) } diff --git a/provider/mem/mem.go b/provider/mem/mem.go index 9a67cceb9a..0e39ba1d5f 100644 --- a/provider/mem/mem.go +++ b/provider/mem/mem.go @@ -20,6 +20,7 @@ import ( "github.com/go-kit/log" "github.com/go-kit/log/level" + "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/common/model" "github.com/prometheus/alertmanager/provider" @@ -33,8 +34,10 @@ 
const alertChannelLength = 200 type Alerts struct { cancel context.CancelFunc + alerts *store.Alerts + marker types.Marker + mtx sync.Mutex - alerts *store.Alerts listeners map[int]listeningAlerts next int @@ -62,14 +65,34 @@ type listeningAlerts struct { done chan struct{} } +func (a *Alerts) registerMetrics(r prometheus.Registerer) { + newMemAlertByStatus := func(s types.AlertState) prometheus.GaugeFunc { + return prometheus.NewGaugeFunc( + prometheus.GaugeOpts{ + Name: "alertmanager_alerts", + Help: "How many alerts by state.", + ConstLabels: prometheus.Labels{"state": string(s)}, + }, + func() float64 { + return float64(a.count(s)) + }, + ) + } + + r.MustRegister(newMemAlertByStatus(types.AlertStateActive)) + r.MustRegister(newMemAlertByStatus(types.AlertStateSuppressed)) + r.MustRegister(newMemAlertByStatus(types.AlertStateUnprocessed)) +} + // NewAlerts returns a new alert provider. -func NewAlerts(ctx context.Context, m types.Marker, intervalGC time.Duration, alertCallback AlertStoreCallback, l log.Logger) (*Alerts, error) { +func NewAlerts(ctx context.Context, m types.Marker, intervalGC time.Duration, alertCallback AlertStoreCallback, l log.Logger, r prometheus.Registerer) (*Alerts, error) { if alertCallback == nil { alertCallback = noopCallback{} } ctx, cancel := context.WithCancel(ctx) a := &Alerts{ + marker: m, alerts: store.NewAlerts(), cancel: cancel, listeners: map[int]listeningAlerts{}, @@ -98,6 +121,11 @@ func NewAlerts(ctx context.Context, m types.Marker, intervalGC time.Duration, al } a.mtx.Unlock() }) + + if r != nil { + a.registerMetrics(r) + } + go a.alerts.Run(ctx, intervalGC) return a, nil @@ -212,6 +240,25 @@ func (a *Alerts) Put(alerts ...*types.Alert) error { return nil } +// count returns the number of non-resolved alerts we currently have stored filtered by the provided state. +func (a *Alerts) count(state types.AlertState) int { + var count int + for _, alert := range a.alerts.List() { + if alert.Resolved() { + continue + } + + status := a.marker.Status(alert.Fingerprint()) + if status.State != state { + continue + } + + count++ + } + + return count +} + type noopCallback struct{} func (n noopCallback) PreStore(_ *types.Alert, _ bool) error { return nil } diff --git a/provider/mem/mem_test.go b/provider/mem/mem_test.go index 7ff5d2f6cf..038a3fea46 100644 --- a/provider/mem/mem_test.go +++ b/provider/mem/mem_test.go @@ -86,7 +86,7 @@ func init() { // a listener can not unsubscribe as the lock is hold by `alerts.Lock`. 
func TestAlertsSubscribePutStarvation(t *testing.T) { marker := types.NewMarker(prometheus.NewRegistry()) - alerts, err := NewAlerts(context.Background(), marker, 30*time.Minute, noopCallback{}, log.NewNopLogger()) + alerts, err := NewAlerts(context.Background(), marker, 30*time.Minute, noopCallback{}, log.NewNopLogger(), nil) if err != nil { t.Fatal(err) } @@ -137,7 +137,7 @@ func TestAlertsSubscribePutStarvation(t *testing.T) { func TestAlertsPut(t *testing.T) { marker := types.NewMarker(prometheus.NewRegistry()) - alerts, err := NewAlerts(context.Background(), marker, 30*time.Minute, noopCallback{}, log.NewNopLogger()) + alerts, err := NewAlerts(context.Background(), marker, 30*time.Minute, noopCallback{}, log.NewNopLogger(), nil) if err != nil { t.Fatal(err) } @@ -165,7 +165,7 @@ func TestAlertsSubscribe(t *testing.T) { ctx, cancel := context.WithCancel(context.Background()) defer cancel() - alerts, err := NewAlerts(ctx, marker, 30*time.Minute, noopCallback{}, log.NewNopLogger()) + alerts, err := NewAlerts(ctx, marker, 30*time.Minute, noopCallback{}, log.NewNopLogger(), nil) if err != nil { t.Fatal(err) } @@ -242,7 +242,7 @@ func TestAlertsSubscribe(t *testing.T) { func TestAlertsGetPending(t *testing.T) { marker := types.NewMarker(prometheus.NewRegistry()) - alerts, err := NewAlerts(context.Background(), marker, 30*time.Minute, noopCallback{}, log.NewNopLogger()) + alerts, err := NewAlerts(context.Background(), marker, 30*time.Minute, noopCallback{}, log.NewNopLogger(), nil) if err != nil { t.Fatal(err) } @@ -285,7 +285,7 @@ func TestAlertsGetPending(t *testing.T) { func TestAlertsGC(t *testing.T) { marker := types.NewMarker(prometheus.NewRegistry()) - alerts, err := NewAlerts(context.Background(), marker, 200*time.Millisecond, noopCallback{}, log.NewNopLogger()) + alerts, err := NewAlerts(context.Background(), marker, 200*time.Millisecond, noopCallback{}, log.NewNopLogger(), nil) if err != nil { t.Fatal(err) } @@ -322,7 +322,7 @@ func TestAlertsStoreCallback(t *testing.T) { cb := &limitCountCallback{limit: 3} marker := types.NewMarker(prometheus.NewRegistry()) - alerts, err := NewAlerts(context.Background(), marker, 200*time.Millisecond, cb, log.NewNopLogger()) + alerts, err := NewAlerts(context.Background(), marker, 200*time.Millisecond, cb, log.NewNopLogger(), nil) if err != nil { t.Fatal(err) } @@ -383,6 +383,74 @@ func TestAlertsStoreCallback(t *testing.T) { } } +func TestAlerts_Count(t *testing.T) { + marker := types.NewMarker(prometheus.NewRegistry()) + alerts, err := NewAlerts(context.Background(), marker, 200*time.Millisecond, nil, log.NewNopLogger(), nil) + require.NoError(t, err) + + states := []types.AlertState{types.AlertStateActive, types.AlertStateSuppressed, types.AlertStateUnprocessed} + + countByState := func(st types.AlertState) int { + return alerts.count(st) + } + countTotal := func() int { + var count int + for _, st := range states { + count += countByState(st) + } + return count + } + + // First, there shouldn't be any alerts. + require.Equal(t, 0, countTotal()) + + // When you insert a new alert that will eventually be active, it should be unprocessed first. 
+ now := time.Now()
+ a1 := &types.Alert{
+ Alert: model.Alert{
+ Labels: model.LabelSet{"bar": "foo"},
+ Annotations: model.LabelSet{"foo": "bar"},
+ StartsAt: now,
+ EndsAt: now.Add(400 * time.Millisecond),
+ GeneratorURL: "http://example.com/prometheus",
+ },
+ UpdatedAt: now,
+ Timeout: false,
+ }
+
+ alerts.Put(a1)
+ require.Equal(t, 1, countByState(types.AlertStateUnprocessed))
+ require.Equal(t, 1, countTotal())
+ require.Eventually(t, func() bool {
+ // Once the alert expires and is considered resolved, it no longer counts.
+ return countTotal() == 0
+ }, 600*time.Millisecond, 100*time.Millisecond)
+
+ now = time.Now()
+ a2 := &types.Alert{
+ Alert: model.Alert{
+ Labels: model.LabelSet{"bar": "foo"},
+ Annotations: model.LabelSet{"foo": "bar"},
+ StartsAt: now,
+ EndsAt: now.Add(400 * time.Millisecond),
+ GeneratorURL: "http://example.com/prometheus",
+ },
+ UpdatedAt: now,
+ Timeout: false,
+ }
+
+ // When we insert an alert and then silence it, it is counted under the suppressed state.
+ alerts.Put(a2)
+ marker.SetSilenced(a2.Fingerprint(), 1, []string{"1"}, nil)
+ require.Equal(t, 1, countByState(types.AlertStateSuppressed))
+ require.Equal(t, 1, countTotal())
+
+ require.Eventually(t, func() bool {
+ // Once the alert expires and is considered resolved, it no longer counts.
+ return countTotal() == 0
+ }, 600*time.Millisecond, 100*time.Millisecond)
+}
+
 func alertsEqual(a1, a2 *types.Alert) bool {
 if a1 == nil || a2 == nil {
 return false
diff --git a/types/types.go b/types/types.go
index 521d8670eb..ce45301d5b 100644
--- a/types/types.go
+++ b/types/types.go
@@ -108,11 +108,11 @@ type memMarker struct {
 }
 
 func (m *memMarker) registerMetrics(r prometheus.Registerer) {
- newAlertMetricByState := func(st AlertState) prometheus.GaugeFunc {
+ newMarkedAlertMetricByState := func(st AlertState) prometheus.GaugeFunc {
 return prometheus.NewGaugeFunc(
 prometheus.GaugeOpts{
- Name: "alertmanager_alerts",
- Help: "How many alerts by state.",
+ Name: "alertmanager_marked_alerts",
+ Help: "How many alerts by state are currently marked in the Alertmanager regardless of their expiry.",
 ConstLabels: prometheus.Labels{"state": string(st)},
 },
 func() float64 {
@@ -121,11 +121,13 @@ func (m *memMarker) registerMetrics(r prometheus.Registerer) {
 )
 }
 
- alertsActive := newAlertMetricByState(AlertStateActive)
- alertsSuppressed := newAlertMetricByState(AlertStateSuppressed)
+ alertsActive := newMarkedAlertMetricByState(AlertStateActive)
+ alertsSuppressed := newMarkedAlertMetricByState(AlertStateSuppressed)
+ alertStateUnprocessed := newMarkedAlertMetricByState(AlertStateUnprocessed)
 
 r.MustRegister(alertsActive)
 r.MustRegister(alertsSuppressed)
+ r.MustRegister(alertStateUnprocessed)
 }
 
 // Count implements Marker.
diff --git a/types/types_test.go b/types/types_test.go
index 84cb21b4f7..edb6969776 100644
--- a/types/types_test.go
+++ b/types/types_test.go
@@ -25,6 +25,58 @@ import (
 "github.com/stretchr/testify/require"
 )
 
+func TestMemMarker_Count(t *testing.T) {
+ r := prometheus.NewRegistry()
+ marker := NewMarker(r)
+ now := time.Now()
+
+ states := []AlertState{AlertStateSuppressed, AlertStateActive, AlertStateUnprocessed}
+ countByState := func(state AlertState) int {
+ return marker.Count(state)
+ }
+
+ countTotal := func() int {
+ var count int
+ for _, s := range states {
+ count += countByState(s)
+ }
+ return count
+ }
+
+ require.Equal(t, 0, countTotal())
+
+ a1 := model.Alert{
+ StartsAt: now.Add(-2 * time.Minute),
+ EndsAt: now.Add(2 * time.Minute),
+ Labels: model.LabelSet{"test": "active"},
+ }
+ a2 := model.Alert{
+ StartsAt: now.Add(-2 * time.Minute),
+ EndsAt: now.Add(2 * time.Minute),
+ Labels: model.LabelSet{"test": "suppressed"},
+ }
+ a3 := model.Alert{
+ StartsAt: now.Add(-2 * time.Minute),
+ EndsAt: now.Add(-1 * time.Minute),
+ Labels: model.LabelSet{"test": "resolved"},
+ }
+
+ // Insert an active alert.
+ marker.SetSilenced(a1.Fingerprint(), 1, nil, nil)
+ require.Equal(t, 1, countByState(AlertStateActive))
+ require.Equal(t, 1, countTotal())
+
+ // Insert a suppressed alert.
+ marker.SetSilenced(a2.Fingerprint(), 1, []string{"1"}, nil)
+ require.Equal(t, 1, countByState(AlertStateSuppressed))
+ require.Equal(t, 2, countTotal())
+
+ // Insert a resolved alert - it still counts, since the marker is unaware of expiry; the active count is unchanged.
+ marker.SetSilenced(a3.Fingerprint(), 1, []string{"1"}, nil)
+ require.Equal(t, 1, countByState(AlertStateActive))
+ require.Equal(t, 3, countTotal())
+}
+
 func TestAlertMerge(t *testing.T) {
 now := time.Now()
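
For context, a minimal usage sketch of the new NewAlerts signature introduced above, wiring both metric families onto one registry: `alertmanager_marked_alerts` (registered by the marker, counting every marked alert regardless of expiry) and the repurposed `alertmanager_alerts` (registered by the mem provider, counting only stored, non-resolved alerts, i.e. what the API and UI show). This sketch is illustrative only and not part of the patch; the GC interval and nop logger are arbitrary values, and Close() belongs to the existing provider API, which this diff does not show.

package main

import (
	"context"
	"time"

	"github.com/go-kit/log"
	"github.com/prometheus/client_golang/prometheus"

	"github.com/prometheus/alertmanager/provider/mem"
	"github.com/prometheus/alertmanager/types"
)

func main() {
	reg := prometheus.NewRegistry()

	// The marker registers alertmanager_marked_alerts{state="..."}.
	marker := types.NewMarker(reg)

	// The provider now takes a Registerer as its last argument and registers
	// alertmanager_alerts{state="..."} over stored, non-resolved alerts.
	// Passing nil instead of reg skips registration, as the updated tests do;
	// a nil callback falls back to the provider's noopCallback.
	alerts, err := mem.NewAlerts(context.Background(), marker, 30*time.Minute, nil, log.NewNopLogger(), reg)
	if err != nil {
		panic(err)
	}
	defer alerts.Close()
}

Passing prometheus.DefaultRegisterer instead, as cmd/alertmanager/main.go now does, exposes the same series on the default registry.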