Skip to content

Commit

Permalink
Alert metric reports different results to what the user sees via API (#…
Browse files Browse the repository at this point in the history
…2943)

* Alert metric reports different results to what the user sees via API

Fixes #1439 and #2619.

The previous metric is not _technically_ reporting incorrect results as the alerts _are_ still around and will be re-used if that same alert (equal fingerprint) is received before it is GCed. Therefore, I have kept the old metric under a new name `alertmanager_marked_alerts` and repurpose the current metric to match what the user sees in the UI.

Signed-off-by: gotjosh <josue.abreu@gmail.com>
  • Loading branch information
gotjosh authored Jun 16, 2022
1 parent 4d09995 commit 805e505
Show file tree
Hide file tree
Showing 6 changed files with 187 additions and 18 deletions.
2 changes: 1 addition & 1 deletion cmd/alertmanager/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -339,7 +339,7 @@ func run() int {
go peer.Settle(ctx, *gossipInterval*10)
}

alerts, err := mem.NewAlerts(context.Background(), marker, *alertGCInterval, nil, logger)
alerts, err := mem.NewAlerts(context.Background(), marker, *alertGCInterval, nil, logger, prometheus.DefaultRegisterer)
if err != nil {
level.Error(logger).Log("err", err)
return 1
Expand Down
8 changes: 4 additions & 4 deletions dispatch/dispatch_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -366,7 +366,7 @@ route:
logger := log.NewNopLogger()
route := NewRoute(conf.Route, nil)
marker := types.NewMarker(prometheus.NewRegistry())
alerts, err := mem.NewAlerts(context.Background(), marker, time.Hour, nil, logger)
alerts, err := mem.NewAlerts(context.Background(), marker, time.Hour, nil, logger, nil)
if err != nil {
t.Fatal(err)
}
Expand Down Expand Up @@ -504,7 +504,7 @@ route:
logger := log.NewNopLogger()
route := NewRoute(conf.Route, nil)
marker := types.NewMarker(prometheus.NewRegistry())
alerts, err := mem.NewAlerts(context.Background(), marker, time.Hour, nil, logger)
alerts, err := mem.NewAlerts(context.Background(), marker, time.Hour, nil, logger, nil)
if err != nil {
t.Fatal(err)
}
Expand Down Expand Up @@ -625,7 +625,7 @@ func newAlert(labels model.LabelSet) *types.Alert {
func TestDispatcherRace(t *testing.T) {
logger := log.NewNopLogger()
marker := types.NewMarker(prometheus.NewRegistry())
alerts, err := mem.NewAlerts(context.Background(), marker, time.Hour, nil, logger)
alerts, err := mem.NewAlerts(context.Background(), marker, time.Hour, nil, logger, nil)
if err != nil {
t.Fatal(err)
}
Expand All @@ -642,7 +642,7 @@ func TestDispatcherRaceOnFirstAlertNotDeliveredWhenGroupWaitIsZero(t *testing.T)

logger := log.NewNopLogger()
marker := types.NewMarker(prometheus.NewRegistry())
alerts, err := mem.NewAlerts(context.Background(), marker, time.Hour, nil, logger)
alerts, err := mem.NewAlerts(context.Background(), marker, time.Hour, nil, logger, nil)
if err != nil {
t.Fatal(err)
}
Expand Down
51 changes: 49 additions & 2 deletions provider/mem/mem.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (

"github.com/go-kit/log"
"github.com/go-kit/log/level"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/common/model"

"github.com/prometheus/alertmanager/provider"
Expand All @@ -33,8 +34,10 @@ const alertChannelLength = 200
type Alerts struct {
cancel context.CancelFunc

alerts *store.Alerts
marker types.Marker

mtx sync.Mutex
alerts *store.Alerts
listeners map[int]listeningAlerts
next int

Expand Down Expand Up @@ -62,14 +65,34 @@ type listeningAlerts struct {
done chan struct{}
}

func (a *Alerts) registerMetrics(r prometheus.Registerer) {
newMemAlertByStatus := func(s types.AlertState) prometheus.GaugeFunc {
return prometheus.NewGaugeFunc(
prometheus.GaugeOpts{
Name: "alertmanager_alerts",
Help: "How many alerts by state.",
ConstLabels: prometheus.Labels{"state": string(s)},
},
func() float64 {
return float64(a.count(s))
},
)
}

r.MustRegister(newMemAlertByStatus(types.AlertStateActive))
r.MustRegister(newMemAlertByStatus(types.AlertStateSuppressed))
r.MustRegister(newMemAlertByStatus(types.AlertStateUnprocessed))
}

// NewAlerts returns a new alert provider.
func NewAlerts(ctx context.Context, m types.Marker, intervalGC time.Duration, alertCallback AlertStoreCallback, l log.Logger) (*Alerts, error) {
func NewAlerts(ctx context.Context, m types.Marker, intervalGC time.Duration, alertCallback AlertStoreCallback, l log.Logger, r prometheus.Registerer) (*Alerts, error) {
if alertCallback == nil {
alertCallback = noopCallback{}
}

ctx, cancel := context.WithCancel(ctx)
a := &Alerts{
marker: m,
alerts: store.NewAlerts(),
cancel: cancel,
listeners: map[int]listeningAlerts{},
Expand Down Expand Up @@ -98,6 +121,11 @@ func NewAlerts(ctx context.Context, m types.Marker, intervalGC time.Duration, al
}
a.mtx.Unlock()
})

if r != nil {
a.registerMetrics(r)
}

go a.alerts.Run(ctx, intervalGC)

return a, nil
Expand Down Expand Up @@ -212,6 +240,25 @@ func (a *Alerts) Put(alerts ...*types.Alert) error {
return nil
}

// count returns the number of non-resolved alerts we currently have stored filtered by the provided state.
func (a *Alerts) count(state types.AlertState) int {
var count int
for _, alert := range a.alerts.List() {
if alert.Resolved() {
continue
}

status := a.marker.Status(alert.Fingerprint())
if status.State != state {
continue
}

count++
}

return count
}

type noopCallback struct{}

func (n noopCallback) PreStore(_ *types.Alert, _ bool) error { return nil }
Expand Down
80 changes: 74 additions & 6 deletions provider/mem/mem_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ func init() {
// a listener can not unsubscribe as the lock is hold by `alerts.Lock`.
func TestAlertsSubscribePutStarvation(t *testing.T) {
marker := types.NewMarker(prometheus.NewRegistry())
alerts, err := NewAlerts(context.Background(), marker, 30*time.Minute, noopCallback{}, log.NewNopLogger())
alerts, err := NewAlerts(context.Background(), marker, 30*time.Minute, noopCallback{}, log.NewNopLogger(), nil)
if err != nil {
t.Fatal(err)
}
Expand Down Expand Up @@ -137,7 +137,7 @@ func TestAlertsSubscribePutStarvation(t *testing.T) {

func TestAlertsPut(t *testing.T) {
marker := types.NewMarker(prometheus.NewRegistry())
alerts, err := NewAlerts(context.Background(), marker, 30*time.Minute, noopCallback{}, log.NewNopLogger())
alerts, err := NewAlerts(context.Background(), marker, 30*time.Minute, noopCallback{}, log.NewNopLogger(), nil)
if err != nil {
t.Fatal(err)
}
Expand Down Expand Up @@ -165,7 +165,7 @@ func TestAlertsSubscribe(t *testing.T) {

ctx, cancel := context.WithCancel(context.Background())
defer cancel()
alerts, err := NewAlerts(ctx, marker, 30*time.Minute, noopCallback{}, log.NewNopLogger())
alerts, err := NewAlerts(ctx, marker, 30*time.Minute, noopCallback{}, log.NewNopLogger(), nil)
if err != nil {
t.Fatal(err)
}
Expand Down Expand Up @@ -242,7 +242,7 @@ func TestAlertsSubscribe(t *testing.T) {

func TestAlertsGetPending(t *testing.T) {
marker := types.NewMarker(prometheus.NewRegistry())
alerts, err := NewAlerts(context.Background(), marker, 30*time.Minute, noopCallback{}, log.NewNopLogger())
alerts, err := NewAlerts(context.Background(), marker, 30*time.Minute, noopCallback{}, log.NewNopLogger(), nil)
if err != nil {
t.Fatal(err)
}
Expand Down Expand Up @@ -285,7 +285,7 @@ func TestAlertsGetPending(t *testing.T) {

func TestAlertsGC(t *testing.T) {
marker := types.NewMarker(prometheus.NewRegistry())
alerts, err := NewAlerts(context.Background(), marker, 200*time.Millisecond, noopCallback{}, log.NewNopLogger())
alerts, err := NewAlerts(context.Background(), marker, 200*time.Millisecond, noopCallback{}, log.NewNopLogger(), nil)
if err != nil {
t.Fatal(err)
}
Expand Down Expand Up @@ -322,7 +322,7 @@ func TestAlertsStoreCallback(t *testing.T) {
cb := &limitCountCallback{limit: 3}

marker := types.NewMarker(prometheus.NewRegistry())
alerts, err := NewAlerts(context.Background(), marker, 200*time.Millisecond, cb, log.NewNopLogger())
alerts, err := NewAlerts(context.Background(), marker, 200*time.Millisecond, cb, log.NewNopLogger(), nil)
if err != nil {
t.Fatal(err)
}
Expand Down Expand Up @@ -383,6 +383,74 @@ func TestAlertsStoreCallback(t *testing.T) {
}
}

func TestAlerts_Count(t *testing.T) {
marker := types.NewMarker(prometheus.NewRegistry())
alerts, err := NewAlerts(context.Background(), marker, 200*time.Millisecond, nil, log.NewNopLogger(), nil)
require.NoError(t, err)

states := []types.AlertState{types.AlertStateActive, types.AlertStateSuppressed, types.AlertStateUnprocessed}

countByState := func(st types.AlertState) int {
return alerts.count(st)
}
countTotal := func() int {
var count int
for _, st := range states {
count += countByState(st)
}
return count
}

// First, there shouldn't be any alerts.
require.Equal(t, 0, countTotal())

// When you insert a new alert that will eventually be active, it should be unprocessed first.
now := time.Now()
a1 := &types.Alert{
Alert: model.Alert{
Labels: model.LabelSet{"bar": "foo"},
Annotations: model.LabelSet{"foo": "bar"},
StartsAt: now,
EndsAt: now.Add(400 * time.Millisecond),
GeneratorURL: "http://example.com/prometheus",
},
UpdatedAt: now,
Timeout: false,
}

alerts.Put(a1)
require.Equal(t, 1, countByState(types.AlertStateUnprocessed))
require.Equal(t, 1, countTotal())
require.Eventually(t, func() bool {
// When the alert will eventually expire and is considered resolved - it won't count.
return countTotal() == 0
}, 600*time.Millisecond, 100*time.Millisecond)

now = time.Now()
a2 := &types.Alert{
Alert: model.Alert{
Labels: model.LabelSet{"bar": "foo"},
Annotations: model.LabelSet{"foo": "bar"},
StartsAt: now,
EndsAt: now.Add(400 * time.Millisecond),
GeneratorURL: "http://example.com/prometheus",
},
UpdatedAt: now,
Timeout: false,
}

// When insert an alert, and then silence it. It shows up with the correct filter.
alerts.Put(a2)
marker.SetSilenced(a2.Fingerprint(), 1, []string{"1"}, nil)
require.Equal(t, 1, countByState(types.AlertStateSuppressed))
require.Equal(t, 1, countTotal())

require.Eventually(t, func() bool {
// When the alert will eventually expire and is considered resolved - it won't count.
return countTotal() == 0
}, 600*time.Millisecond, 100*time.Millisecond)
}

func alertsEqual(a1, a2 *types.Alert) bool {
if a1 == nil || a2 == nil {
return false
Expand Down
12 changes: 7 additions & 5 deletions types/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -108,11 +108,11 @@ type memMarker struct {
}

func (m *memMarker) registerMetrics(r prometheus.Registerer) {
newAlertMetricByState := func(st AlertState) prometheus.GaugeFunc {
newMarkedAlertMetricByState := func(st AlertState) prometheus.GaugeFunc {
return prometheus.NewGaugeFunc(
prometheus.GaugeOpts{
Name: "alertmanager_alerts",
Help: "How many alerts by state.",
Name: "alertmanager_marked_alerts",
Help: "How many alerts by state are currently marked in the Alertmanager regardless of their expiry.",
ConstLabels: prometheus.Labels{"state": string(st)},
},
func() float64 {
Expand All @@ -121,11 +121,13 @@ func (m *memMarker) registerMetrics(r prometheus.Registerer) {
)
}

alertsActive := newAlertMetricByState(AlertStateActive)
alertsSuppressed := newAlertMetricByState(AlertStateSuppressed)
alertsActive := newMarkedAlertMetricByState(AlertStateActive)
alertsSuppressed := newMarkedAlertMetricByState(AlertStateSuppressed)
alertStateUnprocessed := newMarkedAlertMetricByState(AlertStateUnprocessed)

r.MustRegister(alertsActive)
r.MustRegister(alertsSuppressed)
r.MustRegister(alertStateUnprocessed)
}

// Count implements Marker.
Expand Down
52 changes: 52 additions & 0 deletions types/types_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,58 @@ import (
"github.com/stretchr/testify/require"
)

func TestMemMarker_Count(t *testing.T) {
r := prometheus.NewRegistry()
marker := NewMarker(r)
now := time.Now()

states := []AlertState{AlertStateSuppressed, AlertStateActive, AlertStateUnprocessed}
countByState := func(state AlertState) int {
return marker.Count(state)
}

countTotal := func() int {
var count int
for _, s := range states {
count += countByState(s)
}
return count
}

require.Equal(t, 0, countTotal())

a1 := model.Alert{
StartsAt: now.Add(-2 * time.Minute),
EndsAt: now.Add(2 * time.Minute),
Labels: model.LabelSet{"test": "active"},
}
a2 := model.Alert{
StartsAt: now.Add(-2 * time.Minute),
EndsAt: now.Add(2 * time.Minute),
Labels: model.LabelSet{"test": "suppressed"},
}
a3 := model.Alert{
StartsAt: now.Add(-2 * time.Minute),
EndsAt: now.Add(-1 * time.Minute),
Labels: model.LabelSet{"test": "resolved"},
}

// Insert an active alert.
marker.SetSilenced(a1.Fingerprint(), 1, nil, nil)
require.Equal(t, 1, countByState(AlertStateActive))
require.Equal(t, 1, countTotal())

// Insert a suppressed alert.
marker.SetSilenced(a2.Fingerprint(), 1, []string{"1"}, nil)
require.Equal(t, 1, countByState(AlertStateSuppressed))
require.Equal(t, 2, countTotal())

// Insert a resolved alert - it'll count as active.
marker.SetSilenced(a3.Fingerprint(), 1, []string{"1"}, nil)
require.Equal(t, 1, countByState(AlertStateActive))
require.Equal(t, 3, countTotal())
}

func TestAlertMerge(t *testing.T) {
now := time.Now()

Expand Down

0 comments on commit 805e505

Please sign in to comment.