Skip to content

Commit 4ea517b

Browse files
ultrotterGuido Trotter
andauthored
GC: report errors, but remove erroneous silences and continue (#4724)
Signed-off-by: Guido Trotter <guido@hudson-trading.com> Co-authored-by: Guido Trotter <guido@hudson-trading.com>
1 parent e3d4214 commit 4ea517b

File tree

2 files changed

+83
-8
lines changed

2 files changed

+83
-8
lines changed

silence/silence.go

Lines changed: 19 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -278,6 +278,7 @@ type MaintenanceFunc func() (int64, error)
278278

279279
type metrics struct {
280280
gcDuration prometheus.Summary
281+
gcErrorsTotal prometheus.Counter
281282
snapshotDuration prometheus.Summary
282283
snapshotSize prometheus.Gauge
283284
queriesTotal prometheus.Counter
@@ -323,6 +324,10 @@ func newMetrics(r prometheus.Registerer, s *Silences) *metrics {
323324
Help: "Duration of the last silence garbage collection cycle.",
324325
Objectives: map[float64]float64{},
325326
})
327+
m.gcErrorsTotal = promauto.With(r).NewCounter(prometheus.CounterOpts{
328+
Name: "alertmanager_silences_gc_errors_total",
329+
Help: "How many silence GC errors were encountered.",
330+
})
326331
m.snapshotDuration = promauto.With(r).NewSummary(prometheus.SummaryOpts{
327332
Name: "alertmanager_silences_snapshot_duration_seconds",
328333
Help: "Duration of the last silence snapshot.",
@@ -540,8 +545,7 @@ Loop:
540545
break Loop
541546
case <-t.C:
542547
if err := runMaintenance(doMaintenance); err != nil {
543-
// @tjhop: this should probably log at error level
544-
s.logger.Info("Running maintenance failed", "err", err)
548+
s.logger.Error("Running maintenance failed", "err", err)
545549
}
546550
}
547551
}
@@ -564,6 +568,7 @@ func (s *Silences) GC() (int, error) {
564568

565569
now := s.nowUTC()
566570
var n int
571+
var errs error
567572

568573
s.mtx.Lock()
569574
defer s.mtx.Unlock()
@@ -586,15 +591,21 @@ func (s *Silences) GC() (int, error) {
586591
// Iterate state map directly (fast - no extra lookups).
587592
for _, sv := range s.vi {
588593
sil, ok := s.st[sv.id]
589-
// FIXME: in both these cases rather than breaking GC forever
590-
// we should increase an error metric, log, and drop the culprit silence.
594+
expire := false
591595
if !ok {
592-
return n, errors.New("silence in index missing from state")
596+
// Silence in version index but not in state - remove from version index and count error
597+
s.metrics.gcErrorsTotal.Inc()
598+
errs = errors.Join(errs, fmt.Errorf("silence %s in version index missing from state", sv.id))
599+
// not adding to targetVi effectively removes it
600+
continue
593601
}
594602
if sil.ExpiresAt.IsZero() {
595-
return n, errors.New("unexpected zero expiration timestamp")
603+
// Invalid expiration timestamp - remove silence and count error
604+
s.metrics.gcErrorsTotal.Inc()
605+
errs = errors.Join(errs, fmt.Errorf("silence %s has zero expiration timestamp", sil.Silence.Id))
606+
expire = true
596607
}
597-
if !sil.ExpiresAt.After(now) {
608+
if expire || !sil.ExpiresAt.After(now) {
598609
delete(s.st, sil.Silence.Id)
599610
delete(s.mi, sil.Silence.Id)
600611
n++
@@ -610,7 +621,7 @@ func (s *Silences) GC() (int, error) {
610621
s.vi = targetVi
611622
s.updateSizeMetrics()
612623

613-
return n, nil
624+
return n, errs
614625
}
615626

616627
func validateMatcher(m *pb.Matcher) error {

silence/silence_test.go

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -332,6 +332,70 @@ func TestSilenceGCOverTime(t *testing.T) {
332332
require.Contains(t, id, "active-")
333333
}
334334
})
335+
336+
t.Run("GC continues and removes erroneous silences", func(t *testing.T) {
337+
reg := prometheus.NewRegistry()
338+
s, err := New(Options{Metrics: reg})
339+
require.NoError(t, err)
340+
clock := quartz.NewMock(t)
341+
s.clock = clock
342+
now := clock.Now()
343+
344+
// Create a valid silence
345+
validSil := &pb.Silence{
346+
Matchers: []*pb.Matcher{{
347+
Type: pb.Matcher_EQUAL,
348+
Name: "foo",
349+
Pattern: "bar",
350+
}},
351+
StartsAt: now,
352+
EndsAt: now.Add(time.Minute),
353+
}
354+
require.NoError(t, s.Set(validSil))
355+
validID := validSil.Id
356+
357+
// Manually add an erroneous silence with zero expiration
358+
erroneousSil := &pb.MeshSilence{
359+
Silence: &pb.Silence{
360+
Id: "erroneous",
361+
Matchers: []*pb.Matcher{{
362+
Type: pb.Matcher_EQUAL,
363+
Name: "bar",
364+
Pattern: "baz",
365+
}},
366+
StartsAt: now,
367+
EndsAt: now.Add(time.Minute),
368+
},
369+
ExpiresAt: time.Time{}, // Zero expiration - invalid
370+
}
371+
s.st["erroneous"] = erroneousSil
372+
s.vi.add(s.version+1, "erroneous")
373+
s.version++
374+
375+
// Manually add an entry to version index that doesn't exist in state
376+
s.vi.add(s.version+1, "missing")
377+
s.version++
378+
379+
require.Len(t, s.st, 2)
380+
require.Len(t, s.vi, 3)
381+
382+
// Run GC - should continue despite errors
383+
n, err := s.GC()
384+
require.Error(t, err)
385+
require.Contains(t, err.Error(), "zero expiration timestamp")
386+
require.Contains(t, err.Error(), "missing from state")
387+
388+
// GC should have removed erroneous silences
389+
require.Equal(t, 1, n) // Only the erroneous silence with zero expiration
390+
require.Len(t, s.st, 1)
391+
require.Len(t, s.vi, 1)
392+
require.Contains(t, s.st, validID)
393+
require.NotContains(t, s.st, "erroneous")
394+
395+
// Check that the error metric was incremented
396+
metricValue := testutil.ToFloat64(s.metrics.gcErrorsTotal)
397+
require.Equal(t, float64(2), metricValue)
398+
})
335399
}
336400

337401
func TestSilencesSnapshot(t *testing.T) {

0 commit comments

Comments
 (0)