@@ -278,6 +278,7 @@ type MaintenanceFunc func() (int64, error)
278278
279279type metrics struct {
280280 gcDuration prometheus.Summary
281+ gcErrorsTotal prometheus.Counter
281282 snapshotDuration prometheus.Summary
282283 snapshotSize prometheus.Gauge
283284 queriesTotal prometheus.Counter
@@ -323,6 +324,10 @@ func newMetrics(r prometheus.Registerer, s *Silences) *metrics {
323324 Help : "Duration of the last silence garbage collection cycle." ,
324325 Objectives : map [float64 ]float64 {},
325326 })
327+ m .gcErrorsTotal = promauto .With (r ).NewCounter (prometheus.CounterOpts {
328+ Name : "alertmanager_silences_gc_errors_total" ,
329+ Help : "How many silence GC errors were encountered." ,
330+ })
326331 m .snapshotDuration = promauto .With (r ).NewSummary (prometheus.SummaryOpts {
327332 Name : "alertmanager_silences_snapshot_duration_seconds" ,
328333 Help : "Duration of the last silence snapshot." ,
@@ -540,8 +545,7 @@ Loop:
540545 break Loop
541546 case <- t .C :
542547 if err := runMaintenance (doMaintenance ); err != nil {
543- // @tjhop: this should probably log at error level
544- s .logger .Info ("Running maintenance failed" , "err" , err )
548+ s .logger .Error ("Running maintenance failed" , "err" , err )
545549 }
546550 }
547551 }
@@ -564,6 +568,7 @@ func (s *Silences) GC() (int, error) {
564568
565569 now := s .nowUTC ()
566570 var n int
571+ var errs error
567572
568573 s .mtx .Lock ()
569574 defer s .mtx .Unlock ()
@@ -586,15 +591,21 @@ func (s *Silences) GC() (int, error) {
586591 // Iterate state map directly (fast - no extra lookups).
587592 for _ , sv := range s .vi {
588593 sil , ok := s .st [sv .id ]
589- // FIXME: in both these cases rather than breaking GC forever
590- // we should increase an error metric, log, and drop the culprit silence.
594+ expire := false
591595 if ! ok {
592- return n , errors .New ("silence in index missing from state" )
596+ // The silence is in the version index but missing from state: count the error and drop it from the index.
597+ s .metrics .gcErrorsTotal .Inc ()
598+ errs = errors .Join (errs , fmt .Errorf ("silence %s in version index missing from state" , sv .id ))
599+ // Entries not added to targetVi are dropped once targetVi replaces s.vi below.
600+ continue
593601 }
594602 if sil .ExpiresAt .IsZero () {
595- return n , errors .New ("unexpected zero expiration timestamp" )
603+ // A zero expiration timestamp is invalid: count the error and force the silence to be deleted below.
604+ s .metrics .gcErrorsTotal .Inc ()
605+ errs = errors .Join (errs , fmt .Errorf ("silence %s has zero expiration timestamp" , sil .Silence .Id ))
606+ expire = true
596607 }
597- if ! sil .ExpiresAt .After (now ) {
608+ if expire || ! sil .ExpiresAt .After (now ) {
598609 delete (s .st , sil .Silence .Id )
599610 delete (s .mi , sil .Silence .Id )
600611 n ++
@@ -610,7 +621,7 @@ func (s *Silences) GC() (int, error) {
610621 s .vi = targetVi
611622 s .updateSizeMetrics ()
612623
613- return n , nil
624+ return n , errs
614625}
615626
616627func validateMatcher (m * pb.Matcher ) error {
0 commit comments