Skip to content

Commit

Permalink
Adjust unhealthy metric and failed duration
Browse files Browse the repository at this point in the history
  • Loading branch information
zalegrala committed Apr 18, 2022
1 parent 2df00cf commit 0ca1ccd
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 6 deletions.
6 changes: 3 additions & 3 deletions operations/tempo-mixin/alerts.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -86,8 +86,8 @@
// wait 5m for failed flushes to self-heal using retries
alert: 'TempoIngesterFlushesUnhealthy',
expr: |||
- sum by (%s) (increase(tempo_ingester_flush_failed_retries_total{}[1h])) > %s and
- sum by (%s) (increase(tempo_ingester_flush_failed_retries_total{}[5m])) > 0
+ sum by (%s) (increase(tempo_ingester_failed_flushes_total{}[1h])) > %s and
+ sum by (%s) (increase(tempo_ingester_failed_flushes_total{}[5m])) > 0
||| % [$._config.group_by_cluster, $._config.alerts.flushes_per_hour_failed, $._config.group_by_cluster],
'for': '5m',
labels: {
Expand All @@ -105,7 +105,7 @@
sum by (%s) (increase(tempo_ingester_flush_failed_retries_total{}[1h])) > %s and
sum by (%s) (increase(tempo_ingester_flush_failed_retries_total{}[5m])) > 0
||| % [$._config.group_by_cluster, $._config.alerts.flushes_per_hour_failed, $._config.group_by_cluster],
- 'for': '10m',
+ 'for': '5m',
labels: {
severity: 'critical',
},
Expand Down
6 changes: 3 additions & 3 deletions operations/tempo-mixin/yamls/alerts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,8 @@
"message": "Greater than 2 flush retries have occurred in the past hour."
"runbook_url": "https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoIngesterFlushesFailing"
"expr": |
- sum by (cluster, namespace) (increase(tempo_ingester_flush_failed_retries_total{}[1h])) > 2 and
- sum by (cluster, namespace) (increase(tempo_ingester_flush_failed_retries_total{}[5m])) > 0
+ sum by (cluster, namespace) (increase(tempo_ingester_failed_flushes_total{}[1h])) > 2 and
+ sum by (cluster, namespace) (increase(tempo_ingester_failed_flushes_total{}[5m])) > 0
"for": "5m"
"labels":
"severity": "warning"
Expand All @@ -69,7 +69,7 @@
"expr": |
sum by (cluster, namespace) (increase(tempo_ingester_flush_failed_retries_total{}[1h])) > 2 and
sum by (cluster, namespace) (increase(tempo_ingester_flush_failed_retries_total{}[5m])) > 0
- "for": "10m"
+ "for": "5m"
"labels":
"severity": "critical"
- "alert": "TempoPollsFailing"
Expand Down

0 comments on commit 0ca1ccd

Please sign in to comment.