mixin: Added critical Rules alerts. #2374

Merged (2 commits) on Apr 3, 2020
30 changes: 25 additions & 5 deletions examples/alerts/alerts.md
@@ -97,7 +97,7 @@ rules:
)
for: 5m
labels:
severity: warning
severity: critical
- alert: ThanosRuleHighRuleEvaluationWarnings
annotations:
message: Thanos Rule {{$labels.job}} {{$labels.pod}} has high number of evaluation
@@ -144,8 +144,8 @@ rules:
severity: info
- alert: ThanosRuleQueryHighDNSFailures
annotations:
message: Thanos Rule {{$labels.job}} have {{ $value | humanize }}% of failing
DNS queries for query endpoints.
message: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% of failing DNS
queries for query endpoints.
expr: |
(
sum by (job) (rate(thanos_ruler_query_apis_dns_failures_total{job=~"thanos-rule.*"}[5m]))
@@ -158,8 +158,8 @@ rules:
severity: warning
- alert: ThanosRuleAlertmanagerHighDNSFailures
annotations:
message: Thanos Rule {{$labels.job}} have {{ $value | humanize }}% of failing
DNS queries for Alertmanager endpoints.
message: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% of failing DNS
queries for Alertmanager endpoints.
expr: |
(
sum by (job) (rate(thanos_ruler_alertmanagers_dns_failures_total{job=~"thanos-rule.*"}[5m]))
@@ -170,6 +170,26 @@ rules:
for: 15m
labels:
severity: warning
- alert: ThanosRuleNoEvaluationFor10Intervals
annotations:
message: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% rule groups
that did not evaluate for at least 10x of their expected interval.
expr: |
time() - prometheus_rule_group_last_evaluation_timestamp_seconds{job=~"thanos-rule.*"}
>
10 * prometheus_rule_group_interval_seconds{job=~"thanos-rule.*"}
for: 5m
labels:
severity: critical
- alert: ThanosRuleTSDBNotIngestingSamples
annotations:
message: Thanos Rule {{$labels.job}} did not ingest any samples for the last 15
minutes.
expr: |
rate(prometheus_tsdb_head_samples_appended_total{job=~"thanos-rule.*"}[5m]) <= 0
for: 10m
labels:
severity: critical
```

## Store Gateway
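For intuition, the new `ThanosRuleNoEvaluationFor10Intervals` alert above compares how long ago a rule group last evaluated against ten times its configured interval. A worked example with purely illustrative numbers (not taken from this PR):

```
# Illustrative only: a rule group configured with a 60s evaluation interval
# whose last evaluation finished 700s ago.
#
#   time() - prometheus_rule_group_last_evaluation_timestamp_seconds  = 700
#   10 * prometheus_rule_group_interval_seconds                       = 600
#
# 700 > 600, so the expression returns a sample; if this persists for the
# 5m `for` window, the alert fires at severity critical.
```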
26 changes: 23 additions & 3 deletions examples/alerts/alerts.yaml
@@ -316,7 +316,7 @@ groups:
)
for: 5m
labels:
severity: warning
severity: critical
- alert: ThanosRuleHighRuleEvaluationWarnings
annotations:
message: Thanos Rule {{$labels.job}} {{$labels.pod}} has high number of evaluation
@@ -363,7 +363,7 @@ groups:
severity: info
- alert: ThanosRuleQueryHighDNSFailures
annotations:
message: Thanos Rule {{$labels.job}} have {{ $value | humanize }}% of failing
message: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% of failing
DNS queries for query endpoints.
expr: |
(
@@ -377,7 +377,7 @@
severity: warning
- alert: ThanosRuleAlertmanagerHighDNSFailures
annotations:
message: Thanos Rule {{$labels.job}} have {{ $value | humanize }}% of failing
message: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% of failing
DNS queries for Alertmanager endpoints.
expr: |
(
@@ -389,6 +389,26 @@
for: 15m
labels:
severity: warning
- alert: ThanosRuleNoEvaluationFor10Intervals
annotations:
message: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% rule groups
that did not evaluate for at least 10x of their expected interval.
expr: |
time() - prometheus_rule_group_last_evaluation_timestamp_seconds{job=~"thanos-rule.*"}
>
10 * prometheus_rule_group_interval_seconds{job=~"thanos-rule.*"}
for: 5m
labels:
severity: critical
- alert: ThanosRuleTSDBNotIngestingSamples
annotations:
message: Thanos Rule {{$labels.job}} did not ingest any samples for the last
15 minutes.
expr: |
rate(prometheus_tsdb_head_samples_appended_total{job=~"thanos-rule.*"}[5m]) <= 0
for: 10m
labels:
severity: critical
- name: thanos-component-absent.rules
rules:
- alert: ThanosCompactIsDown
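As a sanity check, the rules added to `examples/alerts/alerts.yaml` can be exercised with Prometheus' `promtool test rules`. The sketch below is a minimal, hypothetical unit test for `ThanosRuleTSDBNotIngestingSamples`; the test file name, the `job="thanos-rule"` label value, and the assumption that the rule file sits next to it as `alerts.yaml` are illustrative, not part of this PR.

```yaml
# rules_test.yaml -- hypothetical promtool unit test for the new alert.
rule_files:
  - alerts.yaml
evaluation_interval: 1m
tests:
  - interval: 1m
    input_series:
      # A Thanos Rule instance that appends no samples for the whole test window.
      - series: 'prometheus_tsdb_head_samples_appended_total{job="thanos-rule"}'
        values: '0+0x20'
    alert_rule_test:
      - eval_time: 15m
        alertname: ThanosRuleTSDBNotIngestingSamples
        exp_alerts:
          - exp_labels:
              severity: critical
              job: thanos-rule
            exp_annotations:
              message: 'Thanos Rule thanos-rule did not ingest any samples for the last 15 minutes.'
```

Run it with `promtool test rules rules_test.yaml`; the expected annotation has to match the rendered template string exactly.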
36 changes: 33 additions & 3 deletions mixin/thanos/alerts/rule.libsonnet
@@ -54,7 +54,7 @@

'for': '5m',
labels: {
severity: 'warning',
severity: 'critical',
},
},
{
@@ -120,7 +120,7 @@
{
alert: 'ThanosRuleQueryHighDNSFailures',
annotations: {
message: 'Thanos Rule {{$labels.job}} have {{ $value | humanize }}% of failing DNS queries for query endpoints.',
message: 'Thanos Rule {{$labels.job}} has {{ $value | humanize }}% of failing DNS queries for query endpoints.',
},
expr: |||
(
@@ -138,7 +138,7 @@
{
alert: 'ThanosRuleAlertmanagerHighDNSFailures',
annotations: {
message: 'Thanos Rule {{$labels.job}} have {{ $value | humanize }}% of failing DNS queries for Alertmanager endpoints.',
message: 'Thanos Rule {{$labels.job}} has {{ $value | humanize }}% of failing DNS queries for Alertmanager endpoints.',
},
expr: |||
(
@@ -153,6 +153,36 @@
severity: 'warning',
},
},
{
// NOTE: This alert will give false positive if no rules are configured.
alert: 'ThanosRuleNoEvaluationFor10Intervals',
annotations: {
message: 'Thanos Rule {{$labels.job}} has {{ $value | humanize }}% rule groups that did not evaluate for at least 10x of their expected interval.',
},
expr: |||
time() - prometheus_rule_group_last_evaluation_timestamp_seconds{%(selector)s}
>
10 * prometheus_rule_group_interval_seconds{%(selector)s}
||| % thanos.rule,
'for': '5m',
labels: {
severity: 'critical',
},
},
{
// NOTE: This alert will give false positive if no rules are configured.
alert: 'ThanosRuleTSDBNotIngestingSamples',
annotations: {
message: 'Thanos Rule {{$labels.job}} did not ingest any samples for the last 15 minutes.',
},
expr: |||
rate(prometheus_tsdb_head_samples_appended_total{%(selector)s}[5m]) <= 0
||| % thanos.rule,
'for': '10m',
labels: {
severity: 'critical',
},
},
],
},
],
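In the jsonnet above, the `%(selector)s` placeholder is filled in from the mixin's `thanos.rule` configuration via jsonnet's `%` string formatting, which is how the rendered `alerts.yaml` ends up with `job=~"thanos-rule.*"`. A minimal standalone sketch of that expansion (the local `rule` object here is an assumption standing in for the real config, chosen to match the rendered output):

```jsonnet
// Standalone illustration of how %(selector)s is expanded; `rule` is a stand-in
// for the mixin's actual config object.
local rule = { selector: 'job=~"thanos-rule.*"' };

{
  expr: |||
    rate(prometheus_tsdb_head_samples_appended_total{%(selector)s}[5m]) <= 0
  ||| % rule,
}
// Evaluates to:
//   expr: "rate(prometheus_tsdb_head_samples_appended_total{job=~\"thanos-rule.*\"}[5m]) <= 0\n"
```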