-
Notifications
You must be signed in to change notification settings - Fork 3.4k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Fix mixin generation when cluster label is changed
Signed-off-by: QuentinBisson <quentin@giantswarm.io>
- Loading branch information
1 parent
3530c61
commit c92564a
Showing
11 changed files
with
212 additions
and
228 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,41 +1,41 @@ | ||
groups: | ||
- name: loki_alerts | ||
rules: | ||
- alert: LokiRequestErrors | ||
annotations: | ||
message: | | ||
{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors. | ||
expr: | | ||
100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (namespace, job, route) | ||
/ | ||
sum(rate(loki_request_duration_seconds_count[2m])) by (namespace, job, route) | ||
> 10 | ||
for: 15m | ||
labels: | ||
severity: critical | ||
- alert: LokiRequestPanics | ||
annotations: | ||
message: | | ||
{{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% increase of panics. | ||
expr: | | ||
sum(increase(loki_panic_total[10m])) by (namespace, job) > 0 | ||
labels: | ||
severity: critical | ||
- alert: LokiRequestLatency | ||
annotations: | ||
message: | | ||
{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency. | ||
expr: | | ||
cluster_namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*|/schedulerpb.SchedulerForQuerier/QuerierLoop"} > 1 | ||
for: 15m | ||
labels: | ||
severity: critical | ||
- alert: LokiTooManyCompactorsRunning | ||
annotations: | ||
message: | | ||
{{ $labels.cluster }} {{ $labels.namespace }} has had {{ printf "%.0f" $value }} compactors running for more than 5m. Only one compactor should run at a time. | ||
expr: | | ||
sum(loki_boltdb_shipper_compactor_running) by (namespace, cluster) > 1 | ||
for: 5m | ||
labels: | ||
severity: warning | ||
- name: loki_alerts | ||
rules: | ||
- alert: LokiRequestErrors | ||
annotations: | ||
message: | | ||
{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors. | ||
expr: | | ||
100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (namespace, job, route) | ||
/ | ||
sum(rate(loki_request_duration_seconds_count[2m])) by (namespace, job, route) | ||
> 10 | ||
for: 15m | ||
labels: | ||
severity: critical | ||
- alert: LokiRequestPanics | ||
annotations: | ||
message: | | ||
{{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% increase of panics. | ||
expr: | | ||
sum(increase(loki_panic_total[10m])) by (namespace, job) > 0 | ||
labels: | ||
severity: critical | ||
- alert: LokiRequestLatency | ||
annotations: | ||
message: | | ||
{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency. | ||
expr: | | ||
cluster_namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*|/schedulerpb.SchedulerForQuerier/QuerierLoop"} > 1 | ||
for: 15m | ||
labels: | ||
severity: critical | ||
- alert: LokiTooManyCompactorsRunning | ||
annotations: | ||
message: | | ||
{{ $labels.cluster }} {{ $labels.namespace }} has had {{ printf "%.0f" $value }} compactors running for more than 5m. Only one compactor should run at a time. | ||
expr: | | ||
sum(loki_boltdb_shipper_compactor_running) by (namespace, cluster) > 1 | ||
for: 5m | ||
labels: | ||
severity: warning |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,53 +1,39 @@ | ||
groups: | ||
- name: loki_rules | ||
rules: | ||
- expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) | ||
by (le, cluster, job)) | ||
record: cluster_job:loki_request_duration_seconds:99quantile | ||
- expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) | ||
by (le, cluster, job)) | ||
record: cluster_job:loki_request_duration_seconds:50quantile | ||
- expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(loki_request_duration_seconds_count[1m])) | ||
by (cluster, job) | ||
record: cluster_job:loki_request_duration_seconds:avg | ||
- expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job) | ||
record: cluster_job:loki_request_duration_seconds_bucket:sum_rate | ||
- expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job) | ||
record: cluster_job:loki_request_duration_seconds_sum:sum_rate | ||
- expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job) | ||
record: cluster_job:loki_request_duration_seconds_count:sum_rate | ||
- expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) | ||
by (le, cluster, job, route)) | ||
record: cluster_job_route:loki_request_duration_seconds:99quantile | ||
- expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) | ||
by (le, cluster, job, route)) | ||
record: cluster_job_route:loki_request_duration_seconds:50quantile | ||
- expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route) | ||
/ sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route) | ||
record: cluster_job_route:loki_request_duration_seconds:avg | ||
- expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job, | ||
route) | ||
record: cluster_job_route:loki_request_duration_seconds_bucket:sum_rate | ||
- expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route) | ||
record: cluster_job_route:loki_request_duration_seconds_sum:sum_rate | ||
- expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route) | ||
record: cluster_job_route:loki_request_duration_seconds_count:sum_rate | ||
- expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) | ||
by (le, cluster, namespace, job, route)) | ||
record: cluster_namespace_job_route:loki_request_duration_seconds:99quantile | ||
- expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) | ||
by (le, cluster, namespace, job, route)) | ||
record: cluster_namespace_job_route:loki_request_duration_seconds:50quantile | ||
- expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace, | ||
job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, | ||
namespace, job, route) | ||
record: cluster_namespace_job_route:loki_request_duration_seconds:avg | ||
- expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, | ||
job, route) | ||
record: cluster_namespace_job_route:loki_request_duration_seconds_bucket:sum_rate | ||
- expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace, | ||
job, route) | ||
record: cluster_namespace_job_route:loki_request_duration_seconds_sum:sum_rate | ||
- expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, namespace, | ||
job, route) | ||
record: cluster_namespace_job_route:loki_request_duration_seconds_count:sum_rate | ||
- name: loki_rules | ||
rules: | ||
- expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job)) | ||
record: cluster_job:loki_request_duration_seconds:99quantile | ||
- expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job)) | ||
record: cluster_job:loki_request_duration_seconds:50quantile | ||
- expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job) | ||
record: cluster_job:loki_request_duration_seconds:avg | ||
- expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job) | ||
record: cluster_job:loki_request_duration_seconds_bucket:sum_rate | ||
- expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job) | ||
record: cluster_job:loki_request_duration_seconds_sum:sum_rate | ||
- expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job) | ||
record: cluster_job:loki_request_duration_seconds_count:sum_rate | ||
- expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job, route)) | ||
record: cluster_job_route:loki_request_duration_seconds:99quantile | ||
- expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job, route)) | ||
record: cluster_job_route:loki_request_duration_seconds:50quantile | ||
- expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route) | ||
record: cluster_job_route:loki_request_duration_seconds:avg | ||
- expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job, route) | ||
record: cluster_job_route:loki_request_duration_seconds_bucket:sum_rate | ||
- expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route) | ||
record: cluster_job_route:loki_request_duration_seconds_sum:sum_rate | ||
- expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route) | ||
record: cluster_job_route:loki_request_duration_seconds_count:sum_rate | ||
- expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route)) | ||
record: cluster_namespace_job_route:loki_request_duration_seconds:99quantile | ||
- expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route)) | ||
record: cluster_namespace_job_route:loki_request_duration_seconds:50quantile | ||
- expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, namespace, job, route) | ||
record: cluster_namespace_job_route:loki_request_duration_seconds:avg | ||
- expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route) | ||
record: cluster_namespace_job_route:loki_request_duration_seconds_bucket:sum_rate | ||
- expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route) | ||
record: cluster_namespace_job_route:loki_request_duration_seconds_sum:sum_rate | ||
- expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, namespace, job, route) | ||
record: cluster_namespace_job_route:loki_request_duration_seconds_count:sum_rate |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,41 +1,41 @@ | ||
groups: | ||
- name: loki_alerts | ||
rules: | ||
- alert: LokiRequestErrors | ||
annotations: | ||
message: | | ||
{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors. | ||
expr: | | ||
100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (namespace, job, route) | ||
/ | ||
sum(rate(loki_request_duration_seconds_count[2m])) by (namespace, job, route) | ||
> 10 | ||
for: 15m | ||
labels: | ||
severity: critical | ||
- alert: LokiRequestPanics | ||
annotations: | ||
message: | | ||
{{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% increase of panics. | ||
expr: | | ||
sum(increase(loki_panic_total[10m])) by (namespace, job) > 0 | ||
labels: | ||
severity: critical | ||
- alert: LokiRequestLatency | ||
annotations: | ||
message: | | ||
{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency. | ||
expr: | | ||
cluster_namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*|/schedulerpb.SchedulerForQuerier/QuerierLoop"} > 1 | ||
for: 15m | ||
labels: | ||
severity: critical | ||
- alert: LokiTooManyCompactorsRunning | ||
annotations: | ||
message: | | ||
{{ $labels.cluster }} {{ $labels.namespace }} has had {{ printf "%.0f" $value }} compactors running for more than 5m. Only one compactor should run at a time. | ||
expr: | | ||
sum(loki_boltdb_shipper_compactor_running) by (namespace, cluster) > 1 | ||
for: 5m | ||
labels: | ||
severity: warning | ||
- name: loki_alerts | ||
rules: | ||
- alert: LokiRequestErrors | ||
annotations: | ||
message: | | ||
{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors. | ||
expr: | | ||
100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (namespace, job, route) | ||
/ | ||
sum(rate(loki_request_duration_seconds_count[2m])) by (namespace, job, route) | ||
> 10 | ||
for: 15m | ||
labels: | ||
severity: critical | ||
- alert: LokiRequestPanics | ||
annotations: | ||
message: | | ||
{{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% increase of panics. | ||
expr: | | ||
sum(increase(loki_panic_total[10m])) by (namespace, job) > 0 | ||
labels: | ||
severity: critical | ||
- alert: LokiRequestLatency | ||
annotations: | ||
message: | | ||
{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency. | ||
expr: | | ||
cluster_namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*|/schedulerpb.SchedulerForQuerier/QuerierLoop"} > 1 | ||
for: 15m | ||
labels: | ||
severity: critical | ||
- alert: LokiTooManyCompactorsRunning | ||
annotations: | ||
message: | | ||
{{ $labels.cluster }} {{ $labels.namespace }} has had {{ printf "%.0f" $value }} compactors running for more than 5m. Only one compactor should run at a time. | ||
expr: | | ||
sum(loki_boltdb_shipper_compactor_running) by (namespace, cluster) > 1 | ||
for: 5m | ||
labels: | ||
severity: warning |
Oops, something went wrong.