Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: mixin / add loki compaction not successful alert #14239

Merged
33 changes: 33 additions & 0 deletions production/loki-mixin-compiled-ssd/alerts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -43,3 +43,36 @@ groups:
for: 5m
labels:
severity: warning
# Fires when, per (cluster, namespace), the most recent non-zero
# "last successful run" timestamp is more than 3 hours (60 * 60 * 3 seconds)
# old and the condition holds for 1h. Series reporting 0 are dropped by the
# `> 0` filter, so they are not evaluated by this rule.
- alert: LokiCompactorHasNotSuccessfullyRunCompaction
annotations:
description: |
{{ $labels.cluster }} {{ $labels.namespace }} has not run compaction in the last 3 hours since the last compaction. This may indicate a problem with the compactor.
summary: Loki compaction has not run in the last 3 hours since the last compaction.
expr: |
# The "last successful run" metric is updated even if the compactor owns no tenants,
# so this alert correctly doesn't fire if compactor has nothing to do.
min (
time() - (loki_boltdb_shipper_compact_tables_operation_last_successful_run_timestamp_seconds{} > 0)
)
by (cluster, namespace)
> 60 * 60 * 3
for: 1h
labels:
severity: critical
# Fires when, per (cluster, namespace), the highest value the
# "last successful run" timestamp reached over the past 3h is exactly 0,
# and that stays true for 1h.
# NOTE(review): presumably a 0 timestamp means no successful run has been
# recorded since startup, as the annotations state - confirm against the metric.
- alert: LokiCompactorHasNotSuccessfullyRunCompaction
annotations:
description: |
{{ $labels.cluster }} {{ $labels.namespace }} has not run compaction in the last 3h since startup. This may indicate a problem with the compactor.
summary: Loki compaction has not run in the last 3h since startup.
expr: |
# The "last successful run" metric is updated even if the compactor owns no tenants,
# so this alert correctly doesn't fire if compactor has nothing to do.
max(
max_over_time(
loki_boltdb_shipper_compact_tables_operation_last_successful_run_timestamp_seconds{}[3h]
)
) by (cluster, namespace)
== 0
for: 1h
labels:
severity: critical
33 changes: 33 additions & 0 deletions production/loki-mixin-compiled/alerts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -43,3 +43,36 @@ groups:
for: 5m
labels:
severity: warning
# Fires when, per (cluster, namespace), the most recent non-zero
# "last successful run" timestamp is more than 3 hours (60 * 60 * 3 seconds)
# old and the condition holds for 1h. Series reporting 0 are dropped by the
# `> 0` filter, so they are not evaluated by this rule.
- alert: LokiCompactorHasNotSuccessfullyRunCompaction
annotations:
description: |
{{ $labels.cluster }} {{ $labels.namespace }} has not run compaction in the last 3 hours since the last compaction. This may indicate a problem with the compactor.
summary: Loki compaction has not run in the last 3 hours since the last compaction.
expr: |
# The "last successful run" metric is updated even if the compactor owns no tenants,
# so this alert correctly doesn't fire if compactor has nothing to do.
min (
time() - (loki_boltdb_shipper_compact_tables_operation_last_successful_run_timestamp_seconds{} > 0)
)
by (cluster, namespace)
> 60 * 60 * 3
for: 1h
labels:
severity: critical
# Fires when, per (cluster, namespace), the highest value the
# "last successful run" timestamp reached over the past 3h is exactly 0,
# and that stays true for 1h.
# NOTE(review): presumably a 0 timestamp means no successful run has been
# recorded since startup, as the annotations state - confirm against the metric.
- alert: LokiCompactorHasNotSuccessfullyRunCompaction
annotations:
description: |
{{ $labels.cluster }} {{ $labels.namespace }} has not run compaction in the last 3h since startup. This may indicate a problem with the compactor.
summary: Loki compaction has not run in the last 3h since startup.
expr: |
# The "last successful run" metric is updated even if the compactor owns no tenants,
# so this alert correctly doesn't fire if compactor has nothing to do.
max(
max_over_time(
loki_boltdb_shipper_compact_tables_operation_last_successful_run_timestamp_seconds{}[3h]
)
) by (cluster, namespace)
== 0
for: 1h
labels:
severity: critical
47 changes: 47 additions & 0 deletions production/loki-mixin/alerts.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,53 @@
|||, 'cluster', $._config.per_cluster_label),
},
},
{
// Fires when, per (per-cluster label, namespace), the most recent non-zero
// "last successful run" timestamp is more than 3h (60 * 60 * 3 seconds) old,
// sustained for 1h. Series reporting 0 are dropped by the `> 0` filter.
alert: 'LokiCompactorHasNotSuccessfullyRunCompaction',
// The %s in the grouping clause is filled with $._config.per_cluster_label.
expr: |||
# The "last successful run" metric is updated even if the compactor owns no tenants,
# so this alert correctly doesn't fire if compactor has nothing to do.
min (
time() - (loki_boltdb_shipper_compact_tables_operation_last_successful_run_timestamp_seconds{} > 0)
)
by (%s, namespace)
> 60 * 60 * 3
||| % $._config.per_cluster_label,
'for': '1h',
labels: {
severity: 'critical',
},
annotations: {
summary: 'Loki compaction has not run in the last 3 hours since the last compaction.',
// 'cluster' in the template is swapped for the configured per-cluster label name.
description: std.strReplace(|||
{{ $labels.cluster }} {{ $labels.namespace }} has not run compaction in the last 3 hours since the last compaction. This may indicate a problem with the compactor.
|||, 'cluster', $._config.per_cluster_label),
},
},
{
// Fires when, per (per-cluster label, namespace), the highest value the
// "last successful run" timestamp reached over the past 3h is exactly 0,
// sustained for 1h.
// NOTE(review): presumably a 0 timestamp means no successful run has been
// recorded since startup, as the annotations state - confirm against the metric.
alert: 'LokiCompactorHasNotSuccessfullyRunCompaction',
// The %s in the grouping clause is filled with $._config.per_cluster_label.
expr: |||
# The "last successful run" metric is updated even if the compactor owns no tenants,
# so this alert correctly doesn't fire if compactor has nothing to do.
max(
max_over_time(
loki_boltdb_shipper_compact_tables_operation_last_successful_run_timestamp_seconds{}[3h]
)
) by (%s, namespace)
== 0
||| % $._config.per_cluster_label,
'for': '1h',
labels: {
severity: 'critical',
},
annotations: {
summary: 'Loki compaction has not run in the last 3h since startup.',
// 'cluster' in the template is swapped for the configured per-cluster label name.
description: std.strReplace(|||
{{ $labels.cluster }} {{ $labels.namespace }} has not run compaction in the last 3h since startup. This may indicate a problem with the compactor.
|||, 'cluster', $._config.per_cluster_label),
},
},
],
},
],
Expand Down
Loading