From 7c58e37fc3ede83d9ccf2ef8550674945a8a0413 Mon Sep 17 00:00:00 2001 From: mtweten Date: Thu, 17 Oct 2024 10:37:43 -0500 Subject: [PATCH] [mixin/alerts]: Enable configuring job prefix for alerts to prevent clashes with metrics from Loki/Tempo --- CHANGELOG.md | 1 + .../templates/metamonitoring/mixin-alerts.yaml | 2 +- operations/mimir-mixin-compiled-baremetal/alerts.yaml | 2 +- operations/mimir-mixin-compiled/alerts.yaml | 2 +- operations/mimir-mixin/alerts/alerts-utils.libsonnet | 4 ++-- operations/mimir-mixin/alerts/alerts.libsonnet | 4 ++-- operations/mimir-mixin/config.libsonnet | 3 +++ 7 files changed, 11 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index da28cd2ad71..b45d164299a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -53,6 +53,7 @@ ### Mixin * [ENHANCEMENT] Unify ingester autoscaling panels on 'Mimir / Writes' dashboard to work for both ingest-storage and non-ingest-storage autoscaling. #9617 +* [ENHANCEMENT] Alerts: Enable configuring job prefix for alerts to prevent clashes with metrics from Loki/Tempo. #9659 * [BUGFIX] Dashboards: Fix autoscaling metrics joins when series churn. #9412 #9450 #9432 * [BUGFIX] Alerts: Fix autoscaling metrics joins in `MimirAutoscalerNotActive` when series churn. #9412 * [BUGFIX] Alerts: Exclude failed cache "add" operations from alerting since failures are expected in normal operation. 
#9658 diff --git a/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml b/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml index 8d9dd76a97b..865af5cd014 100644 --- a/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml +++ b/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml @@ -537,7 +537,7 @@ spec: expr: | max by (cluster, namespace) (memberlist_client_cluster_members_count) > - (sum by (cluster, namespace) (up{job=~".+/(admin-api|alertmanager|compactor.*|distributor.*|ingester.*|querier.*|ruler|ruler-querier.*|store-gateway.*|cortex|mimir|mimir-write.*|mimir-read.*|mimir-backend.*)"}) + 10) + (sum by (cluster, namespace) (up{job=~".*/(admin-api|alertmanager|compactor.*|distributor.*|ingester.*|querier.*|ruler|ruler-querier.*|store-gateway.*|cortex|mimir|mimir-write.*|mimir-read.*|mimir-backend.*)"}) + 10) for: 20m labels: severity: warning diff --git a/operations/mimir-mixin-compiled-baremetal/alerts.yaml b/operations/mimir-mixin-compiled-baremetal/alerts.yaml index f9a76ec9208..70eed51a877 100644 --- a/operations/mimir-mixin-compiled-baremetal/alerts.yaml +++ b/operations/mimir-mixin-compiled-baremetal/alerts.yaml @@ -515,7 +515,7 @@ groups: expr: | max by (cluster, namespace) (memberlist_client_cluster_members_count) > - (sum by (cluster, namespace) (up{job=~".+/(admin-api|alertmanager|compactor.*|distributor.*|ingester.*|querier.*|ruler|ruler-querier.*|store-gateway.*|cortex|mimir|mimir-write.*|mimir-read.*|mimir-backend.*)"}) + 10) + (sum by (cluster, namespace) (up{job=~".*/(admin-api|alertmanager|compactor.*|distributor.*|ingester.*|querier.*|ruler|ruler-querier.*|store-gateway.*|cortex|mimir|mimir-write.*|mimir-read.*|mimir-backend.*)"}) + 10) for: 20m labels: severity: warning diff --git 
a/operations/mimir-mixin-compiled/alerts.yaml b/operations/mimir-mixin-compiled/alerts.yaml index e4633662070..58de80d1e10 100644 --- a/operations/mimir-mixin-compiled/alerts.yaml +++ b/operations/mimir-mixin-compiled/alerts.yaml @@ -525,7 +525,7 @@ groups: expr: | max by (cluster, namespace) (memberlist_client_cluster_members_count) > - (sum by (cluster, namespace) (up{job=~".+/(admin-api|alertmanager|compactor.*|distributor.*|ingester.*|querier.*|ruler|ruler-querier.*|store-gateway.*|cortex|mimir|mimir-write.*|mimir-read.*|mimir-backend.*)"}) + 10) + (sum by (cluster, namespace) (up{job=~".*/(admin-api|alertmanager|compactor.*|distributor.*|ingester.*|querier.*|ruler|ruler-querier.*|store-gateway.*|cortex|mimir|mimir-write.*|mimir-read.*|mimir-backend.*)"}) + 10) for: 20m labels: severity: warning diff --git a/operations/mimir-mixin/alerts/alerts-utils.libsonnet b/operations/mimir-mixin/alerts/alerts-utils.libsonnet index 3f3307a100a..3d344020f64 100644 --- a/operations/mimir-mixin/alerts/alerts-utils.libsonnet +++ b/operations/mimir-mixin/alerts/alerts-utils.libsonnet @@ -10,10 +10,10 @@ $._config.product + name, jobMatcher(job):: - 'job=~".*/%s"' % formatJobForQuery(job), + '%s=~"%s%s"' % [$._config.per_job_label, $._config.alert_job_prefix, formatJobForQuery(job)], jobNotMatcher(job):: - 'job!~".*/%s"' % formatJobForQuery(job), + '%s!~"%s%s"' % [$._config.per_job_label, $._config.alert_job_prefix, formatJobForQuery(job)], local formatJobForQuery(job) = if std.isArray(job) then '(%s)' % std.join('|', job) diff --git a/operations/mimir-mixin/alerts/alerts.libsonnet b/operations/mimir-mixin/alerts/alerts.libsonnet index 2f76caeba72..3ebd24e6b17 100644 --- a/operations/mimir-mixin/alerts/alerts.libsonnet +++ b/operations/mimir-mixin/alerts/alerts.libsonnet @@ -779,8 +779,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; ||| max by (%s) (memberlist_client_cluster_members_count) > - (sum by (%s) (up{%s=~".+/%s"}) + 10) - ||| % 
[$._config.alert_aggregation_labels, $._config.alert_aggregation_labels, $._config.per_job_label, simpleRegexpOpt($._config.job_names.ring_members)], + (sum by (%s) (up{%s}) + 10) + ||| % [$._config.alert_aggregation_labels, $._config.alert_aggregation_labels, $.jobMatcher($._config.job_names.ring_members)], 'for': '20m', labels: { severity: 'warning', diff --git a/operations/mimir-mixin/config.libsonnet b/operations/mimir-mixin/config.libsonnet index e3527e1d876..464382465f6 100644 --- a/operations/mimir-mixin/config.libsonnet +++ b/operations/mimir-mixin/config.libsonnet @@ -203,6 +203,9 @@ // Used to add extra annotations to all alerts, Careful: takes precedence over default annotations. alert_extra_annotations: {}, + // Used as the job prefix in alerts that select on the job label (e.g. GossipMembersTooHigh, RingMembersMismatch). The value is a regular expression prepended directly to the job name pattern, so it must include the trailing slash. It can be set to a known namespace (e.g. 'mimir/') to prevent those alerts from firing incorrectly due to selecting similar metrics from Loki/Tempo. + alert_job_prefix: '.*/', + // Whether alerts for experimental ingest storage are enabled. ingest_storage_enabled: true,