diff --git a/CHANGELOG.md b/CHANGELOG.md index 2b1775ea6b6..3f8a9f3e7e6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -52,6 +52,7 @@ * [ENHANCEMENT] Unify ingester autoscaling panels on 'Mimir / Writes' dashboard to work for both ingest-storage and non-ingest-storage autoscaling. #9617 * [BUGFIX] Dashboards: Fix autoscaling metrics joins when series churn. #9412 #9450 #9432 * [BUGFIX] Alerts: Fix autoscaling metrics joins in `MimirAutoscalerNotActive` when series churn. #9412 +* [BUGFIX] Alerts: Exclude failed cache "add" operations from alerting since failures are expected in normal operation. #9658 ### Jsonnet diff --git a/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml b/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml index 4cb5a5f5f7f..8d9dd76a97b 100644 --- a/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml +++ b/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml @@ -119,15 +119,15 @@ spec: expr: | ( sum by(cluster, namespace, name, operation) ( - rate(thanos_memcached_operation_failures_total[1m]) + rate(thanos_memcached_operation_failures_total{operation!="add"}[1m]) or - rate(thanos_cache_operation_failures_total[1m]) + rate(thanos_cache_operation_failures_total{operation!="add"}[1m]) ) / sum by(cluster, namespace, name, operation) ( - rate(thanos_memcached_operations_total[1m]) + rate(thanos_memcached_operations_total{operation!="add"}[1m]) or - rate(thanos_cache_operations_total[1m]) + rate(thanos_cache_operations_total{operation!="add"}[1m]) ) ) * 100 > 5 for: 5m diff --git a/operations/mimir-mixin-compiled-baremetal/alerts.yaml b/operations/mimir-mixin-compiled-baremetal/alerts.yaml index 1fb8acf7cd4..f9a76ec9208 100644 --- a/operations/mimir-mixin-compiled-baremetal/alerts.yaml +++ b/operations/mimir-mixin-compiled-baremetal/alerts.yaml @@ -107,15 +107,15 @@ groups: expr: | ( sum by(cluster, namespace, name, operation) ( - rate(thanos_memcached_operation_failures_total[1m]) + rate(thanos_memcached_operation_failures_total{operation!="add"}[1m]) or - rate(thanos_cache_operation_failures_total[1m]) + rate(thanos_cache_operation_failures_total{operation!="add"}[1m]) ) / sum by(cluster, namespace, name, operation) ( - rate(thanos_memcached_operations_total[1m]) + rate(thanos_memcached_operations_total{operation!="add"}[1m]) or - rate(thanos_cache_operations_total[1m]) + rate(thanos_cache_operations_total{operation!="add"}[1m]) ) ) * 100 > 5 for: 5m diff --git a/operations/mimir-mixin-compiled/alerts.yaml b/operations/mimir-mixin-compiled/alerts.yaml index 5b8551450ff..e4633662070 100644 --- a/operations/mimir-mixin-compiled/alerts.yaml +++ b/operations/mimir-mixin-compiled/alerts.yaml @@ -107,15 +107,15 @@ groups: expr: | ( sum by(cluster, namespace, name, operation) ( - rate(thanos_memcached_operation_failures_total[1m]) + rate(thanos_memcached_operation_failures_total{operation!="add"}[1m]) or - rate(thanos_cache_operation_failures_total[1m]) + rate(thanos_cache_operation_failures_total{operation!="add"}[1m]) ) / sum by(cluster, namespace, name, operation) ( - rate(thanos_memcached_operations_total[1m]) + rate(thanos_memcached_operations_total{operation!="add"}[1m]) or - rate(thanos_cache_operations_total[1m]) + rate(thanos_cache_operations_total{operation!="add"}[1m]) ) ) * 100 > 5 for: 5m diff --git a/operations/mimir-mixin/alerts/alerts.libsonnet b/operations/mimir-mixin/alerts/alerts.libsonnet index 1304f588fb4..2f76caeba72 100644 --- a/operations/mimir-mixin/alerts/alerts.libsonnet +++ b/operations/mimir-mixin/alerts/alerts.libsonnet @@ -202,18 +202,20 @@ local utils = import 'mixin-utils/utils.libsonnet'; }, { alert: $.alertName('CacheRequestErrors'), + // Specifically exclude "add" operations which are used for cache invalidation and "locking" since + // they are expected to sometimes fail in normal operation (such as when a "lock" already exists). expr: ||| ( sum by(%(group_by)s, name, operation) ( - rate(thanos_memcached_operation_failures_total[%(range_interval)s]) + rate(thanos_memcached_operation_failures_total{operation!="add"}[%(range_interval)s]) or - rate(thanos_cache_operation_failures_total[%(range_interval)s]) + rate(thanos_cache_operation_failures_total{operation!="add"}[%(range_interval)s]) ) / sum by(%(group_by)s, name, operation) ( - rate(thanos_memcached_operations_total[%(range_interval)s]) + rate(thanos_memcached_operations_total{operation!="add"}[%(range_interval)s]) or - rate(thanos_cache_operations_total[%(range_interval)s]) + rate(thanos_cache_operations_total{operation!="add"}[%(range_interval)s]) ) ) * 100 > 5 ||| % {