From ee4be2980795360528dd8f8a0fb233a6fa129e0d Mon Sep 17 00:00:00 2001 From: Douglas Camata <159076+douglascamata@users.noreply.github.com> Date: Wed, 5 Apr 2023 15:40:03 +0200 Subject: [PATCH] Update bundled thanos mixin dashboards (#445) * Update bundled thanos mixin dashboards * Update bundled rules --- jsonnetfile.lock.json | 4 +- ...bservatorium-thanos-compact.configmap.yaml | 16 +- ...servatorium-thanos-overview.configmap.yaml | 16 +- ...orium-thanos-query-frontend.configmap.yaml | 13 +- ...-observatorium-thanos-query.configmap.yaml | 12 +- ...bservatorium-thanos-receive.configmap.yaml | 34 ++-- ...d-observatorium-thanos-rule.configmap.yaml | 148 ++++++++++++++---- ...-observatorium-thanos-store.configmap.yaml | 58 ++++--- ...ium-thanos-production.prometheusrules.yaml | 4 +- ...vatorium-thanos-stage.prometheusrules.yaml | 4 +- 10 files changed, 189 insertions(+), 120 deletions(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index cb2723ac15..292b3ded26 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -356,8 +356,8 @@ "subdir": "mixin" } }, - "version": "37e37cc009dcda7bd123aedc7fec69a412d8be38", - "sum": "Io++1+lp1oQVoQiVRSCXUiGdTIRPV7aL6Ewgs3bShEs=", + "version": "96fd6db6942290e1e520d556eed2e89df4e29eea", + "sum": "YWi7WSDfTecWr5xnXODfyxGcZUnXSw9Ba6lEanaqLW4=", "name": "thanos-mixin" } ], diff --git a/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-thanos-compact.configmap.yaml b/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-thanos-compact.configmap.yaml index 0badf21752..a81ff24402 100644 --- a/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-thanos-compact.configmap.yaml +++ b/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-thanos-compact.configmap.yaml @@ -59,7 +59,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by (namespace, job, group) (rate(thanos_compact_group_compactions_total{namespace=\"$namespace\", job=~\"$job\"}[$interval]))", + "expr": "sum by (namespace, job, group) (rate(thanos_compact_group_compactions_total{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "compaction {{job}} {{group}}", @@ -244,7 +244,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by (namespace, job, group) (rate(thanos_compact_downsample_total{namespace=\"$namespace\", job=~\"$job\"}[$interval]))", + "expr": "sum by (namespace, job, group) (rate(thanos_compact_downsample_total{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "downsample {{job}} {{group}}", @@ -429,7 +429,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by (namespace, job) (rate(thanos_compact_garbage_collection_total{namespace=\"$namespace\", job=~\"$job\"}[$interval]))", + "expr": "sum by (namespace, job) (rate(thanos_compact_garbage_collection_total{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "garbage collection {{job}}", @@ -740,7 +740,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by (namespace, job) (rate(thanos_compact_blocks_cleaned_total{namespace=\"$namespace\", job=~\"$job\"}[$interval]))", + "expr": "sum by (namespace, job) (rate(thanos_compact_blocks_cleaned_total{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "Blocks cleanup {{job}}", @@ -827,7 +827,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by (namespace, job) (rate(thanos_compact_block_cleanup_failures_total{namespace=\"$namespace\", job=~\"$job\"}[$interval]))", + "expr": "sum by (namespace, job) (rate(thanos_compact_block_cleanup_failures_total{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "Blocks cleanup failures {{job}}", @@ -914,7 +914,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by (namespace, job) (rate(thanos_compact_blocks_marked_for_deletion_total{namespace=\"$namespace\", job=~\"$job\"}[$interval]))", + "expr": "sum by (namespace, job) (rate(thanos_compact_blocks_marked_for_deletion_total{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "Blocks marked {{job}}", @@ -1013,7 +1013,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by (namespace, job) (rate(thanos_blocks_meta_syncs_total{namespace=\"$namespace\", job=~\"$job\"}[$interval]))", + "expr": "sum by (namespace, job) (rate(thanos_blocks_meta_syncs_total{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "sync {{job}}", @@ -1324,7 +1324,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by (namespace, job, operation) (rate(thanos_objstore_bucket_operations_total{namespace=\"$namespace\", job=~\"$job\"}[$interval]))", + "expr": "sum by (namespace, job, operation) (rate(thanos_objstore_bucket_operations_total{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{job}} {{operation}}", diff --git a/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-thanos-overview.configmap.yaml b/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-thanos-overview.configmap.yaml index 7b096e2b54..6852c89e56 100644 --- a/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-thanos-overview.configmap.yaml +++ b/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-thanos-overview.configmap.yaml @@ -176,10 +176,9 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by (namespace, job) (rate(http_requests_total{namespace=\"$namespace\", handler=\"query\",code=~\"5..\"}[$interval])) / sum by (namespace, job) (rate(http_requests_total{namespace=\"$namespace\", handler=\"query\"}[$interval]))", + "expr": "sum by (namespace, job, code) (rate(http_requests_total{namespace=\"$namespace\", handler=\"query\",code=~\"5..\"}[$interval])) / ignoring (code) group_left() sum by (namespace, job) (rate(http_requests_total{namespace=\"$namespace\", handler=\"query\"}[$interval]))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "error", "step": 10 } ], @@ -499,10 +498,9 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by (namespace, job) (rate(http_requests_total{namespace=\"$namespace\", handler=\"query_range\",code=~\"5..\"}[$interval])) / sum by (namespace, job) (rate(http_requests_total{namespace=\"$namespace\", handler=\"query_range\"}[$interval]))", + "expr": "sum by (namespace, job, code) (rate(http_requests_total{namespace=\"$namespace\", handler=\"query_range\",code=~\"5..\"}[$interval])) / ignoring (code) group_left() sum by (namespace, job) (rate(http_requests_total{namespace=\"$namespace\", handler=\"query_range\"}[$interval]))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "error", "step": 10 } ], @@ -874,10 +872,9 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by (namespace, job) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",namespace=\"$namespace\", grpc_type=\"unary\"}[$interval])) / sum by (namespace, job) (rate(grpc_server_handled_total{namespace=\"$namespace\", grpc_type=\"unary\"}[$interval]))", + "expr": "sum by (namespace, job, grpc_code) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",namespace=\"$namespace\", grpc_type=\"unary\"}[$interval])) / ignoring (grpc_code) group_left() sum by (namespace, job) (rate(grpc_server_handled_total{namespace=\"$namespace\", grpc_type=\"unary\"}[$interval]))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "error", "step": 10 } ], @@ -1197,10 +1194,9 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by (namespace, job) (rate(http_requests_total{namespace=\"$namespace\", handler=\"receive\",code=~\"5..\"}[$interval])) / sum by (namespace, job) (rate(http_requests_total{namespace=\"$namespace\", handler=\"receive\"}[$interval]))", + "expr": "sum by (namespace, job, code) (rate(http_requests_total{namespace=\"$namespace\", handler=\"receive\",code=~\"5..\"}[$interval])) / ignoring (code) group_left() sum by (namespace, job) (rate(http_requests_total{namespace=\"$namespace\", handler=\"receive\"}[$interval]))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "error", "step": 10 } ], @@ -1409,7 +1405,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by (namespace, job, alertmanager) (rate(thanos_alert_sender_alerts_sent_total{namespace=\"$namespace\"}[$interval]))", + "expr": "sum by (namespace, job, alertmanager) (rate(thanos_alert_sender_alerts_sent_total{namespace=\"$namespace\"}[$__rate_interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{alertmanager}}", @@ -1714,7 +1710,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by (namespace, job) (rate(thanos_compact_group_compactions_total{namespace=\"$namespace\"}[$interval]))", + "expr": "sum by (namespace, job) (rate(thanos_compact_group_compactions_total{namespace=\"$namespace\"}[$__rate_interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "compaction {{job}}", diff --git a/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-thanos-query-frontend.configmap.yaml b/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-thanos-query-frontend.configmap.yaml index ad520569a4..51cc6aaa17 100644 --- a/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-thanos-query-frontend.configmap.yaml +++ b/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-thanos-query-frontend.configmap.yaml @@ -269,10 +269,9 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by (namespace, job) (rate(http_requests_total{namespace=\"$namespace\", job=~\"$job\", handler=\"query-frontend\",code=~\"5..\"}[$interval])) / sum by (namespace, job) (rate(http_requests_total{namespace=\"$namespace\", job=~\"$job\", handler=\"query-frontend\"}[$interval]))", + "expr": "sum by (namespace, job, code) (rate(http_requests_total{namespace=\"$namespace\", job=~\"$job\", handler=\"query-frontend\",code=~\"5..\"}[$interval])) / ignoring (code) group_left() sum by (namespace, job) (rate(http_requests_total{namespace=\"$namespace\", job=~\"$job\", handler=\"query-frontend\"}[$interval]))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "error", "step": 10 } ], @@ -493,7 +492,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by (namespace, job, tripperware) (rate(cortex_cache_request_duration_seconds_count{namespace=\"$namespace\", job=~\"$job\"}[$interval]))", + "expr": "sum by (namespace, job, tripperware) (rate(cortex_cache_request_duration_seconds_count{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{job}} {{tripperware}}", @@ -580,7 +579,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by (namespace, job, tripperware) (rate(querier_cache_gets_total{namespace=\"$namespace\", job=~\"$job\"}[$interval]))", + "expr": "sum by (namespace, job, tripperware) (rate(querier_cache_gets_total{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "Cache gets - {{job}} {{tripperware}}", @@ -588,7 +587,7 @@ data: "step": 10 }, { - "expr": "sum by (namespace, job, tripperware) (rate(querier_cache_misses_total{namespace=\"$namespace\", job=~\"$job\"}[$interval]))", + "expr": "sum by (namespace, job, tripperware) (rate(querier_cache_misses_total{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "Cache misses - {{job}} {{tripperware}}", @@ -675,7 +674,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by (namespace, job, tripperware) (rate(cortex_cache_fetched_keys_total{namespace=\"$namespace\", job=~\"$job\"}[$interval]))", + "expr": "sum by (namespace, job, tripperware) (rate(cortex_cache_fetched_keys_total{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{job}} {{tripperware}}", @@ -762,7 +761,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by (namespace, job, tripperware) (rate(cortex_cache_hits_total{namespace=\"$namespace\", job=~\"$job\"}[$interval]))", + "expr": "sum by (namespace, job, tripperware) (rate(cortex_cache_hits_total{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{job}} {{tripperware}}", diff --git a/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-thanos-query.configmap.yaml b/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-thanos-query.configmap.yaml index a31da1a8e7..0da884af5e 100644 --- a/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-thanos-query.configmap.yaml +++ b/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-thanos-query.configmap.yaml @@ -164,10 +164,9 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by (namespace, job) (rate(http_requests_total{namespace=\"$namespace\", job=~\"$job\", handler=\"query\",code=~\"5..\"}[$interval])) / sum by (namespace, job) (rate(http_requests_total{namespace=\"$namespace\", job=~\"$job\", handler=\"query\"}[$interval]))", + "expr": "sum by (namespace, job, code) (rate(http_requests_total{namespace=\"$namespace\", job=~\"$job\", handler=\"query\",code=~\"5..\"}[$interval])) / ignoring (code) group_left() sum by (namespace, job) (rate(http_requests_total{namespace=\"$namespace\", job=~\"$job\", handler=\"query\"}[$interval]))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "error", "step": 10 } ], @@ -493,10 +492,9 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by (namespace, job) (rate(http_requests_total{namespace=\"$namespace\", job=~\"$job\", handler=\"query_range\",code=~\"5..\"}[$interval])) / sum by (namespace, job) (rate(http_requests_total{namespace=\"$namespace\", job=~\"$job\", handler=\"query_range\"}[$interval]))", + "expr": "sum by (namespace, job, code) (rate(http_requests_total{namespace=\"$namespace\", job=~\"$job\", handler=\"query_range\",code=~\"5..\"}[$interval])) / ignoring (code) group_left() sum by (namespace, job) (rate(http_requests_total{namespace=\"$namespace\", job=~\"$job\", handler=\"query_range\"}[$interval]))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "error", "step": 10 } ], @@ -874,10 +872,9 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by (namespace, job) (rate(grpc_client_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",namespace=\"$namespace\", job=~\"$job\", grpc_type=\"unary\"}[$interval])) / sum by (namespace, job) (rate(grpc_client_handled_total{namespace=\"$namespace\", job=~\"$job\", grpc_type=\"unary\"}[$interval]))", + "expr": "sum by (namespace, job, grpc_code) (rate(grpc_client_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",namespace=\"$namespace\", job=~\"$job\", grpc_type=\"unary\"}[$interval])) / ignoring (grpc_code) group_left() sum by (namespace, job) (rate(grpc_client_handled_total{namespace=\"$namespace\", job=~\"$job\", grpc_type=\"unary\"}[$interval]))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "error", "step": 10 } ], @@ -1255,10 +1252,9 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by (namespace, job) (rate(grpc_client_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",namespace=\"$namespace\", job=~\"$job\", grpc_type=\"server_stream\"}[$interval])) / sum by (namespace, job) (rate(grpc_client_handled_total{namespace=\"$namespace\", job=~\"$job\", grpc_type=\"server_stream\"}[$interval]))", + "expr": "sum by (namespace, job, grpc_code) (rate(grpc_client_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",namespace=\"$namespace\", job=~\"$job\", grpc_type=\"server_stream\"}[$interval])) / ignoring (grpc_code) group_left() sum by (namespace, job) (rate(grpc_client_handled_total{namespace=\"$namespace\", job=~\"$job\", grpc_type=\"server_stream\"}[$interval]))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "error", "step": 10 } ], diff --git a/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-thanos-receive.configmap.yaml b/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-thanos-receive.configmap.yaml index 1fc3b0c0dd..edc2ebcc77 100644 --- a/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-thanos-receive.configmap.yaml +++ b/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-thanos-receive.configmap.yaml @@ -164,10 +164,9 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by (namespace, job) (rate(http_requests_total{namespace=\"$namespace\", job=~\"$job\", handler=\"receive\",code=~\"5..\"}[$interval])) / sum by (namespace, job) (rate(http_requests_total{namespace=\"$namespace\", job=~\"$job\", handler=\"receive\"}[$interval]))", + "expr": "sum by (namespace, job, code) (rate(http_requests_total{namespace=\"$namespace\", job=~\"$job\", handler=\"receive\",code=~\"5..\"}[$interval])) / ignoring (code) group_left() sum by (namespace, job) (rate(http_requests_total{namespace=\"$namespace\", job=~\"$job\", handler=\"receive\"}[$interval]))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "error", "step": 10 } ], @@ -387,7 +386,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by (tenant, code) (rate(http_requests_total{namespace=\"$namespace\", job=~\"$job\", tenant=~\"$tenant\", handler=\"receive\"}[$interval]))", + "expr": "sum by (tenant, code) (rate(http_requests_total{namespace=\"$namespace\", job=~\"$job\", tenant=~\"$tenant\", handler=\"receive\"}[$__rate_interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{code}} - {{tenant}}", @@ -473,7 +472,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by (tenant, code) (rate(http_requests_total{namespace=\"$namespace\", job=~\"$job\", tenant=~\"$tenant\", handler=\"receive\", code!~\"2..\"}[$interval]))", + "expr": "sum by (tenant, code) (rate(http_requests_total{namespace=\"$namespace\", job=~\"$job\", tenant=~\"$tenant\", handler=\"receive\", code!~\"2..\"}[$__rate_interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{code}} - {{tenant}}", @@ -559,7 +558,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by (namespace, job, tenant) (rate(http_request_duration_seconds_sum{namespace=\"$namespace\", job=~\"$job\", tenant=~\"$tenant\", handler=\"receive\"}[$interval])) / sum by (namespace, job, tenant) (http_request_duration_seconds_count{namespace=\"$namespace\", job=~\"$job\", tenant=~\"$tenant\", handler=\"receive\"})", + "expr": "sum by (namespace, job, tenant) (rate(http_request_duration_seconds_sum{namespace=\"$namespace\", job=~\"$job\", tenant=~\"$tenant\", handler=\"receive\"}[$__rate_interval])) / sum by (namespace, job, tenant) (http_request_duration_seconds_count{namespace=\"$namespace\", job=~\"$job\", tenant=~\"$tenant\", handler=\"receive\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{tenant}}", @@ -657,7 +656,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by (namespace, job, tenant) (rate(http_request_size_bytes_sum{namespace=\"$namespace\", job=~\"$job\", tenant=~\"$tenant\", handler=\"receive\", code=~\"2..\"}[$interval])) / sum by (namespace, job, tenant) (rate(http_request_size_bytes_count{namespace=\"$namespace\", job=~\"$job\", tenant=~\"$tenant\", handler=\"receive\", code=~\"2..\"}[$interval]))", + "expr": "sum by (namespace, job, tenant) (rate(http_request_size_bytes_sum{namespace=\"$namespace\", job=~\"$job\", tenant=~\"$tenant\", handler=\"receive\", code=~\"2..\"}[$__rate_interval])) / sum by (namespace, job, tenant) (rate(http_request_size_bytes_count{namespace=\"$namespace\", job=~\"$job\", tenant=~\"$tenant\", handler=\"receive\", code=~\"2..\"}[$__rate_interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{tenant}}", @@ -743,7 +742,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by (namespace, job, tenant) (rate(http_request_size_bytes_sum{namespace=\"$namespace\", job=~\"$job\", tenant=~\"$tenant\", handler=\"receive\", code!~\"2..\"}[$interval])) / sum by (namespace, job, tenant) (rate(http_request_size_bytes_count{namespace=\"$namespace\", job=~\"$job\", tenant=~\"$tenant\", handler=\"receive\", code!~\"2..\"}[$interval]))", + "expr": "sum by (namespace, job, tenant) (rate(http_request_size_bytes_sum{namespace=\"$namespace\", job=~\"$job\", tenant=~\"$tenant\", handler=\"receive\", code!~\"2..\"}[$__rate_interval])) / sum by (namespace, job, tenant) (rate(http_request_size_bytes_count{namespace=\"$namespace\", job=~\"$job\", tenant=~\"$tenant\", handler=\"receive\", code!~\"2..\"}[$__rate_interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{tenant}}", @@ -927,7 +926,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate(thanos_receive_write_timeseries_bucket{namespace=\"$namespace\", job=~\"$job\", tenant=~\"$tenant\", code=~\"2..\"}[$interval])) by (namespace, job, tenant) ", + "expr": "sum(rate(thanos_receive_write_timeseries_sum{namespace=\"$namespace\", job=~\"$job\", tenant=~\"$tenant\", code=~\"2..\"}[$__rate_interval])) by (namespace, job, tenant) ", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{tenant}}", @@ -1013,7 +1012,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate(thanos_receive_write_timeseries_bucket{namespace=\"$namespace\", job=~\"$job\", tenant=~\"$tenant\", code!~\"2..\"}[$interval])) by (tenant, code) ", + "expr": "sum(rate(thanos_receive_write_timeseries_sum{namespace=\"$namespace\", job=~\"$job\", tenant=~\"$tenant\", code!~\"2..\"}[$__rate_interval])) by (tenant, code) ", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{code}} - {{tenant}}", @@ -1099,7 +1098,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate(thanos_receive_write_samples_sum{namespace=\"$namespace\", job=~\"$job\", tenant=~\"$tenant\", code=~\"2..\"}[$interval])) by (namespace, job, tenant) ", + "expr": "sum(rate(thanos_receive_write_samples_sum{namespace=\"$namespace\", job=~\"$job\", tenant=~\"$tenant\", code=~\"2..\"}[$__rate_interval])) by (namespace, job, tenant) ", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{tenant}}", @@ -1185,7 +1184,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate(thanos_receive_write_samples_sum{namespace=\"$namespace\", job=~\"$job\", tenant=~\"$tenant\", code!~\"2..\"}[$interval])) by (tenant, code) ", + "expr": "sum(rate(thanos_receive_write_samples_sum{namespace=\"$namespace\", job=~\"$job\", tenant=~\"$tenant\", code!~\"2..\"}[$__rate_interval])) by (tenant, code) ", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{code}} - {{tenant}}", @@ -1284,7 +1283,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by (namespace, job) (rate(thanos_receive_replications_total{namespace=\"$namespace\", job=~\"$job\"}[$interval]))", + "expr": "sum by (namespace, job) (rate(thanos_receive_replications_total{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "all {{job}}", @@ -1469,7 +1468,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by (namespace, job) (rate(thanos_receive_forward_requests_total{namespace=\"$namespace\", job=~\"$job\"}[$interval]))", + "expr": "sum by (namespace, job) (rate(thanos_receive_forward_requests_total{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "all {{job}}", @@ -1811,10 +1810,9 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by (namespace, job) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",namespace=\"$namespace\", job=~\"$job\", grpc_type=\"unary\", grpc_method=\"RemoteWrite\"}[$interval])) / sum by (namespace, job) (rate(grpc_server_handled_total{namespace=\"$namespace\", job=~\"$job\", grpc_type=\"unary\", grpc_method=\"RemoteWrite\"}[$interval]))", + "expr": "sum by (namespace, job, grpc_code) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",namespace=\"$namespace\", job=~\"$job\", grpc_type=\"unary\", grpc_method=\"RemoteWrite\"}[$interval])) / ignoring (grpc_code) group_left() sum by (namespace, job) (rate(grpc_server_handled_total{namespace=\"$namespace\", job=~\"$job\", grpc_type=\"unary\", grpc_method=\"RemoteWrite\"}[$interval]))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "error", "step": 10 } ], @@ -2192,10 +2190,9 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by (namespace, job) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",namespace=\"$namespace\", job=~\"$job\", grpc_type=\"unary\", grpc_method!=\"RemoteWrite\"}[$interval])) / sum by (namespace, job) (rate(grpc_server_handled_total{namespace=\"$namespace\", job=~\"$job\", grpc_type=\"unary\", grpc_method!=\"RemoteWrite\"}[$interval]))", + "expr": "sum by (namespace, job, grpc_code) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",namespace=\"$namespace\", job=~\"$job\", grpc_type=\"unary\", grpc_method!=\"RemoteWrite\"}[$interval])) / ignoring (grpc_code) group_left() sum by (namespace, job) (rate(grpc_server_handled_total{namespace=\"$namespace\", job=~\"$job\", grpc_type=\"unary\", grpc_method!=\"RemoteWrite\"}[$interval]))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "error", "step": 10 } ], @@ -2573,10 +2570,9 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by (namespace, job) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",namespace=\"$namespace\", job=~\"$job\", grpc_type=\"server_stream\"}[$interval])) / sum by (namespace, job) (rate(grpc_server_handled_total{namespace=\"$namespace\", job=~\"$job\", grpc_type=\"server_stream\"}[$interval]))", + "expr": "sum by (namespace, job, grpc_code) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",namespace=\"$namespace\", job=~\"$job\", grpc_type=\"server_stream\"}[$interval])) / ignoring (grpc_code) group_left() sum by (namespace, job) (rate(grpc_server_handled_total{namespace=\"$namespace\", job=~\"$job\", grpc_type=\"server_stream\"}[$interval]))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "error", "step": 10 } ], diff --git a/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-thanos-rule.configmap.yaml b/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-thanos-rule.configmap.yaml index f3fa293447..cc88e6c75d 100644 --- a/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-thanos-rule.configmap.yaml +++ b/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-thanos-rule.configmap.yaml @@ -53,15 +53,15 @@ data: ], "spaceLength": 10, - "span": 4, + "span": 3, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum by (namespace, job, strategy) (rate(prometheus_rule_evaluations_total{namespace=\"$namespace\", job=~\"$job\"}[$interval]))", + "expr": "sum by (namespace, job, rule_group, strategy) (rate(prometheus_rule_evaluations_total{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval]))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{ strategy }}", + "legendFormat": "{{ rule_group }} {{ strategy }}", "legendLink": null, "step": 10 } @@ -139,15 +139,15 @@ data: ], "spaceLength": 10, - "span": 4, + "span": 3, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum by (namespace, job, strategy) (increase(prometheus_rule_group_iterations_missed_total{namespace=\"$namespace\", job=~\"$job\"}[$interval]))", + "expr": "sum by (namespace, job, rule_group, strategy) (rate(prometheus_rule_evaluation_failures_total{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval]))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{ strategy }}", + "legendFormat": "{{ rule_group }} {{ strategy }}", "legendLink": null, "step": 10 } @@ -157,7 +157,7 @@ data: ], "timeFrom": null, "timeShift": null, - "title": "Rule Group Evaluations Missed", + "title": "Rule Group Evaluations Failed", "tooltip": { "shared": false, "sort": 0, @@ -225,12 +225,98 @@ data: ], "spaceLength": 10, - "span": 4, + "span": 3, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (namespace, job, rule_group, strategy) (increase(prometheus_rule_group_iterations_missed_total{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{ rule_group }} {{ strategy }}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Rule Group Evaluations Missed", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 4, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 3, "stack": false, "steppedLine": false, "targets": [ { - "expr": "(\n max by(namespace, job, rule_group) (prometheus_rule_group_last_duration_seconds{namespace=\"$namespace\", job=~\"$job\"})\n >\n sum by(namespace, job, rule_group) (prometheus_rule_group_interval_seconds{namespace=\"$namespace\", job=~\"$job\"})\n)\n", + "expr": "(\n sum by(namespace, job, rule_group) (prometheus_rule_group_last_duration_seconds{namespace=\"$namespace\", job=~\"$job\"})\n >\n sum by(namespace, job, rule_group) (prometheus_rule_group_interval_seconds{namespace=\"$namespace\", job=~\"$job\"})\n)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{ rule_group }}", @@ -243,7 +329,7 @@ data: ], "timeFrom": null, "timeShift": null, - "title": "Rule Group Evlauations Too Slow", + "title": "Rule Group Evaluations Too Slow", "tooltip": { "shared": false, "sort": 0, @@ -300,7 +386,7 @@ data: "datasource": "$datasource", "description": "Shows rate of dropped alerts.", "fill": 1, - "id": 4, + "id": 5, "legend": { "avg": false, "current": false, @@ -329,7 +415,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by (namespace, job, alertmanager) (rate(thanos_alert_sender_alerts_dropped_total{namespace=\"$namespace\", job=~\"$job\"}[$interval]))", + "expr": "sum by (namespace, job, alertmanager) (rate(thanos_alert_sender_alerts_dropped_total{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{alertmanager}}", @@ -387,7 +473,7 @@ data: "datasource": "$datasource", "description": "Shows rate of alerts that successfully sent to alert manager.", "fill": 10, - "id": 5, + "id": 6, "legend": { "avg": false, "current": false, @@ -416,7 +502,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by (namespace, job, alertmanager) (rate(thanos_alert_sender_alerts_sent_total{namespace=\"$namespace\", job=~\"$job\"}[$interval]))", + "expr": "sum by (namespace, job, alertmanager) (rate(thanos_alert_sender_alerts_sent_total{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{alertmanager}}", @@ -474,7 +560,7 @@ data: "datasource": "$datasource", "description": "Shows ratio of errors compared to the total number of sent alerts.", "fill": 10, - "id": 6, + "id": 7, "legend": { "avg": false, "current": false, @@ -560,7 +646,7 @@ data: "datasource": "$datasource", "description": "Shows how long has it taken to send alerts to alert manager.", "fill": 1, - "id": 7, + "id": 8, "legend": { "avg": false, "current": false, @@ -698,7 +784,7 @@ data: "datasource": "$datasource", "description": "Shows rate of queued alerts.", "fill": 1, - "id": 8, + "id": 9, "legend": { "avg": false, "current": false, @@ -727,7 +813,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by (namespace, job) (rate(thanos_alert_queue_alerts_dropped_total{namespace=\"$namespace\", job=~\"$job\"}[$interval]))", + "expr": "sum by (namespace, job) (rate(thanos_alert_queue_alerts_dropped_total{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{job}}", @@ -785,7 +871,7 @@ data: "datasource": "$datasource", "description": "Shows ratio of dropped alerts compared to the total number of queued alerts.", "fill": 10, - "id": 9, + "id": 10, "legend": { "avg": false, "current": false, @@ -883,7 +969,7 @@ data: "datasource": "$datasource", "description": "Shows rate of handled Unary gRPC requests.", "fill": 10, - "id": 10, + "id": 11, "legend": { "avg": false, "current": false, @@ -1040,7 +1126,7 @@ data: "datasource": "$datasource", "description": "Shows ratio of errors compared to the total number of handled requests.", "fill": 10, - "id": 11, + "id": 12, "legend": { "avg": false, "current": false, @@ -1069,10 +1155,9 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by (namespace, job) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",namespace=\"$namespace\", job=~\"$job\", grpc_type=\"unary\"}[$interval])) / sum by (namespace, job) (rate(grpc_server_handled_total{namespace=\"$namespace\", job=~\"$job\", grpc_type=\"unary\"}[$interval]))", + "expr": "sum by (namespace, job, grpc_code) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",namespace=\"$namespace\", job=~\"$job\", grpc_type=\"unary\"}[$interval])) / ignoring (grpc_code) group_left() sum by (namespace, job) (rate(grpc_server_handled_total{namespace=\"$namespace\", job=~\"$job\", grpc_type=\"unary\"}[$interval]))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "error", "step": 10 } ], @@ -1126,7 +1211,7 @@ data: "datasource": "$datasource", "description": "Shows how long has it taken to handle requests, in quantiles.", "fill": 1, - "id": 12, + "id": 13, "legend": { "avg": false, "current": false, @@ -1264,7 +1349,7 @@ data: "datasource": "$datasource", "description": "Shows rate of handled Streamed gRPC requests.", "fill": 10, - "id": 13, + "id": 14, "legend": { "avg": false, "current": false, @@ -1421,7 +1506,7 @@ data: "datasource": "$datasource", "description": "Shows ratio of errors compared to the total number of handled requests.", "fill": 10, - "id": 14, + "id": 15, "legend": { "avg": false, "current": false, @@ -1450,10 +1535,9 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by (namespace, job) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",namespace=\"$namespace\", job=~\"$job\", grpc_type=\"server_stream\"}[$interval])) / sum by (namespace, job) (rate(grpc_server_handled_total{namespace=\"$namespace\", job=~\"$job\", grpc_type=\"server_stream\"}[$interval]))", + "expr": "sum by (namespace, job, grpc_code) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",namespace=\"$namespace\", job=~\"$job\", grpc_type=\"server_stream\"}[$interval])) / ignoring (grpc_code) group_left() sum by (namespace, job) (rate(grpc_server_handled_total{namespace=\"$namespace\", job=~\"$job\", grpc_type=\"server_stream\"}[$interval]))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "error", "step": 10 } ], @@ -1507,7 +1591,7 @@ data: "datasource": "$datasource", "description": "Shows how long has it taken to handle requests, in quantiles", "fill": 1, - "id": 15, + "id": 16, "legend": { "avg": false, "current": false, @@ -1644,7 +1728,7 @@ data: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 16, + "id": 17, "legend": { "avg": false, "current": false, @@ -1770,7 +1854,7 @@ data: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 17, + "id": 18, "legend": { "avg": false, "current": false, @@ -1856,7 +1940,7 @@ data: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 18, + "id": 19, "legend": { "avg": false, "current": false, diff --git a/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-thanos-store.configmap.yaml b/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-thanos-store.configmap.yaml index 4dc7edac30..9115509225 100644 --- a/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-thanos-store.configmap.yaml +++ b/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-thanos-store.configmap.yaml @@ -216,10 +216,9 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by (namespace, job) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",namespace=\"$namespace\", job=~\"$job\", grpc_type=\"unary\"}[$interval])) / sum by (namespace, job) (rate(grpc_server_handled_total{namespace=\"$namespace\", job=~\"$job\", grpc_type=\"unary\"}[$interval]))", + "expr": "sum by (namespace, job, grpc_code) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",namespace=\"$namespace\", job=~\"$job\", grpc_type=\"unary\"}[$interval])) / ignoring (grpc_code) group_left() sum by (namespace, job) (rate(grpc_server_handled_total{namespace=\"$namespace\", job=~\"$job\", grpc_type=\"unary\"}[$interval]))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "error", "step": 10 } ], @@ -597,10 +596,9 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by (namespace, job) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",namespace=\"$namespace\", job=~\"$job\", grpc_type=\"server_stream\"}[$interval])) / sum by (namespace, job) (rate(grpc_server_handled_total{namespace=\"$namespace\", job=~\"$job\", grpc_type=\"server_stream\"}[$interval]))", + "expr": "sum by (namespace, job, grpc_code) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",namespace=\"$namespace\", job=~\"$job\", grpc_type=\"server_stream\"}[$interval])) / ignoring (grpc_code) group_left() sum by (namespace, job) (rate(grpc_server_handled_total{namespace=\"$namespace\", job=~\"$job\", grpc_type=\"server_stream\"}[$interval]))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "error", "step": 10 } ], @@ -821,7 +819,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by (namespace, job, operation) (rate(thanos_objstore_bucket_operations_total{namespace=\"$namespace\", job=~\"$job\"}[$interval]))", + "expr": "sum by (namespace, job, operation) (rate(thanos_objstore_bucket_operations_total{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{job}} {{operation}}", @@ -908,7 +906,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by (namespace, job, operation) (rate(thanos_objstore_bucket_operation_failures_total{namespace=\"$namespace\", job=~\"$job\"}[$interval])) / sum by (namespace, job, operation) (rate(thanos_objstore_bucket_operations_total{namespace=\"$namespace\", job=~\"$job\"}[$interval]))", + "expr": "sum by (namespace, job, operation) (rate(thanos_objstore_bucket_operation_failures_total{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval])) / sum by (namespace, job, operation) (rate(thanos_objstore_bucket_operations_total{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{job}} {{operation}}", @@ -995,7 +993,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum by (namespace, job, operation, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{namespace=\"$namespace\", job=~\"$job\"}[$interval]))) * 1", + "expr": "histogram_quantile(0.99, sum by (namespace, job, operation, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval]))) * 1", "format": "time_series", "intervalFactor": 2, "legendFormat": "P99 {{job}}", @@ -1003,7 +1001,7 @@ data: "step": 10 }, { - "expr": "sum by (namespace, job, operation) (rate(thanos_objstore_bucket_operation_duration_seconds_sum{namespace=\"$namespace\", job=~\"$job\"}[$interval])) * 1 / sum by (namespace, job, operation) (rate(thanos_objstore_bucket_operation_duration_seconds_count{namespace=\"$namespace\", job=~\"$job\"}[$interval]))", + "expr": "sum by (namespace, job, operation) (rate(thanos_objstore_bucket_operation_duration_seconds_sum{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval])) * 1 / sum by (namespace, job, operation) (rate(thanos_objstore_bucket_operation_duration_seconds_count{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "mean {{job}}", @@ -1011,7 +1009,7 @@ data: "step": 10 }, { - "expr": "histogram_quantile(0.50, sum by (namespace, job, operation, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{namespace=\"$namespace\", job=~\"$job\"}[$interval]))) * 1", + "expr": "histogram_quantile(0.50, sum by (namespace, job, operation, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval]))) * 1", "format": "time_series", "intervalFactor": 2, "legendFormat": "P50 {{job}}", @@ -1110,7 +1108,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by (namespace, job) (rate(thanos_bucket_store_block_loads_total{namespace=\"$namespace\", job=~\"$job\"}[$interval]))", + "expr": "sum by (namespace, job) (rate(thanos_bucket_store_block_loads_total{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "block loads", @@ -1283,7 +1281,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by (namespace, job, operation) (rate(thanos_bucket_store_block_drops_total{namespace=\"$namespace\", job=~\"$job\"}[$interval]))", + "expr": "sum by (namespace, job, operation) (rate(thanos_bucket_store_block_drops_total{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "block drops {{job}}", @@ -1468,7 +1466,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by (namespace, job, item_type) (rate(thanos_store_index_cache_requests_total{namespace=\"$namespace\", job=~\"$job\"}[$interval]))", + "expr": "sum by (namespace, job, item_type) (rate(thanos_store_index_cache_requests_total{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{job}} {{item_type}}", @@ -1555,7 +1553,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by (namespace, job, item_type) (rate(thanos_store_index_cache_hits_total{namespace=\"$namespace\", job=~\"$job\"}[$interval]))", + "expr": "sum by (namespace, job, item_type) (rate(thanos_store_index_cache_hits_total{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{job}} {{item_type}}", @@ -1642,7 +1640,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by (namespace, job, item_type) (rate(thanos_store_index_cache_items_added_total{namespace=\"$namespace\", job=~\"$job\"}[$interval]))", + "expr": "sum by (namespace, job, item_type) (rate(thanos_store_index_cache_items_added_total{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{job}} {{item_type}}", @@ -1729,7 +1727,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by (namespace, job, item_type) (rate(thanos_store_index_cache_items_evicted_total{namespace=\"$namespace\", job=~\"$job\"}[$interval]))", + "expr": "sum by (namespace, job, item_type) (rate(thanos_store_index_cache_items_evicted_total{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{job}} {{item_type}}", @@ -1828,7 +1826,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum by (namespace, job, le) (rate(thanos_bucket_store_sent_chunk_size_bytes_bucket{namespace=\"$namespace\", job=~\"$job\"}[$interval])))", + "expr": "histogram_quantile(0.99, sum by (namespace, job, le) (rate(thanos_bucket_store_sent_chunk_size_bytes_bucket{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval])))", "format": "time_series", "intervalFactor": 2, "legendFormat": "P99", @@ -1836,7 +1834,7 @@ data: "step": 10 }, { - "expr": "sum by (namespace, job) (rate(thanos_bucket_store_sent_chunk_size_bytes_sum{namespace=\"$namespace\", job=~\"$job\"}[$interval])) / sum by (namespace, job) (rate(thanos_bucket_store_sent_chunk_size_bytes_count{namespace=\"$namespace\", job=~\"$job\"}[$interval]))", + "expr": "sum by (namespace, job) (rate(thanos_bucket_store_sent_chunk_size_bytes_sum{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval])) / sum by (namespace, job) (rate(thanos_bucket_store_sent_chunk_size_bytes_count{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "mean", @@ -1844,7 +1842,7 @@ data: "step": 10 }, { - "expr": "histogram_quantile(0.99, sum by (namespace, job, le) (rate(thanos_bucket_store_sent_chunk_size_bytes_bucket{namespace=\"$namespace\", job=~\"$job\"}[$interval])))", + "expr": "histogram_quantile(0.50, sum by (namespace, job, le) (rate(thanos_bucket_store_sent_chunk_size_bytes_bucket{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval])))", "format": "time_series", "intervalFactor": 2, "legendFormat": "P50", @@ -1942,7 +1940,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "thanos_bucket_store_series_blocks_queried{namespace=\"$namespace\", job=~\"$job\", quantile=\"0.99\"}", + "expr": "histogram_quantile(0.99, sum by (le) (rate(thanos_bucket_store_series_blocks_queried{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval])))", "format": "time_series", "intervalFactor": 2, "legendFormat": "P99", @@ -1950,7 +1948,7 @@ data: "step": 10 }, { - "expr": "sum by (namespace, job) (rate(thanos_bucket_store_series_blocks_queried_sum{namespace=\"$namespace\", job=~\"$job\"}[$interval])) / sum by (namespace, job) (rate(thanos_bucket_store_series_blocks_queried_count{namespace=\"$namespace\", job=~\"$job\"}[$interval]))", + "expr": "sum by (namespace, job) (rate(thanos_bucket_store_series_blocks_queried_sum{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval])) / sum by (namespace, job) (rate(thanos_bucket_store_series_blocks_queried_count{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "mean {{job}}", @@ -1958,7 +1956,7 @@ data: "step": 10 }, { - "expr": "thanos_bucket_store_series_blocks_queried{namespace=\"$namespace\", job=~\"$job\", quantile=\"0.50\"}", + "expr": "histogram_quantile(0.50, sum by (le) (rate(thanos_bucket_store_series_blocks_queried{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval])))", "format": "time_series", "intervalFactor": 2, "legendFormat": "P50", @@ -2045,7 +2043,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "thanos_bucket_store_series_data_fetched{namespace=\"$namespace\", job=~\"$job\", quantile=\"0.99\"}", + "expr": "histogram_quantile(0.99, sum by (le) (rate(thanos_bucket_store_series_data_fetched{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval])))", "format": "time_series", "intervalFactor": 2, "legendFormat": "P99: {{data_type}} / {{job}}", @@ -2053,7 +2051,7 @@ data: "step": 10 }, { - "expr": "sum by (namespace, job, data_type) (rate(thanos_bucket_store_series_data_fetched_sum{namespace=\"$namespace\", job=~\"$job\"}[$interval])) / sum by (namespace, job, data_type) (rate(thanos_bucket_store_series_data_fetched_count{namespace=\"$namespace\", job=~\"$job\"}[$interval]))", + "expr": "sum by (namespace, job, data_type) (rate(thanos_bucket_store_series_data_fetched_sum{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval])) / sum by (namespace, job, data_type) (rate(thanos_bucket_store_series_data_fetched_count{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "mean: {{data_type}} / {{job}}", @@ -2061,7 +2059,7 @@ data: "step": 10 }, { - "expr": "thanos_bucket_store_series_data_fetched{namespace=\"$namespace\", job=~\"$job\", quantile=\"0.50\"}", + "expr": "histogram_quantile(0.50, sum by (le) (rate(thanos_bucket_store_series_data_fetched{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval])))", "format": "time_series", "intervalFactor": 2, "legendFormat": "P50: {{data_type}} / {{job}}", @@ -2148,7 +2146,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "thanos_bucket_store_series_data_touched{namespace=\"$namespace\", job=~\"$job\", quantile=\"0.99\"}", + "expr": "histogram_quantile(0.99, sum by (le) (rate(thanos_bucket_store_series_data_touched{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval])))", "format": "time_series", "intervalFactor": 2, "legendFormat": "P99: {{data_type}} / {{job}}", @@ -2156,7 +2154,7 @@ data: "step": 10 }, { - "expr": "sum by (namespace, job, data_type) (rate(thanos_bucket_store_series_data_touched_sum{namespace=\"$namespace\", job=~\"$job\"}[$interval])) / sum by (namespace, job, data_type) (rate(thanos_bucket_store_series_data_touched_count{namespace=\"$namespace\", job=~\"$job\"}[$interval]))", + "expr": "sum by (namespace, job, data_type) (rate(thanos_bucket_store_series_data_touched_sum{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval])) / sum by (namespace, job, data_type) (rate(thanos_bucket_store_series_data_touched_count{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "mean: {{data_type}} / {{job}}", @@ -2164,7 +2162,7 @@ data: "step": 10 }, { - "expr": "thanos_bucket_store_series_data_touched{namespace=\"$namespace\", job=~\"$job\", quantile=\"0.50\"}", + "expr": "histogram_quantile(0.50, sum by (le) (rate(thanos_bucket_store_series_data_touched{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval])))", "format": "time_series", "intervalFactor": 2, "legendFormat": "P50: {{data_type}} / {{job}}", @@ -2250,7 +2248,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "thanos_bucket_store_series_result_series{namespace=\"$namespace\", job=~\"$job\",quantile=\"0.99\"}", + "expr": "histogram_quantile(0.99, sum by (le) (rate(thanos_bucket_store_series_result_series{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval])))", "format": "time_series", "intervalFactor": 2, "legendFormat": "P99", @@ -2258,7 +2256,7 @@ data: "step": 10 }, { - "expr": "sum by (namespace, job) (rate(thanos_bucket_store_series_result_series_sum{namespace=\"$namespace\", job=~\"$job\"}[$interval])) / sum by (namespace, job) (rate(thanos_bucket_store_series_result_series_count{namespace=\"$namespace\", job=~\"$job\"}[$interval]))", + "expr": "sum by (namespace, job) (rate(thanos_bucket_store_series_result_series_sum{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval])) / sum by (namespace, job) (rate(thanos_bucket_store_series_result_series_count{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "mean {{job}}", @@ -2266,7 +2264,7 @@ data: "step": 10 }, { - "expr": "thanos_bucket_store_series_result_series{namespace=\"$namespace\", job=~\"$job\",quantile=\"0.50\"}", + "expr": "histogram_quantile(0.50, sum by (le) (rate(thanos_bucket_store_series_result_series{namespace=\"$namespace\", job=~\"$job\"}[$__rate_interval])))", "format": "time_series", "intervalFactor": 2, "legendFormat": "P50", diff --git a/resources/observability/prometheusrules/observatorium-thanos-production.prometheusrules.yaml b/resources/observability/prometheusrules/observatorium-thanos-production.prometheusrules.yaml index 7b1633085b..7761d5a1ce 100644 --- a/resources/observability/prometheusrules/observatorium-thanos-production.prometheusrules.yaml +++ b/resources/observability/prometheusrules/observatorium-thanos-production.prometheusrules.yaml @@ -370,10 +370,10 @@ spec: description: Thanos Store {{$labels.job}} in {{$labels.namespace}} is failing to handle {{$value | humanize}}% of requests. message: Thanos Store {{$labels.job}} in {{$labels.namespace}} is failing to handle {{$value | humanize}}% of requests. runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#thanosstoregrpcerrorrate - summary: Thanos Store is failing to handle qrpcd requests. + summary: Thanos Store is failing to handle gRPC requests. expr: | ( - sum by (namespace, job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"observatorium-thanos-store-shard.*"}[5m])) + sum by (namespace, job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"observatorium-thanos-store-shard.*"}[5m])) / sum by (namespace, job) (rate(grpc_server_started_total{job=~"observatorium-thanos-store-shard.*"}[5m])) * 100 > 5 diff --git a/resources/observability/prometheusrules/observatorium-thanos-stage.prometheusrules.yaml b/resources/observability/prometheusrules/observatorium-thanos-stage.prometheusrules.yaml index 20730cc034..e45fbbbf33 100644 --- a/resources/observability/prometheusrules/observatorium-thanos-stage.prometheusrules.yaml +++ b/resources/observability/prometheusrules/observatorium-thanos-stage.prometheusrules.yaml @@ -370,10 +370,10 @@ spec: description: Thanos Store {{$labels.job}} in {{$labels.namespace}} is failing to handle {{$value | humanize}}% of requests. message: Thanos Store {{$labels.job}} in {{$labels.namespace}} is failing to handle {{$value | humanize}}% of requests. runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#thanosstoregrpcerrorrate - summary: Thanos Store is failing to handle qrpcd requests. + summary: Thanos Store is failing to handle gRPC requests. expr: | ( - sum by (namespace, job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"observatorium-thanos-store-shard.*"}[5m])) + sum by (namespace, job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"observatorium-thanos-store-shard.*"}[5m])) / sum by (namespace, job) (rate(grpc_server_started_total{job=~"observatorium-thanos-store-shard.*"}[5m])) * 100 > 5