diff --git a/CHANGELOG.md b/CHANGELOG.md index 1ea016db6c7..76555e096d3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,6 +26,7 @@ * [ENHANCEMENT] Added an example for running Tempo vulture [#3829](https://github.com/grafana/tempo/pull/3829) (@javiermolinar) * [ENHANCEMENT] Add a new helper method to allow debugging e2e tests [#3836](https://github.com/grafana/tempo/pull/3836) (@javiermolinar) * [ENHANCEMENT] Self document makefile [#3844](https://github.com/grafana/tempo/pull/3844) (@javiermolinar) +* [ENHANCEMENT] Mixin, make recording rule range interval configurable and increase range interval in alert to support scrape interval of 1 minute [#3851](https://github.com/grafana/tempo/pull/3851) (@jmichalek132) * [BUGFIX] Fix panic in certain metrics queries using `rate()` with `by` [#3847](https://github.com/grafana/tempo/pull/3847) (@stoewer) * [BUGFIX] Fix metrics queries when grouping by attributes that may not exist [#3734](https://github.com/grafana/tempo/pull/3734) (@mdisibio) * [BUGFIX] Fix frontend parsing error on cached responses [#3759](https://github.com/grafana/tempo/pull/3759) (@mdisibio) diff --git a/operations/tempo-mixin-compiled/alerts.yaml b/operations/tempo-mixin-compiled/alerts.yaml index 63ddc9a0775..bb8032b4df1 100644 --- a/operations/tempo-mixin-compiled/alerts.yaml +++ b/operations/tempo-mixin-compiled/alerts.yaml @@ -128,7 +128,7 @@ "message": "Ingesters in {{ $labels.cluster }}/{{ $labels.namespace }} are receiving more data/second than desired, add more ingesters." "runbook_url": "https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoProvisioningTooManyWrites" "expr": | - avg by (cluster, namespace) (rate(tempo_ingester_bytes_received_total{job=~".+/ingester"}[1m])) / 1024 / 1024 > 30 + avg by (cluster, namespace) (rate(tempo_ingester_bytes_received_total{job=~".+/ingester"}[5m])) / 1024 / 1024 > 30 "for": "15m" "labels": "severity": "warning" diff --git a/operations/tempo-mixin-compiled/dashboards/tempo-reads.json b/operations/tempo-mixin-compiled/dashboards/tempo-reads.json index 37f8766195c..2564b7ec408 100644 --- a/operations/tempo-mixin-compiled/dashboards/tempo-reads.json +++ b/operations/tempo-mixin-compiled/dashboards/tempo-reads.json @@ -35,6 +35,8 @@ "3xx": "#6ED0E0", "4xx": "#EF843C", "5xx": "#E24D42", + "OK": "#7EB26D", + "cancel": "#A9A9A9", "error": "#E24D42", "success": "#7EB26D" }, @@ -72,13 +74,11 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(tempo_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/cortex-gw(-internal)?\", route=~\"api_.*\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(tempo_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/cortex-gw(-internal)?\", route=~\"api_.*\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "{{status}}", - "refId": "A", - "step": 10 + "refId": "A" } ], "thresholds": [ @@ -245,6 +245,8 @@ "3xx": "#6ED0E0", "4xx": "#EF843C", "5xx": "#E24D42", + "OK": "#7EB26D", + "cancel": "#A9A9A9", "error": "#E24D42", "success": "#7EB26D" }, @@ -282,13 +284,11 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (status) (\n 
label_replace(label_replace(rate(tempo_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/query-frontend\", route=~\"api_.*\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(tempo_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/query-frontend\", route=~\"api_.*\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "{{status}}", - "refId": "A", - "step": 10 + "refId": "A" } ], "thresholds": [ @@ -455,6 +455,8 @@ "3xx": "#6ED0E0", "4xx": "#EF843C", "5xx": "#E24D42", + "OK": "#7EB26D", + "cancel": "#A9A9A9", "error": "#E24D42", "success": "#7EB26D" }, @@ -492,13 +494,11 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(tempo_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/querier\", route=~\"querier_api_.*\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(tempo_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/querier\", route=~\"querier_api_.*\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "{{status}}", - "refId": "A", - "step": 10 + "refId": "A" } ], "thresholds": [ @@ -665,6 +665,8 @@ "3xx": "#6ED0E0", "4xx": "#EF843C", "5xx": "#E24D42", + "OK": "#7EB26D", + "cancel": "#A9A9A9", "error": "#E24D42", "success": "#7EB26D" }, @@ -702,13 +704,11 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(tempo_querier_external_endpoint_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/querier\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(tempo_querier_external_endpoint_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/querier\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "{{status}}", - "refId": "A", - "step": 10 + "refId": "A" } ], "thresholds": [ @@ -875,6 +875,8 @@ "3xx": "#6ED0E0", "4xx": "#EF843C", "5xx": "#E24D42", + "OK": "#7EB26D", + "cancel": "#A9A9A9", "error": "#E24D42", "success": "#7EB26D" }, @@ -912,13 +914,11 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(tempo_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/ingester\", route=~\"/tempopb.Querier/.*\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(tempo_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/ingester\", 
route=~\"/tempopb.Querier/.*\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "{{status}}", - "refId": "A", - "step": 10 + "refId": "A" } ], "thresholds": [ @@ -1085,6 +1085,8 @@ "3xx": "#6ED0E0", "4xx": "#EF843C", "5xx": "#E24D42", + "OK": "#7EB26D", + "cancel": "#A9A9A9", "error": "#E24D42", "success": "#7EB26D" }, @@ -1122,13 +1124,11 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(tempo_memcache_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/querier\",method=~\"Memcache.Get|Memcache.GetMulti\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(tempo_memcache_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/querier\",method=~\"Memcache.Get|Memcache.GetMulti\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "{{status}}", - "refId": "A", - "step": 10 + "refId": "A" } ], "thresholds": [ @@ -1295,6 +1295,8 @@ "3xx": "#6ED0E0", "4xx": "#EF843C", "5xx": "#E24D42", + "OK": "#7EB26D", + "cancel": "#A9A9A9", "error": "#E24D42", "success": "#7EB26D" }, @@ -1332,13 +1334,11 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(tempodb_backend_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/querier\",operation=\"GET\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(tempodb_backend_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/querier\",operation=\"GET\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "{{status}}", - "refId": "A", - "step": 10 + "refId": "A" } ], "thresholds": [ @@ -1508,7 +1508,7 @@ "value": "default" }, "hide": 0, - "label": "Data Source", + "label": "Data source", "name": "datasource", "options": [ diff --git a/operations/tempo-mixin-compiled/dashboards/tempo-resources.json b/operations/tempo-mixin-compiled/dashboards/tempo-resources.json index 80c69317fb8..57a6ff739dc 100644 --- a/operations/tempo-mixin-compiled/dashboards/tempo-resources.json +++ b/operations/tempo-mixin-compiled/dashboards/tempo-resources.json @@ -78,28 +78,22 @@ "expr": "sum by(pod) (rate(container_cpu_usage_seconds_total{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"cortex-gw(-internal)?\"}[$__rate_interval]))", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "{{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "min(container_spec_cpu_quota{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"cortex-gw(-internal)?\"} / container_spec_cpu_period{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"cortex-gw(-internal)?\"})", "format": "time_series", "interval": "1m", - 
"intervalFactor": 2, "legendFormat": "limit", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"cortex-gw(-internal)?\", resource=\"cpu\"} > 0)", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "request", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -192,28 +186,22 @@ "expr": "sum by(pod) (container_memory_working_set_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"cortex-gw(-internal)?\"})", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "{{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "min(container_spec_memory_limit_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"cortex-gw(-internal)?\"} > 0)", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "limit", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"cortex-gw(-internal)?\", resource=\"memory\"} > 0)", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "request", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -297,10 +285,8 @@ "expr": "sum by(instance) (go_memstats_heap_inuse_bytes{cluster=~\"$cluster\", job=~\"($namespace)/cortex-gw(-internal)?\"})", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "{{instance}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -405,28 +391,22 @@ "expr": "sum by(pod) (rate(container_cpu_usage_seconds_total{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"distributor\"}[$__rate_interval]))", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "{{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "min(container_spec_cpu_quota{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"distributor\"} / container_spec_cpu_period{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"distributor\"})", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "limit", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"distributor\", resource=\"cpu\"} > 0)", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "request", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -519,28 +499,22 @@ "expr": "sum by(pod) (container_memory_working_set_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"distributor\"})", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "{{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "min(container_spec_memory_limit_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"distributor\"} > 0)", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "limit", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"distributor\", resource=\"memory\"} > 0)", "format": "time_series", "interval": "1m", - 
"intervalFactor": 2, "legendFormat": "request", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -624,10 +598,8 @@ "expr": "sum by(instance) (go_memstats_heap_inuse_bytes{cluster=~\"$cluster\", job=~\"($namespace)/distributor\"})", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "{{instance}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -732,28 +704,22 @@ "expr": "sum by(pod) (rate(container_cpu_usage_seconds_total{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"ingester\"}[$__rate_interval]))", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "{{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "min(container_spec_cpu_quota{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"ingester\"} / container_spec_cpu_period{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"ingester\"})", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "limit", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"ingester\", resource=\"cpu\"} > 0)", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "request", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -846,28 +812,22 @@ "expr": "sum by(pod) (container_memory_working_set_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"ingester\"})", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "{{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "min(container_spec_memory_limit_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"ingester\"} > 0)", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "limit", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"ingester\", resource=\"memory\"} > 0)", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "request", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -951,10 +911,8 @@ "expr": "sum by(instance) (go_memstats_heap_inuse_bytes{cluster=~\"$cluster\", job=~\"($namespace)/ingester\"})", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "{{instance}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -1059,28 +1017,22 @@ "expr": "sum by(pod) (rate(container_cpu_usage_seconds_total{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"metrics-generator\"}[$__rate_interval]))", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "{{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "min(container_spec_cpu_quota{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"metrics-generator\"} / container_spec_cpu_period{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"metrics-generator\"})", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "limit", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"metrics-generator\", 
resource=\"cpu\"} > 0)", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "request", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -1173,28 +1125,22 @@ "expr": "sum by(pod) (container_memory_working_set_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"metrics-generator\"})", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "{{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "min(container_spec_memory_limit_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"metrics-generator\"} > 0)", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "limit", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"metrics-generator\", resource=\"memory\"} > 0)", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "request", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -1278,10 +1224,8 @@ "expr": "sum by(instance) (go_memstats_heap_inuse_bytes{cluster=~\"$cluster\", job=~\"($namespace)/metrics-generator\"})", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "{{instance}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -1386,28 +1330,22 @@ "expr": "sum by(pod) (rate(container_cpu_usage_seconds_total{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"query-frontend\"}[$__rate_interval]))", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "{{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "min(container_spec_cpu_quota{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"query-frontend\"} / container_spec_cpu_period{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"query-frontend\"})", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "limit", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"query-frontend\", resource=\"cpu\"} > 0)", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "request", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -1500,28 +1438,22 @@ "expr": "sum by(pod) (container_memory_working_set_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"query-frontend\"})", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "{{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "min(container_spec_memory_limit_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"query-frontend\"} > 0)", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "limit", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"query-frontend\", resource=\"memory\"} > 0)", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "request", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -1605,10 +1537,8 @@ "expr": "sum by(instance) (go_memstats_heap_inuse_bytes{cluster=~\"$cluster\", 
job=~\"($namespace)/query-frontend\"})", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "{{instance}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -1713,28 +1643,22 @@ "expr": "sum by(pod) (rate(container_cpu_usage_seconds_total{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"querier\"}[$__rate_interval]))", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "{{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "min(container_spec_cpu_quota{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"querier\"} / container_spec_cpu_period{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"querier\"})", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "limit", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"querier\", resource=\"cpu\"} > 0)", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "request", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -1827,28 +1751,22 @@ "expr": "sum by(pod) (container_memory_working_set_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"querier\"})", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "{{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "min(container_spec_memory_limit_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"querier\"} > 0)", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "limit", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"querier\", resource=\"memory\"} > 0)", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "request", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -1932,10 +1850,8 @@ "expr": "sum by(instance) (go_memstats_heap_inuse_bytes{cluster=~\"$cluster\", job=~\"($namespace)/querier\"})", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "{{instance}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -2040,28 +1956,22 @@ "expr": "sum by(pod) (rate(container_cpu_usage_seconds_total{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"compactor\"}[$__rate_interval]))", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "{{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "min(container_spec_cpu_quota{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"compactor\"} / container_spec_cpu_period{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"compactor\"})", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "limit", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"compactor\", resource=\"cpu\"} > 0)", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "request", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -2154,28 +2064,22 @@ "expr": "sum by(pod) 
(container_memory_working_set_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"compactor\"})", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "{{pod}}", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "min(container_spec_memory_limit_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"compactor\"} > 0)", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "limit", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\",container=~\"compactor\", resource=\"memory\"} > 0)", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "request", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -2259,10 +2163,8 @@ "expr": "sum by(instance) (go_memstats_heap_inuse_bytes{cluster=~\"$cluster\", job=~\"($namespace)/compactor\"})", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "{{instance}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -2327,7 +2229,7 @@ "value": "default" }, "hide": 0, - "label": "Data Source", + "label": "Data source", "name": "datasource", "options": [ diff --git a/operations/tempo-mixin-compiled/dashboards/tempo-rollout-progress.json b/operations/tempo-mixin-compiled/dashboards/tempo-rollout-progress.json index 97f7fd717eb..34817738c75 100644 --- a/operations/tempo-mixin-compiled/dashboards/tempo-rollout-progress.json +++ b/operations/tempo-mixin-compiled/dashboards/tempo-rollout-progress.json @@ -1384,19 +1384,15 @@ "expr": "1 - (\n avg_over_time(histogram_quantile(0.99, sum by (le) (tempo_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/cortex-gw(-internal)?\", route=~\"opentelemetry_proto_collector_trace_v1_traceservice_export\"} offset 24h))[1h:])\n /\n avg_over_time(histogram_quantile(0.99, sum by (le) (tempo_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/cortex-gw(-internal)?\", route=~\"opentelemetry_proto_collector_trace_v1_traceservice_export\"}))[1h:])\n)\n", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "writes", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "1 - (\n avg_over_time(histogram_quantile(0.99, sum by (le) (tempo_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/cortex-gw(-internal)?\", route=~\"tempo_api_.*\"} offset 24h))[1h:])\n /\n avg_over_time(histogram_quantile(0.99, sum by (le) (tempo_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/cortex-gw(-internal)?\", route=~\"tempo_api_.*\"}))[1h:])\n)\n", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "reads", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -1455,7 +1451,7 @@ "value": "default" }, "hide": 0, - "label": "Data Source", + "label": "Data source", "name": "datasource", "options": [ diff --git a/operations/tempo-mixin-compiled/dashboards/tempo-tenants.json b/operations/tempo-mixin-compiled/dashboards/tempo-tenants.json index 4eab61204f3..87387edef74 100644 --- a/operations/tempo-mixin-compiled/dashboards/tempo-tenants.json +++ b/operations/tempo-mixin-compiled/dashboards/tempo-tenants.json @@ -92,10 +92,8 @@ "expr": "max(\n max by (cluster, namespace, limit_name) (tempo_limits_overrides{cluster=~\"$cluster\", 
job=~\"($namespace)/compactor\",user=\"$tenant\"})\n or max by (cluster, namespace, limit_name) (tempo_limits_defaults{cluster=~\"$cluster\", job=~\"($namespace)/compactor\"})\n) by (limit_name)\n", "format": "table", "instant": true, - "intervalFactor": 2, "legendFormat": "", - "refId": "A", - "step": 10 + "refId": "A" } ], "thresholds": [ @@ -201,28 +199,22 @@ "expr": "sum(rate(tempo_distributor_bytes_received_total{cluster=~\"$cluster\", job=~\"($namespace)/distributor\",tenant=\"$tenant\"}[$__rate_interval]))", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "received", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "max(\n max by (cluster, namespace, limit_name) (tempo_limits_overrides{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",user=\"$tenant\",limit_name=\"ingestion_rate_limit_bytes\"})\n or max by (cluster, namespace, limit_name) (tempo_limits_defaults{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",limit_name=\"ingestion_rate_limit_bytes\"})\n) by (ingestion_rate_limit_bytes)\n", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "limit", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "max(\n max by (cluster, namespace, limit_name) (tempo_limits_overrides{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",user=\"$tenant\",limit_name=\"ingestion_burst_size_bytes\"})\n or max by (cluster, namespace, limit_name) (tempo_limits_defaults{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",limit_name=\"ingestion_burst_size_bytes\"})\n) by (ingestion_burst_size_bytes)\n", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "burst limit", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -306,19 +298,15 @@ "expr": "sum(rate(tempo_distributor_spans_received_total{cluster=~\"$cluster\", job=~\"($namespace)/distributor\",tenant=\"$tenant\"}[$__rate_interval]))", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "accepted", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "sum(rate(tempo_discarded_spans_total{cluster=~\"$cluster\", job=~\"($namespace)/distributor\",tenant=\"$tenant\"}[$__rate_interval])) by (reason)", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "refused {{ reason }}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -411,28 +399,22 @@ "expr": "max(tempo_ingester_live_traces{cluster=~\"$cluster\", job=~\"($namespace)/ingester\",tenant=\"$tenant\"})", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "live traces", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "max(\n max by (cluster, namespace, limit_name) (tempo_limits_overrides{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",user=\"$tenant\",limit_name=\"max_global_traces_per_user\"})\n or max by (cluster, namespace, limit_name) (tempo_limits_defaults{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",limit_name=\"max_global_traces_per_user\"})\n) by (max_global_traces_per_user)\n", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "global limit", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "max(\n max by (cluster, namespace, limit_name) (tempo_limits_overrides{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",user=\"$tenant\",limit_name=\"max_local_traces_per_user\"})\n or max by (cluster, 
namespace, limit_name) (tempo_limits_defaults{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",limit_name=\"max_local_traces_per_user\"})\n) by (max_local_traces_per_user)\n", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "local limit", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -528,10 +510,8 @@ "expr": "sum(rate(tempo_query_frontend_queries_total{cluster=~\"$cluster\", job=~\"($namespace)/query-frontend\",tenant=\"$tenant\",op=\"traces\"}[$__rate_interval])) by (status)", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "{{ status }}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -615,10 +595,8 @@ "expr": "sum(rate(tempo_query_frontend_queries_total{cluster=~\"$cluster\", job=~\"($namespace)/query-frontend\",tenant=\"$tenant\",op=\"search\"}[$__rate_interval])) by (status)", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "{{ status }}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -708,10 +686,8 @@ "expr": "avg(tempodb_blocklist_length{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",tenant=\"$tenant\"})", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "length", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -789,10 +765,8 @@ "expr": "sum(tempodb_compaction_outstanding_blocks{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",tenant=\"$tenant\"})\n/\ncount(tempo_build_info{cluster=~\"$cluster\", job=~\"($namespace)/compactor\"})\n", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "blocks", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -882,10 +856,8 @@ "expr": "sum(rate(tempo_metrics_generator_bytes_received_total{cluster=~\"$cluster\", job=~\"($namespace)/metrics-generator\",tenant=\"$tenant\"}[$__rate_interval]))", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "rate", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -973,19 +945,15 @@ "expr": "sum(tempo_metrics_generator_registry_active_series{cluster=~\"$cluster\", job=~\"($namespace)/metrics-generator\",tenant=\"$tenant\"})", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "{{ tenant }}", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "max(\n max by (cluster, namespace, limit_name) (tempo_limits_overrides{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",user=\"$tenant\",limit_name=\"metrics_generator_max_active_series\"})\n or max by (cluster, namespace, limit_name) (tempo_limits_defaults{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",limit_name=\"metrics_generator_max_active_series\"})\n) by (metrics_generator_max_active_series)\n", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "limit", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -1050,7 +1018,7 @@ "value": "default" }, "hide": 0, - "label": "Data Source", + "label": "Data source", "name": "datasource", "options": [ diff --git a/operations/tempo-mixin-compiled/dashboards/tempo-writes.json b/operations/tempo-mixin-compiled/dashboards/tempo-writes.json index 297a54146a5..5d9b29bbd4a 100644 --- a/operations/tempo-mixin-compiled/dashboards/tempo-writes.json +++ b/operations/tempo-mixin-compiled/dashboards/tempo-writes.json @@ 
-35,6 +35,8 @@ "3xx": "#6ED0E0", "4xx": "#EF843C", "5xx": "#E24D42", + "OK": "#7EB26D", + "cancel": "#A9A9A9", "error": "#E24D42", "success": "#7EB26D" }, @@ -72,13 +74,11 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(tempo_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/cortex-gw(-internal)?\", route=\"opentelemetry_proto_collector_trace_v1_traceservice_export\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(tempo_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/cortex-gw(-internal)?\", route=\"opentelemetry_proto_collector_trace_v1_traceservice_export\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "{{status}}", - "refId": "A", - "step": 10 + "refId": "A" } ], "thresholds": [ @@ -279,10 +279,8 @@ "expr": "sum by (grpc_status) (\n rate(\n label_replace(\n {cluster=~\"$cluster\", job=~\"($namespace)/cortex-gw(-internal)?\", __name__=~\"envoy_cluster_grpc_proto_collector_trace_v1_TraceService_[0-9]+\"},\n \"grpc_status\", \"$1\", \"__name__\", \"envoy_cluster_grpc_proto_collector_trace_v1_TraceService_(.+)\"\n )\n [$__interval:$__interval]\n )\n)\n", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "{{grpc_status}}", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -402,19 +400,15 @@ "expr": "sum(rate(tempo_receiver_accepted_spans{cluster=~\"$cluster\", job=~\"($namespace)/distributor\"}[$__rate_interval]))", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "accepted", - "legendLink": null, - "step": 10 + "legendLink": null }, { "expr": "sum(rate(tempo_receiver_refused_spans{cluster=~\"$cluster\", job=~\"($namespace)/distributor\"}[$__rate_interval]))", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "refused", - "legendLink": null, - "step": 10 + "legendLink": null } ], "thresholds": [ @@ -581,6 +575,8 @@ "3xx": "#6ED0E0", "4xx": "#EF843C", "5xx": "#E24D42", + "OK": "#7EB26D", + "cancel": "#A9A9A9", "error": "#E24D42", "success": "#7EB26D" }, @@ -618,13 +614,11 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(tempo_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/ingester\", route=~\"/tempopb.Pusher/Push.*\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(tempo_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/ingester\", route=~\"/tempopb.Pusher/Push.*\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "{{status}}", - "refId": "A", - "step": 10 + "refId": "A" } ], "thresholds": [ @@ -791,6 +785,8 @@ "3xx": "#6ED0E0", "4xx": "#EF843C", "5xx": "#E24D42", + "OK": "#7EB26D", + "cancel": "#A9A9A9", "error": "#E24D42", "success": "#7EB26D" }, @@ -828,13 +824,11 @@ "steppedLine": false, "targets": [ { - 
"expr": "sum by (status) (\n label_replace(label_replace(rate(tempo_memcache_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/ingester\",method=\"Memcache.Put\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(tempo_memcache_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/ingester\",method=\"Memcache.Put\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "{{status}}", - "refId": "A", - "step": 10 + "refId": "A" } ], "thresholds": [ @@ -1001,6 +995,8 @@ "3xx": "#6ED0E0", "4xx": "#EF843C", "5xx": "#E24D42", + "OK": "#7EB26D", + "cancel": "#A9A9A9", "error": "#E24D42", "success": "#7EB26D" }, @@ -1038,13 +1034,11 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(tempodb_backend_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/ingester\",operation=~\"(PUT|POST)\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(tempodb_backend_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/ingester\",operation=~\"(PUT|POST)\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "{{status}}", - "refId": "A", - "step": 10 + "refId": "A" } ], "thresholds": [ @@ -1211,6 +1205,8 @@ "3xx": "#6ED0E0", "4xx": "#EF843C", "5xx": "#E24D42", + "OK": "#7EB26D", + "cancel": "#A9A9A9", "error": "#E24D42", "success": "#7EB26D" }, @@ -1248,13 +1244,11 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(tempo_memcache_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",method=\"Memcache.Put\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(tempo_memcache_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",method=\"Memcache.Put\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "{{status}}", - "refId": "A", - "step": 10 + "refId": "A" } ], "thresholds": [ @@ -1421,6 +1415,8 @@ "3xx": "#6ED0E0", "4xx": "#EF843C", "5xx": "#E24D42", + "OK": "#7EB26D", + "cancel": "#A9A9A9", "error": "#E24D42", "success": "#7EB26D" }, @@ -1458,13 +1454,11 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(tempodb_backend_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",operation=~\"(PUT|POST)\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-z]+)\"))\n", + "expr": "sum by (status) (\n 
label_replace(label_replace(rate(tempodb_backend_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/compactor\",operation=~\"(PUT|POST)\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "interval": "1m", - "intervalFactor": 2, "legendFormat": "{{status}}", - "refId": "A", - "step": 10 + "refId": "A" } ], "thresholds": [ @@ -1634,7 +1628,7 @@ "value": "default" }, "hide": 0, - "label": "Data Source", + "label": "Data source", "name": "datasource", "options": [ diff --git a/operations/tempo-mixin/alerts.libsonnet b/operations/tempo-mixin/alerts.libsonnet index 2ef88cca662..8618f80ad41 100644 --- a/operations/tempo-mixin/alerts.libsonnet +++ b/operations/tempo-mixin/alerts.libsonnet @@ -199,7 +199,7 @@ alert: 'TempoProvisioningTooManyWrites', // 30MB/s written to the WAL per ingester max expr: ||| - avg by (%s) (rate(tempo_ingester_bytes_received_total{job=~".+/ingester"}[1m])) / 1024 / 1024 > 30 + avg by (%s) (rate(tempo_ingester_bytes_received_total{job=~".+/ingester"}[5m])) / 1024 / 1024 > 30 ||| % $._config.group_by_cluster, 'for': '15m', labels: { diff --git a/operations/tempo-mixin/config.libsonnet b/operations/tempo-mixin/config.libsonnet index 4cfe1ff73ad..ea37414e1b1 100644 --- a/operations/tempo-mixin/config.libsonnet +++ b/operations/tempo-mixin/config.libsonnet @@ -43,5 +43,11 @@ group_by_cluster: makeGroupBy($._config.cluster_selectors), group_by_job: makeGroupBy($._config.job_selectors), group_by_tenant: makeGroupBy($._config.tenant_selectors), + + // Tunes histogram recording rules to aggregate over this interval. + // Set to at least twice the scrape interval; otherwise, recording rules will output no data. 
+ // Set to four times the scrape interval to account for edge cases: https://www.robustperception.io/what-range-should-i-use-with-rate/ + recording_rules_range_interval: '1m', + }, } diff --git a/operations/tempo-mixin/jsonnetfile.lock.json b/operations/tempo-mixin/jsonnetfile.lock.json index ac9ea626d82..6324ac7f29b 100644 --- a/operations/tempo-mixin/jsonnetfile.lock.json +++ b/operations/tempo-mixin/jsonnetfile.lock.json @@ -8,8 +8,8 @@ "subdir": "grafana-builder" } }, - "version": "84900d9dc450116ad66864f48088f92ccae36c54", - "sum": "0KkygBQd/AFzUvVzezE4qF/uDYgrwUXVpZfINBti0oc=" + "version": "167b75f241cb61513b399cad7f87052108a26b85", + "sum": "EEPwMLfUIJT9iEUI/gCW9x6PxWoTBPSJOfabTF4rp1M=" }, { "source": { @@ -18,8 +18,8 @@ "subdir": "mixin-utils" } }, - "version": "84900d9dc450116ad66864f48088f92ccae36c54", - "sum": "v6fuqqQp9rHZbsxN9o79QzOpUlwYZEJ84DxTCZMCYeU=" + "version": "167b75f241cb61513b399cad7f87052108a26b85", + "sum": "Qg992ZB0jkrS+YLq0Q7RV1fSHa8+hQT0jbpTyCGE2NI=" } ], "legacyImports": false diff --git a/operations/tempo-mixin/rules.libsonnet b/operations/tempo-mixin/rules.libsonnet index d7671239e7e..8ad0ffe8d21 100644 --- a/operations/tempo-mixin/rules.libsonnet +++ b/operations/tempo-mixin/rules.libsonnet @@ -5,7 +5,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; groups+: [{ name: 'tempo_rules', rules: - utils.histogramRules('tempo_request_duration_seconds', $._config.job_selectors + ['route']), + utils.histogramRules('tempo_request_duration_seconds', $._config.job_selectors + ['route'], $._config.recording_rules_range_interval), }], }, } diff --git a/operations/tempo-mixin/vendor/github.com/grafana/jsonnet-libs/grafana-builder/grafana.libsonnet b/operations/tempo-mixin/vendor/github.com/grafana/jsonnet-libs/grafana-builder/grafana.libsonnet index c98e2b36061..435575803aa 100644 --- a/operations/tempo-mixin/vendor/github.com/grafana/jsonnet-libs/grafana-builder/grafana.libsonnet +++ b/operations/tempo-mixin/vendor/github.com/grafana/jsonnet-libs/grafana-builder/grafana.libsonnet @@ -1,5 +1,7 @@ +local utils = import 'mixin-utils/utils.libsonnet'; + { - dashboard(title, uid='', datasource='default'):: { + dashboard(title, uid='', datasource='default', datasource_regex=''):: { // Stuff that isn't materialised. 
_nextPanel:: 1, addRow(row):: self { @@ -13,7 +15,7 @@ rows+: [row { panels: panels }], }, - addTemplate(name, metric_name, label_name, hide=0, allValue=null):: self { + addTemplate(name, metric_name, label_name, hide=0, allValue=null, includeAll=false, sort=2):: self { templating+: { list+: [{ allValue: allValue, @@ -23,7 +25,7 @@ }, datasource: '$datasource', hide: hide, - includeAll: false, + includeAll: includeAll, label: name, multi: false, name: name, @@ -31,7 +33,7 @@ query: 'label_values(%s, %s)' % [metric_name, label_name], refresh: 1, regex: '', - sort: 2, + sort: sort, tagValuesQuery: '', tags: [], tagsQuery: '', @@ -41,7 +43,7 @@ }, }, - addMultiTemplate(name, metric_name, label_name, hide=0, allValue='.+'):: self { + addMultiTemplate(name, metric_name, label_name, hide=0, allValue='.+', sort=2):: self { templating+: { list+: [{ allValue: allValue, @@ -60,7 +62,7 @@ query: 'label_values(%s, %s)' % [metric_name, label_name], refresh: 1, regex: '', - sort: 2, + sort: sort, tagValuesQuery: '', tags: [], tagsQuery: '', @@ -70,6 +72,40 @@ }, }, + addShowNativeLatencyVariable():: self { + templating+: { + list+: [{ + current: { + selected: true, + text: 'classic', + value: '1', + }, + description: 'Choose between showing latencies based on low precision classic or high precision native histogram metrics.', + hide: 0, + includeAll: false, + label: 'Latency metrics', + multi: false, + name: 'latency_metrics', + query: 'native : -1,classic : 1', + options: [ + { + selected: false, + text: 'native', + value: '-1', + }, + { + selected: true, + text: 'classic', + value: '1', + }, + ], + skipUrlSync: false, + type: 'custom', + useTags: false, + }], + }, + }, + dashboardLinkUrl(title, url):: self { links+: [ { @@ -109,12 +145,12 @@ value: datasource, }, hide: 0, - label: 'Data Source', + label: 'Data source', name: 'datasource', options: [], query: 'prometheus', refresh: 1, - regex: '', + regex: datasource_regex, type: 'datasource', }, ], @@ -178,6 +214,7 @@ titleSize: 'h6', }, + // "graph" type, now deprecated. panel(title):: { aliasColors: {}, bars: false, @@ -228,6 +265,46 @@ yaxes: $.yaxes('short'), }, + // "timeseries" panel, introduced with Grafana 7.4 and made standard in 8.0. 
+ timeseriesPanel(title):: { + datasource: '$datasource', + fieldConfig: { + defaults: { + custom: { + drawStyle: 'line', + fillOpacity: 1, + lineWidth: 1, + pointSize: 5, + showPoints: 'never', + spanNulls: false, + stacking: { + group: 'A', + mode: 'none', + }, + }, + thresholds: { + mode: 'absolute', + steps: [], + }, + unit: 's', + }, + overrides: [], + }, + options: { + legend: { + showLegend: true, + }, + tooltip: { + mode: 'single', + sort: 'none', + }, + }, + links: [], + targets: [], + title: title, + type: 'timeseries', + }, + queryPanel(queries, legends, legendLink=null):: { local qs = @@ -248,9 +325,7 @@ legendLink: legendLink, expr: ql.q, format: 'time_series', - intervalFactor: 2, legendFormat: ql.l, - step: 10, } for ql in qsandls ], @@ -265,7 +340,6 @@ expr: query, format: 'time_series', instant: true, - intervalFactor: 2, refId: 'A', }, ], @@ -328,9 +402,7 @@ expr: qs[i], format: 'table', instant: true, - intervalFactor: 2, legendFormat: '', - step: 10, refId: std.char(65 + i), } for i in std.range(0, std.length(qs) - 1) @@ -384,16 +456,20 @@ }, ], + httpStatusColors:: { + '1xx': '#EAB839', + '2xx': '#7EB26D', + '3xx': '#6ED0E0', + '4xx': '#EF843C', + '5xx': '#E24D42', + OK: '#7EB26D', + success: '#7EB26D', + 'error': '#E24D42', + cancel: '#A9A9A9', + }, + qpsPanel(selector, statusLabelName='status_code'):: { - aliasColors: { - '1xx': '#EAB839', - '2xx': '#7EB26D', - '3xx': '#6ED0E0', - '4xx': '#EF843C', - '5xx': '#E24D42', - success: '#7EB26D', - 'error': '#E24D42', - }, + aliasColors: $.httpStatusColors, targets: [ { expr: @@ -401,13 +477,70 @@ sum by (status) ( label_replace(label_replace(rate(%s[$__rate_interval]), "status", "${1}xx", "%s", "([0-9]).."), - "status", "${1}", "%s", "([a-z]+)")) + "status", "${1}", "%s", "([a-zA-Z]+)")) ||| % [selector, statusLabelName, statusLabelName], format: 'time_series', - intervalFactor: 2, legendFormat: '{{status}}', refId: 'A', - step: 10, + }, + ], + } + $.stack, + + // Assumes that the metricName is for a histogram (as opposed to qpsPanel above) + // Assumes that there is a dashboard variable named latency_metrics, values are -1 (native) or 1 (classic) + qpsPanelNativeHistogram(metricName, selector, statusLabelName='status_code'):: { + local sumByStatus(nativeClassicQuery) = { + local template = + ||| + sum by (status) ( + label_replace(label_replace(%(metricQuery)s, + "status", "${1}xx", "%(label)s", "([0-9]).."), + "status", "${1}", "%(label)s", "([a-zA-Z]+)")) + |||, + native: template % { metricQuery: nativeClassicQuery.native, label: statusLabelName }, + classic: template % { metricQuery: nativeClassicQuery.classic, label: statusLabelName }, + }, + fieldConfig+: { + defaults+: { + custom+: { + lineWidth: 0, + fillOpacity: 100, // Get solid fill. 
+ stacking: { + mode: 'normal', + group: 'A', + }, + }, + unit: 'reqps', + min: 0, + }, + overrides+: [{ + matcher: { + id: 'byName', + options: status, + }, + properties: [ + { + id: 'color', + value: { + mode: 'fixed', + fixedColor: $.httpStatusColors[status], + }, + }, + ], + } for status in std.objectFieldsAll($.httpStatusColors)], + }, + targets: [ + { + expr: utils.showClassicHistogramQuery(sumByStatus(utils.ncHistogramCountRate(metricName, selector))), + format: 'time_series', + legendFormat: '{{status}}', + refId: 'A_classic', + }, + { + expr: utils.showNativeHistogramQuery(sumByStatus(utils.ncHistogramCountRate(metricName, selector))), + format: 'time_series', + legendFormat: '{{status}}', + refId: 'A', }, ], } + $.stack, @@ -418,26 +551,72 @@ { expr: 'histogram_quantile(0.99, sum(rate(%s_bucket%s[$__rate_interval])) by (le)) * %s' % [metricName, selector, multiplier], format: 'time_series', - intervalFactor: 2, legendFormat: '99th Percentile', refId: 'A', - step: 10, }, { expr: 'histogram_quantile(0.50, sum(rate(%s_bucket%s[$__rate_interval])) by (le)) * %s' % [metricName, selector, multiplier], format: 'time_series', - intervalFactor: 2, legendFormat: '50th Percentile', refId: 'B', - step: 10, }, { expr: 'sum(rate(%s_sum%s[$__rate_interval])) * %s / sum(rate(%s_count%s[$__rate_interval]))' % [metricName, selector, multiplier, metricName, selector], format: 'time_series', - intervalFactor: 2, legendFormat: 'Average', refId: 'C', - step: 10, + }, + ], + yaxes: $.yaxes('ms'), + }, + + // Assumes that there is a dashboard variable named latency_metrics, values are -1 (native) or 1 (classic) + latencyPanelNativeHistogram(metricName, selector, multiplier='1e3'):: { + nullPointMode: 'null as zero', + fieldConfig+: { + defaults+: { + custom+: { + fillOpacity: 10, + }, + unit: 'ms', + }, + }, + targets: [ + { + expr: utils.showNativeHistogramQuery(utils.ncHistogramQuantile('0.99', metricName, selector, multiplier=multiplier)), + format: 'time_series', + legendFormat: '99th percentile', + refId: 'A', + }, + { + expr: utils.showClassicHistogramQuery(utils.ncHistogramQuantile('0.99', metricName, selector, multiplier=multiplier)), + format: 'time_series', + legendFormat: '99th percentile', + refId: 'A_classic', + }, + { + expr: utils.showNativeHistogramQuery(utils.ncHistogramQuantile('0.50', metricName, selector, multiplier=multiplier)), + format: 'time_series', + legendFormat: '50th percentile', + refId: 'B', + }, + { + expr: utils.showClassicHistogramQuery(utils.ncHistogramQuantile('0.50', metricName, selector, multiplier=multiplier)), + format: 'time_series', + legendFormat: '50th percentile', + refId: 'B_classic', + }, + { + expr: utils.showNativeHistogramQuery(utils.ncHistogramAverageRate(metricName, selector, multiplier=multiplier)), + format: 'time_series', + legendFormat: 'Average', + refId: 'C', + }, + { + expr: utils.showClassicHistogramQuery(utils.ncHistogramAverageRate(metricName, selector, multiplier=multiplier)), + format: 'time_series', + legendFormat: 'Average', + refId: 'C_classic', }, ], yaxes: $.yaxes('ms'), diff --git a/operations/tempo-mixin/vendor/github.com/grafana/jsonnet-libs/mixin-utils/test/.gitignore b/operations/tempo-mixin/vendor/github.com/grafana/jsonnet-libs/mixin-utils/test/.gitignore new file mode 100644 index 00000000000..80243dae6ba --- /dev/null +++ b/operations/tempo-mixin/vendor/github.com/grafana/jsonnet-libs/mixin-utils/test/.gitignore @@ -0,0 +1,2 @@ +vendor/ +jsonnetfile.lock.json diff --git 
a/operations/tempo-mixin/vendor/github.com/grafana/jsonnet-libs/mixin-utils/test/Makefile b/operations/tempo-mixin/vendor/github.com/grafana/jsonnet-libs/mixin-utils/test/Makefile new file mode 100644 index 00000000000..ea444b444c3 --- /dev/null +++ b/operations/tempo-mixin/vendor/github.com/grafana/jsonnet-libs/mixin-utils/test/Makefile @@ -0,0 +1,7 @@ +.PHONY: tests + +vendor jsonnetfile.lock.json: jsonnetfile.json + jb install + +tests: jsonnetfile.lock.json vendor + jsonnet -J vendor/ test_*.libsonnet diff --git a/operations/tempo-mixin/vendor/github.com/grafana/jsonnet-libs/mixin-utils/test/jsonnetfile.json b/operations/tempo-mixin/vendor/github.com/grafana/jsonnet-libs/mixin-utils/test/jsonnetfile.json new file mode 100644 index 00000000000..c146841952a --- /dev/null +++ b/operations/tempo-mixin/vendor/github.com/grafana/jsonnet-libs/mixin-utils/test/jsonnetfile.json @@ -0,0 +1,14 @@ +{ + "version": 1, + "dependencies": [ + { + "source": { + "git": { + "remote": "https://github.com/jsonnet-libs/testonnet.git" + } + }, + "version": "master" + } + ], + "legacyImports": true +} diff --git a/operations/tempo-mixin/vendor/github.com/grafana/jsonnet-libs/mixin-utils/test/test_native-classic-histogram.libsonnet b/operations/tempo-mixin/vendor/github.com/grafana/jsonnet-libs/mixin-utils/test/test_native-classic-histogram.libsonnet new file mode 100644 index 00000000000..6c1a3fbb34b --- /dev/null +++ b/operations/tempo-mixin/vendor/github.com/grafana/jsonnet-libs/mixin-utils/test/test_native-classic-histogram.libsonnet @@ -0,0 +1,191 @@ +local utils = import '../utils.libsonnet'; +local test = import 'github.com/jsonnet-libs/testonnet/main.libsonnet'; + +test.new(std.thisFile) + ++ test.case.new( + name='Quantile defaults', + test=test.expect.eq( + actual=utils.ncHistogramQuantile('0.95', 'request_duration_seconds', 'cluster="cluster1", job="job1"'), + expected={ + classic: 'histogram_quantile(0.95, sum by (le) (rate(request_duration_seconds_bucket{cluster="cluster1", job="job1"}[$__rate_interval])))', + native: 'histogram_quantile(0.95, sum (rate(request_duration_seconds{cluster="cluster1", job="job1"}[$__rate_interval])))', + }, + ) +) ++ test.case.new( + name='Quantile different groups, interval, multiplier', + test=test.expect.eq( + actual=utils.ncHistogramQuantile('0.95', 'request_duration_seconds', 'cluster="cluster1", job="job1"', ['namespace', 'route'], '5m', '42'), + expected={ + classic: 'histogram_quantile(0.95, sum by (le,namespace,route) (rate(request_duration_seconds_bucket{cluster="cluster1", job="job1"}[5m]))) * 42', + native: 'histogram_quantile(0.95, sum by (namespace,route) (rate(request_duration_seconds{cluster="cluster1", job="job1"}[5m]))) * 42', + }, + ) +) ++ test.case.new( + name='Quantile in recording rule with different groups, interval, multiplier, rate', + test=test.expect.eq( + actual=utils.ncHistogramQuantile('0.95', 'request_duration_seconds', 'cluster="cluster1", job="job1"', ['namespace', 'route'], '5m', '42', true), + expected={ + classic: 'histogram_quantile(0.95, sum by (le,namespace,route) (request_duration_seconds_bucket:sum_rate{cluster="cluster1", job="job1"})) * 42', + native: 'histogram_quantile(0.95, sum by (namespace,route) (request_duration_seconds:sum_rate{cluster="cluster1", job="job1"})) * 42', + }, + ) +) + ++ test.case.new( + name='rate of sum defaults', + test=test.expect.eq( + actual=utils.ncHistogramSumRate('request_duration_seconds', 'cluster="cluster1", job="job1"'), + expected={ + classic: 
'rate(request_duration_seconds_sum{cluster="cluster1", job="job1"}[$__rate_interval])', + native: 'histogram_sum(rate(request_duration_seconds{cluster="cluster1", job="job1"}[$__rate_interval]))', + }, + ) +) ++ test.case.new( + name='rate of sum with different interval', + test=test.expect.eq( + actual=utils.ncHistogramSumRate('request_duration_seconds', 'cluster="cluster1", job="job1"', '5m'), + expected={ + classic: 'rate(request_duration_seconds_sum{cluster="cluster1", job="job1"}[5m])', + native: 'histogram_sum(rate(request_duration_seconds{cluster="cluster1", job="job1"}[5m]))', + }, + ) +) ++ test.case.new( + name='rate of sum in recording rule with different interval', + test=test.expect.eq( + actual=utils.ncHistogramSumRate('request_duration_seconds', 'cluster="cluster1", job="job1"', '5m', true), + expected={ + classic: 'request_duration_seconds_sum:sum_rate{cluster="cluster1", job="job1"}', + native: 'histogram_sum(request_duration_seconds:sum_rate{cluster="cluster1", job="job1"})', + }, + ) +) + ++ test.case.new( + name='rate of count defaults', + test=test.expect.eq( + actual=utils.ncHistogramCountRate('request_duration_seconds', 'cluster="cluster1", job="job1"'), + expected={ + classic: 'rate(request_duration_seconds_count{cluster="cluster1", job="job1"}[$__rate_interval])', + native: 'histogram_count(rate(request_duration_seconds{cluster="cluster1", job="job1"}[$__rate_interval]))', + }, + ) +) ++ test.case.new( + name='rate of count with different interval', + test=test.expect.eq( + actual=utils.ncHistogramCountRate('request_duration_seconds', 'cluster="cluster1", job="job1"', '5m'), + expected={ + classic: 'rate(request_duration_seconds_count{cluster="cluster1", job="job1"}[5m])', + native: 'histogram_count(rate(request_duration_seconds{cluster="cluster1", job="job1"}[5m]))', + }, + ) +) ++ test.case.new( + name='rate of count in recording rule with different interval', + test=test.expect.eq( + actual=utils.ncHistogramCountRate('request_duration_seconds', 'cluster="cluster1", job="job1"', '5m', true), + expected={ + classic: 'request_duration_seconds_count:sum_rate{cluster="cluster1", job="job1"}', + native: 'histogram_count(request_duration_seconds:sum_rate{cluster="cluster1", job="job1"})', + }, + ) +) + ++ test.case.new( + name='rate of average defaults', + test=test.expect.eq( + actual=utils.ncHistogramAverageRate('request_duration_seconds', 'cluster="cluster1", job="job1"'), + expected={ + classic: 'sum(rate(request_duration_seconds_sum{cluster="cluster1", job="job1"}[$__rate_interval])) /\nsum(rate(request_duration_seconds_count{cluster="cluster1", job="job1"}[$__rate_interval]))\n', + native: 'sum(histogram_sum(rate(request_duration_seconds{cluster="cluster1", job="job1"}[$__rate_interval]))) /\nsum(histogram_count(rate(request_duration_seconds{cluster="cluster1", job="job1"}[$__rate_interval])))\n', + }, + ) +) ++ test.case.new( + name='rate of average with different interval, multiplier', + test=test.expect.eq( + actual=utils.ncHistogramAverageRate('request_duration_seconds', 'cluster="cluster1", job="job1"', '5m', '42'), + expected={ + classic: '42 * sum(rate(request_duration_seconds_sum{cluster="cluster1", job="job1"}[5m])) /\nsum(rate(request_duration_seconds_count{cluster="cluster1", job="job1"}[5m]))\n', + native: '42 * sum(histogram_sum(rate(request_duration_seconds{cluster="cluster1", job="job1"}[5m]))) /\nsum(histogram_count(rate(request_duration_seconds{cluster="cluster1", job="job1"}[5m])))\n', + }, + ) +) ++ test.case.new( + name='rate of average in 
recording rule with different interval, multiplier', + test=test.expect.eq( + actual=utils.ncHistogramAverageRate('request_duration_seconds', 'cluster="cluster1", job="job1"', '5m', '42', true), + expected={ + classic: '42 * sum(request_duration_seconds_sum:sum_rate{cluster="cluster1", job="job1"}) /\nsum(request_duration_seconds_count:sum_rate{cluster="cluster1", job="job1"})\n', + native: '42 * sum(histogram_sum(request_duration_seconds:sum_rate{cluster="cluster1", job="job1"})) /\nsum(histogram_count(request_duration_seconds:sum_rate{cluster="cluster1", job="job1"}))\n', + }, + ) +) + ++ test.case.new( + name='histogram sum by defaults', + test=test.expect.eq( + actual=utils.ncHistogramSumBy(utils.ncHistogramCountRate('request_duration_seconds_sum', '{cluster="cluster1", job="job1"}')), + expected={ + classic: 'sum (rate(request_duration_seconds_sum_count{{cluster="cluster1", job="job1"}}[$__rate_interval]))', + native: 'sum (histogram_count(rate(request_duration_seconds_sum{{cluster="cluster1", job="job1"}}[$__rate_interval])))', + }, + ) +) ++ test.case.new( + name='histogram sum by with different labels and multiplier', + test=test.expect.eq( + actual=utils.ncHistogramSumBy(utils.ncHistogramCountRate('request_duration_seconds_sum', '{cluster="cluster1", job="job1"}'), ['namespace', 'route'], '42'), + expected={ + classic: 'sum by (namespace, route) (rate(request_duration_seconds_sum_count{{cluster="cluster1", job="job1"}}[$__rate_interval])) * 42', + native: 'sum by (namespace, route) (histogram_count(rate(request_duration_seconds_sum{{cluster="cluster1", job="job1"}}[$__rate_interval]))) * 42', + }, + ) +) + ++ test.case.new( + name='histogram le rate defaults and le is float', + test=test.expect.eq( + actual=utils.ncHistogramLeRate('request_duration_seconds', 'cluster="cluster1", job="job1"', '0.1'), + expected={ + classic: 'rate(request_duration_seconds_bucket{cluster="cluster1", job="job1", le=~"0.1"}[$__rate_interval])', + native: 'histogram_fraction(0, 0.1, rate(request_duration_seconds{cluster="cluster1", job="job1"}[$__rate_interval]))*histogram_count(rate(request_duration_seconds{cluster="cluster1", job="job1"}[$__rate_interval]))', + }, + ) +) ++ test.case.new( + name='histogram le rate defaults and le is whole', + test=test.expect.eq( + actual=utils.ncHistogramLeRate('request_duration_seconds', 'cluster="cluster1", job="job1"', '10'), + expected={ + classic: 'rate(request_duration_seconds_bucket{cluster="cluster1", job="job1", le=~"10|10\\\\.0"}[$__rate_interval])', + native: 'histogram_fraction(0, 10.0, rate(request_duration_seconds{cluster="cluster1", job="job1"}[$__rate_interval]))*histogram_count(rate(request_duration_seconds{cluster="cluster1", job="job1"}[$__rate_interval]))', + }, + ) +) ++ test.case.new( + name='histogram le rate defaults and le is float with different interval', + test=test.expect.eq( + actual=utils.ncHistogramLeRate('request_duration_seconds', 'cluster="cluster1", job="job1"', '0.1', '5m'), + expected={ + classic: 'rate(request_duration_seconds_bucket{cluster="cluster1", job="job1", le=~"0.1"}[5m])', + native: 'histogram_fraction(0, 0.1, rate(request_duration_seconds{cluster="cluster1", job="job1"}[5m]))*histogram_count(rate(request_duration_seconds{cluster="cluster1", job="job1"}[5m]))', + }, + ) +) + ++ test.case.new( + name='commenting histogram query', + test=test.expect.eq( + actual=utils.ncHistogramComment({ classic: 'classic_query', native: 'native_query' }, 'comment\n'), + expected={ + classic: 'comment\nclassic_query\n', + native: 
'comment\nnative_query\n', + }, + ) +) diff --git a/operations/tempo-mixin/vendor/github.com/grafana/jsonnet-libs/mixin-utils/utils.libsonnet b/operations/tempo-mixin/vendor/github.com/grafana/jsonnet-libs/mixin-utils/utils.libsonnet index 4d9469b7439..ada9574b15f 100644 --- a/operations/tempo-mixin/vendor/github.com/grafana/jsonnet-libs/mixin-utils/utils.libsonnet +++ b/operations/tempo-mixin/vendor/github.com/grafana/jsonnet-libs/mixin-utils/utils.libsonnet @@ -1,38 +1,229 @@ local g = import 'grafana-builder/grafana.libsonnet'; { - histogramRules(metric, labels):: + // The ncHistogramQuantile (native classic histogram quantile) function is + // used to calculate histogram quantiles from native histograms or classic + // histograms. Metric name should be provided without _bucket suffix. + // If from_recording is true, the function will assume :sum_rate metric + // suffix and no rate needed. + ncHistogramQuantile(percentile, metric, selector, sum_by=[], rate_interval='$__rate_interval', multiplier='', from_recording=false):: + local classicSumBy = if std.length(sum_by) > 0 then ' by (%(lbls)s) ' % { lbls: std.join(',', ['le'] + sum_by) } else ' by (le) '; + local nativeSumBy = if std.length(sum_by) > 0 then ' by (%(lbls)s) ' % { lbls: std.join(',', sum_by) } else ' '; + local multiplierStr = if multiplier == '' then '' else ' * %s' % multiplier; + local rateOpen = if from_recording then '' else 'rate('; + local rateClose = if from_recording then '' else '[%s])' % rate_interval; + { + classic: 'histogram_quantile(%(percentile)s, sum%(classicSumBy)s(%(rateOpen)s%(metric)s_bucket%(suffix)s{%(selector)s}%(rateClose)s))%(multiplierStr)s' % { + classicSumBy: classicSumBy, + metric: metric, + multiplierStr: multiplierStr, + percentile: percentile, + rateInterval: rate_interval, + rateOpen: rateOpen, + rateClose: rateClose, + selector: selector, + suffix: if from_recording then ':sum_rate' else '', + }, + native: 'histogram_quantile(%(percentile)s, sum%(nativeSumBy)s(%(rateOpen)s%(metric)s%(suffix)s{%(selector)s}%(rateClose)s))%(multiplierStr)s' % { + metric: metric, + multiplierStr: multiplierStr, + nativeSumBy: nativeSumBy, + percentile: percentile, + rateInterval: rate_interval, + rateOpen: rateOpen, + rateClose: rateClose, + selector: selector, + suffix: if from_recording then ':sum_rate' else '', + }, + }, + + // The ncHistogramSumRate (native classic histogram sum rate) function is + // used to calculate the histogram rate of the sum from native histograms or + // classic histograms. Metric name should be provided without _sum suffix. + // If from_recording is true, the function will assume :sum_rate metric + // suffix and no rate needed. 
+ ncHistogramSumRate(metric, selector, rate_interval='$__rate_interval', from_recording=false):: + local rateOpen = if from_recording then '' else 'rate('; + local rateClose = if from_recording then '' else '[%s])' % rate_interval; + { + classic: '%(rateOpen)s%(metric)s_sum%(suffix)s{%(selector)s}%(rateClose)s' % { + metric: metric, + rateInterval: rate_interval, + rateOpen: rateOpen, + rateClose: rateClose, + selector: selector, + suffix: if from_recording then ':sum_rate' else '', + }, + native: 'histogram_sum(%(rateOpen)s%(metric)s%(suffix)s{%(selector)s}%(rateClose)s)' % { + metric: metric, + rateInterval: rate_interval, + rateOpen: rateOpen, + rateClose: rateClose, + selector: selector, + suffix: if from_recording then ':sum_rate' else '', + }, + }, + + + // The ncHistogramCountRate (native classic histogram count rate) function is + // used to calculate the histogram rate of count from native histograms or + // classic histograms. Metric name should be provided without _count suffix. + // If from_recording is true, the function will assume :sum_rate metric + // suffix and no rate needed. + ncHistogramCountRate(metric, selector, rate_interval='$__rate_interval', from_recording=false):: + local rateOpen = if from_recording then '' else 'rate('; + local rateClose = if from_recording then '' else '[%s])' % rate_interval; + { + classic: '%(rateOpen)s%(metric)s_count%(suffix)s{%(selector)s}%(rateClose)s' % { + metric: metric, + rateInterval: rate_interval, + rateOpen: rateOpen, + rateClose: rateClose, + selector: selector, + suffix: if from_recording then ':sum_rate' else '', + }, + native: 'histogram_count(%(rateOpen)s%(metric)s%(suffix)s{%(selector)s}%(rateClose)s)' % { + metric: metric, + rateInterval: rate_interval, + rateOpen: rateOpen, + rateClose: rateClose, + selector: selector, + suffix: if from_recording then ':sum_rate' else '', + }, + }, + + // TODO(krajorama) Switch to histogram_avg function for native histograms later. + // ncHistogramAverageRate (native classic histogram average rate) function is + // used to calculate the histogram average rate from native histograms or + // classic histograms. + // If from_recording is true, the function will assume :sum_rate metric + // suffix and no rate needed. + ncHistogramAverageRate(metric, selector, rate_interval='$__rate_interval', multiplier='', from_recording=false):: + local multiplierStr = if multiplier == '' then '' else '%s * ' % multiplier; + { + classic: ||| + %(multiplier)ssum(%(sumMetricQuery)s) / + sum(%(countMetricQuery)s) + ||| % { + sumMetricQuery: $.ncHistogramSumRate(metric, selector, rate_interval, from_recording).classic, + countMetricQuery: $.ncHistogramCountRate(metric, selector, rate_interval, from_recording).classic, + multiplier: multiplierStr, + }, + native: ||| + %(multiplier)ssum(%(sumMetricQuery)s) / + sum(%(countMetricQuery)s) + ||| % { + sumMetricQuery: $.ncHistogramSumRate(metric, selector, rate_interval, from_recording).native, + countMetricQuery: $.ncHistogramCountRate(metric, selector, rate_interval, from_recording).native, + multiplier: multiplierStr, + }, + }, + + // ncHistogramSumBy (native classic histogram sum by) function is used to + // generate a query that sums the results of a subquery by the given labels. + // The function can be used with native histograms or classic histograms. 
+ ncHistogramSumBy(query, sum_by=[], multiplier=''):: + local sumBy = if std.length(sum_by) > 0 then ' by (%(lbls)s) ' % { lbls: std.join(', ', sum_by) } else ' '; + local multiplierStr = if multiplier == '' then '' else ' * %s' % multiplier; + { + classic: 'sum%(sumBy)s(%(query)s)%(multiplierStr)s' % { + multiplierStr: multiplierStr, + query: query.classic, + sumBy: sumBy, + }, + native: 'sum%(sumBy)s(%(query)s)%(multiplierStr)s' % { + multiplierStr: multiplierStr, + query: query.native, + sumBy: sumBy, + }, + }, + + // ncHistogramLeRate (native classic histogram le rate) calculates the rate + // of requests that have a value less than or equal to the given "le" value. + // The "le" value matcher for classic histograms can handle both Prometheus + // or OpenMetrics formats, where whole numbers may or may not have ".0" at + // the end. + ncHistogramLeRate(metric, selector, le, rate_interval='$__rate_interval'):: + local isWholeNumber(str) = str != '' && std.foldl(function(acc, c) acc && (c == '0' || c == '1' || c == '2' || c == '3' || c == '4' || c == '5' || c == '6' || c == '7' || c == '8' || c == '9'), std.stringChars(str), true); + { + native: 'histogram_fraction(0, %(le)s, rate(%(metric)s{%(selector)s}[%(rateInterval)s]))*histogram_count(rate(%(metric)s{%(selector)s}[%(rateInterval)s]))' % { + le: if isWholeNumber(le) then le + '.0' else le, // Treated as float number. + metric: metric, + rateInterval: rate_interval, + selector: selector, + }, + classic: 'rate(%(metric)s_bucket{%(selector)s, le=~"%(le)s"}[%(rateInterval)s])' % { + // le is treated as string, thus it needs to account for Prometheus text format not having '.0', but OpenMetrics having it. + // Also the resulting string in yaml is stored directly, so the \\ needs to be escaped to \\\\. + le: if isWholeNumber(le) then '%(le)s|%(le)s\\\\.0' % { le: le } else le, + metric: metric, + rateInterval: rate_interval, + selector: selector, + }, + }, + + // ncHistogramComment (native classic histogram comment) helps attach + // comments to the query and also keep multiline strings where applicable. + ncHistogramComment(query, comment):: { + native: ||| + %s%s + ||| % [comment, query.native], + classic: ||| + %s%s + ||| % [comment, query.classic], + }, + + // showClassicHistogramQuery wraps a query defined as map {classic: q, native: q}, and compares the classic query + // to dashboard variable which should take -1 or +1 as values in order to hide or show the classic query. + showClassicHistogramQuery(query, dashboard_variable='latency_metrics'):: '%s < ($%s * +Inf)' % [query.classic, dashboard_variable], + // showNativeHistogramQuery wraps a query defined as map {classic: q, native: q}, and compares the native query + // to dashboard variable which should take -1 or +1 as values in order to show or hide the native query. 
+ showNativeHistogramQuery(query, dashboard_variable='latency_metrics'):: '%s < ($%s * -Inf)' % [query.native, dashboard_variable], + + histogramRules(metric, labels, interval='1m', record_native=false):: local vars = { metric: metric, labels_underscore: std.join('_', labels), labels_comma: std.join(', ', labels), + interval: interval, }; [ { record: '%(labels_underscore)s:%(metric)s:99quantile' % vars, - expr: 'histogram_quantile(0.99, sum(rate(%(metric)s_bucket[1m])) by (le, %(labels_comma)s))' % vars, + expr: 'histogram_quantile(0.99, sum(rate(%(metric)s_bucket[%(interval)s])) by (le, %(labels_comma)s))' % vars, }, { record: '%(labels_underscore)s:%(metric)s:50quantile' % vars, - expr: 'histogram_quantile(0.50, sum(rate(%(metric)s_bucket[1m])) by (le, %(labels_comma)s))' % vars, + expr: 'histogram_quantile(0.50, sum(rate(%(metric)s_bucket[%(interval)s])) by (le, %(labels_comma)s))' % vars, }, { record: '%(labels_underscore)s:%(metric)s:avg' % vars, - expr: 'sum(rate(%(metric)s_sum[1m])) by (%(labels_comma)s) / sum(rate(%(metric)s_count[1m])) by (%(labels_comma)s)' % vars, + expr: 'sum(rate(%(metric)s_sum[%(interval)s])) by (%(labels_comma)s) / sum(rate(%(metric)s_count[%(interval)s])) by (%(labels_comma)s)' % vars, }, { record: '%(labels_underscore)s:%(metric)s_bucket:sum_rate' % vars, - expr: 'sum(rate(%(metric)s_bucket[1m])) by (le, %(labels_comma)s)' % vars, + expr: 'sum(rate(%(metric)s_bucket[%(interval)s])) by (le, %(labels_comma)s)' % vars, }, { record: '%(labels_underscore)s:%(metric)s_sum:sum_rate' % vars, - expr: 'sum(rate(%(metric)s_sum[1m])) by (%(labels_comma)s)' % vars, + expr: 'sum(rate(%(metric)s_sum[%(interval)s])) by (%(labels_comma)s)' % vars, }, { record: '%(labels_underscore)s:%(metric)s_count:sum_rate' % vars, - expr: 'sum(rate(%(metric)s_count[1m])) by (%(labels_comma)s)' % vars, + expr: 'sum(rate(%(metric)s_count[%(interval)s])) by (%(labels_comma)s)' % vars, }, - ], + ] + if record_native then [ + // Native histogram rule, sum_rate contains the following information: + // - rate of sum, + // - rate of count, + // - rate of sum/count aka average, + // - rate of buckets, + // - implicitly the quantile information. + { + record: '%(labels_underscore)s:%(metric)s:sum_rate' % vars, + expr: 'sum(rate(%(metric)s[%(interval)s])) by (%(labels_comma)s)' % vars, + }, + ] else [], // latencyRecordingRulePanel - build a latency panel for a recording rule. 
@@ -65,10 +256,8 @@ local g = import 'grafana-builder/grafana.libsonnet'; sumBy: sumByHisto, }, format: 'time_series', - intervalFactor: 2, - legendFormat: '%(legend)s99th Percentile' % legend, + legendFormat: '%(legend)s99th percentile' % legend, refId: 'A', - step: 10, }, { expr: 'histogram_quantile(0.50, sum by (%(sumBy)s) (%(labels)s:%(metric)s_bucket:sum_rate%(selector)s)) * %(multiplier)s' % { @@ -79,10 +268,8 @@ local g = import 'grafana-builder/grafana.libsonnet'; sumBy: sumByHisto, }, format: 'time_series', - intervalFactor: 2, - legendFormat: '%(legend)s50th Percentile' % legend, + legendFormat: '%(legend)s50th percentile' % legend, refId: 'B', - step: 10, }, { expr: '%(multiplier)s * sum(%(labels)s:%(metric)s_sum:sum_rate%(selector)s)%(sumBy)s / sum(%(labels)s:%(metric)s_count:sum_rate%(selector)s)%(sumBy)s' % { @@ -93,10 +280,8 @@ local g = import 'grafana-builder/grafana.libsonnet'; sumBy: sumBy, }, format: 'time_series', - intervalFactor: 2, legendFormat: '%(legend)sAverage' % legend, refId: 'C', - step: 10, }, ], }, @@ -112,22 +297,83 @@ local g = import 'grafana-builder/grafana.libsonnet'; noop(label):: { label: label, op: 'nop' }, }, - toPrometheusSelector(selector):: + // latencyRecordingRulePanelNativeHistogram - build a latency panel for a recording rule. + // - metric: the base metric name (middle part of recording rule name) + // - selectors: list of selectors which will be added to the first part of + // recording rule name, and to the query selector itself. + // - extra_selectors (optional): list of selectors which will be added to the + // query selector, but not to the beginning of the recording rule name. + // Useful for external labels. + // - multiplier (optional): assumes results are in seconds, will multiply + // by 1e3 to get ms. Can be turned off.
+ // - sum_by (optional): additional labels to use in the sum by clause, will also be used in the legend + latencyRecordingRulePanelNativeHistogram(metric, selectors, extra_selectors=[], multiplier='1e3', sum_by=[]):: + local labels = std.join('_', [matcher.label for matcher in selectors]); + local legend = std.join('', ['{{ %(lb)s }} ' % lb for lb in sum_by]); + local metricStr = '%(labels)s:%(metric)s' % { labels: labels, metric: metric }; + local selectorStr = $.toPrometheusSelectorNaked(selectors + extra_selectors); + { + nullPointMode: 'null as zero', + yaxes: g.yaxes('ms'), + targets: [ + { + expr: $.showClassicHistogramQuery($.ncHistogramQuantile('0.99', metricStr, selectorStr, sum_by=sum_by, multiplier=multiplier, from_recording=true)), + format: 'time_series', + legendFormat: '%(legend)s99th percentile' % legend, + refId: 'A_classic', + }, + { + expr: $.showNativeHistogramQuery($.ncHistogramQuantile('0.99', metricStr, selectorStr, sum_by=sum_by, multiplier=multiplier, from_recording=true)), + format: 'time_series', + legendFormat: '%(legend)s99th percentile' % legend, + refId: 'A_native', + }, + { + expr: $.showClassicHistogramQuery($.ncHistogramQuantile('0.50', metricStr, selectorStr, sum_by=sum_by, multiplier=multiplier, from_recording=true)), + format: 'time_series', + legendFormat: '%(legend)s50th percentile' % legend, + refId: 'B_classic', + }, + { + expr: $.showNativeHistogramQuery($.ncHistogramQuantile('0.50', metricStr, selectorStr, sum_by=sum_by, multiplier=multiplier, from_recording=true)), + format: 'time_series', + legendFormat: '%(legend)s50th percentile' % legend, + refId: 'B_native', + }, + { + expr: $.showClassicHistogramQuery($.ncHistogramAverageRate(metricStr, selectorStr, multiplier=multiplier, from_recording=true)), + format: 'time_series', + legendFormat: '%(legend)sAverage' % legend, + refId: 'C_classic', + }, + { + expr: $.showNativeHistogramQuery($.ncHistogramAverageRate(metricStr, selectorStr, multiplier=multiplier, from_recording=true)), + format: 'time_series', + legendFormat: '%(legend)sAverage' % legend, + refId: 'C_native', + }, + ], + }, + + toPrometheusSelectorNaked(selector):: local pairs = [ '%(label)s%(op)s"%(value)s"' % matcher for matcher in std.filter(function(matcher) matcher.op != 'nop', selector) ]; - '{%s}' % std.join(', ', pairs), + '%s' % std.join(', ', pairs), + + toPrometheusSelector(selector):: '{%s}' % $.toPrometheusSelectorNaked(selector), // withRunbookURL - Add/Override the runbook_url annotations for all alerts inside a list of rule groups. // - url_format: an URL format for the runbook, the alert name will be substituted in the URL. // - groups: the list of rule groups containing alerts. - withRunbookURL(url_format, groups):: + // - annotation_key: the key to use for the annotation whose value will be the formatted runbook URL. + withRunbookURL(url_format, groups, annotation_key='runbook_url'):: local update_rule(rule) = if std.objectHas(rule, 'alert') then rule { annotations+: { - runbook_url: url_format % rule.alert, + [annotation_key]: url_format % rule.alert, }, } else rule; @@ -165,4 +411,15 @@ local g = import 'grafana-builder/grafana.libsonnet'; groups: std.map(overrideInGroup, super.groups), }, }, + + removeAlerts(alerts):: { + local removeRule(rule) = + if 'alert' in rule && std.objectHas(alerts, rule.alert) + then {} + else rule, + local removeInGroup(group) = group { rules: std.map(removeRule, super.rules) }, + prometheusAlerts+:: { + groups: std.prune(std.map(removeInGroup, super.groups)), + }, + }, }
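
A brief, hedged usage sketch of the new native/classic histogram helpers, not part of the patch above: it assumes a dashboard template variable named latency_metrics with values 1 (classic) and -1 (native), as described in the helper comments; the tempo_request_duration_seconds metric, the tempo/querier job, and the cluster/namespace/job matchers are illustrative values, not taken from the diff. The expected query strings in the comments follow directly from the ncHistogramQuantile and showClassicHistogramQuery/showNativeHistogramQuery definitions and the test cases.

local utils = import 'github.com/grafana/jsonnet-libs/mixin-utils/utils.libsonnet';

{
  // Both targets are always present on the panel; the latency_metrics variable
  // decides which one returns data: `q < ($latency_metrics * +Inf)` always
  // matches when the variable is 1 and never when it is -1, and vice versa
  // for `q < ($latency_metrics * -Inf)`.
  classicQuantileTarget::
    utils.showClassicHistogramQuery(
      utils.ncHistogramQuantile('0.99', 'tempo_request_duration_seconds', 'job="tempo/querier"')
    ),
  // => 'histogram_quantile(0.99, sum by (le) (rate(tempo_request_duration_seconds_bucket{job="tempo/querier"}[$__rate_interval]))) < ($latency_metrics * +Inf)'

  nativeQuantileTarget::
    utils.showNativeHistogramQuery(
      utils.ncHistogramQuantile('0.99', 'tempo_request_duration_seconds', 'job="tempo/querier"')
    ),
  // => 'histogram_quantile(0.99, sum (rate(tempo_request_duration_seconds{job="tempo/querier"}[$__rate_interval]))) < ($latency_metrics * -Inf)'

  // A latency panel driven by recording rules: the matchers use the
  // { label, op, value } shape consumed by toPrometheusSelectorNaked, and the
  // panel emits paired *_classic / *_native targets as defined in
  // latencyRecordingRulePanelNativeHistogram.
  querierLatencyPanel::
    utils.latencyRecordingRulePanelNativeHistogram(
      'tempo_request_duration_seconds',
      [
        { label: 'cluster', op: '=~', value: '$cluster' },
        { label: 'namespace', op: '=~', value: '$namespace' },
        { label: 'job', op: '=~', value: '($namespace)/querier' },
      ],
    ),
}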