diff --git a/CHANGELOG.md b/CHANGELOG.md index acad43f268..d1f6fa99df 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -32,6 +32,7 @@ We use *breaking :warning:* to mark changes that are not backward compatible (re - [#6212](https://github.com/thanos-io/thanos/pull/6212) Query-Frontend: Disable scalar for vertical sharding. - [#6107](https://github.com/thanos-io/thanos/pull/6082) Change default user id in container image from 0(root) to 1001 - [#6228](https://github.com/thanos-io/thanos/pull/6228) Conditionally generate debug messages in ProxyStore to avoid memory bloat. +- [#6231](https://github.com/thanos-io/thanos/pull/6231) mixins: Add code/grpc-code dimension to error widgets. ### Removed diff --git a/examples/dashboards/overview.json b/examples/dashboards/overview.json index 2d274bac3c..29bd665535 100644 --- a/examples/dashboards/overview.json +++ b/examples/dashboards/overview.json @@ -161,10 +161,9 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (job) (rate(http_requests_total{handler=\"query\",code=~\"5..\"}[$interval])) / sum by (job) (rate(http_requests_total{handler=\"query\"}[$interval]))", + "expr": "sum by (job, code) (rate(http_requests_total{handler=\"query\",code=~\"5..\"}[$interval])) / ignoring (code) group_left() sum by (job) (rate(http_requests_total{handler=\"query\"}[$interval]))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "error", "step": 10 } ], @@ -466,10 +465,9 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (job) (rate(http_requests_total{handler=\"query_range\",code=~\"5..\"}[$interval])) / sum by (job) (rate(http_requests_total{handler=\"query_range\"}[$interval]))", + "expr": "sum by (job, code) (rate(http_requests_total{handler=\"query_range\",code=~\"5..\"}[$interval])) / ignoring (code) group_left() sum by (job) (rate(http_requests_total{handler=\"query_range\"}[$interval]))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "error", "step": 10 } ], @@ -823,10 +821,9 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (job) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",grpc_type=\"unary\"}[$interval])) / sum by (job) (rate(grpc_server_handled_total{grpc_type=\"unary\"}[$interval]))", + "expr": "sum by (job, grpc_code) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",grpc_type=\"unary\"}[$interval])) / ignoring (grpc_code) group_left() sum by (job) (rate(grpc_server_handled_total{grpc_type=\"unary\"}[$interval]))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "error", "step": 10 } ], @@ -1180,10 +1177,9 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (job) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",grpc_type=\"unary\"}[$interval])) / sum by (job) (rate(grpc_server_handled_total{grpc_type=\"unary\"}[$interval]))", + "expr": "sum by (job, grpc_code) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",grpc_type=\"unary\"}[$interval])) / ignoring (grpc_code) group_left() sum by (job) (rate(grpc_server_handled_total{grpc_type=\"unary\"}[$interval]))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "error", "step": 10 } ], @@ -1485,10 +1481,9 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (job) (rate(http_requests_total{handler=\"receive\",code=~\"5..\"}[$interval])) / sum by (job) (rate(http_requests_total{handler=\"receive\"}[$interval]))", + "expr": "sum by (job, code) (rate(http_requests_total{handler=\"receive\",code=~\"5..\"}[$interval])) / ignoring (code) group_left() sum by (job) (rate(http_requests_total{handler=\"receive\"}[$interval]))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "error", "step": 10 } ], diff --git a/examples/dashboards/query-frontend.json b/examples/dashboards/query-frontend.json index ee46320359..0502d378b9 100644 --- a/examples/dashboards/query-frontend.json +++ b/examples/dashboards/query-frontend.json @@ -242,10 +242,9 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (job) (rate(http_requests_total{job=~\"$job\", handler=\"query-frontend\",code=~\"5..\"}[$interval])) / sum by (job) (rate(http_requests_total{job=~\"$job\", handler=\"query-frontend\"}[$interval]))", + "expr": "sum by (job, code) (rate(http_requests_total{job=~\"$job\", handler=\"query-frontend\",code=~\"5..\"}[$interval])) / ignoring (code) group_left() sum by (job) (rate(http_requests_total{job=~\"$job\", handler=\"query-frontend\"}[$interval]))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "error", "step": 10 } ], diff --git a/examples/dashboards/query.json b/examples/dashboards/query.json index f25474aed0..8a5b27de51 100644 --- a/examples/dashboards/query.json +++ b/examples/dashboards/query.json @@ -145,10 +145,9 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (job) (rate(http_requests_total{job=~\"$job\", handler=\"query\",code=~\"5..\"}[$interval])) / sum by (job) (rate(http_requests_total{job=~\"$job\", handler=\"query\"}[$interval]))", + "expr": "sum by (job, code) (rate(http_requests_total{job=~\"$job\", handler=\"query\",code=~\"5..\"}[$interval])) / ignoring (code) group_left() sum by (job) (rate(http_requests_total{job=~\"$job\", handler=\"query\"}[$interval]))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "error", "step": 10 } ], @@ -450,10 +449,9 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (job) (rate(http_requests_total{job=~\"$job\", handler=\"query_range\",code=~\"5..\"}[$interval])) / sum by (job) (rate(http_requests_total{job=~\"$job\", handler=\"query_range\"}[$interval]))", + "expr": "sum by (job, code) (rate(http_requests_total{job=~\"$job\", handler=\"query_range\",code=~\"5..\"}[$interval])) / ignoring (code) group_left() sum by (job) (rate(http_requests_total{job=~\"$job\", handler=\"query_range\"}[$interval]))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "error", "step": 10 } ], @@ -807,10 +805,9 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (job) (rate(grpc_client_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=~\"$job\", grpc_type=\"unary\"}[$interval])) / sum by (job) (rate(grpc_client_handled_total{job=~\"$job\", grpc_type=\"unary\"}[$interval]))", + "expr": "sum by (job, grpc_code) (rate(grpc_client_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=~\"$job\", grpc_type=\"unary\"}[$interval])) / ignoring (grpc_code) group_left() sum by (job) (rate(grpc_client_handled_total{job=~\"$job\", grpc_type=\"unary\"}[$interval]))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "error", "step": 10 } ], @@ -1164,10 +1161,9 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (job) (rate(grpc_client_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=~\"$job\", grpc_type=\"server_stream\"}[$interval])) / sum by (job) (rate(grpc_client_handled_total{job=~\"$job\", grpc_type=\"server_stream\"}[$interval]))", + "expr": "sum by (job, grpc_code) (rate(grpc_client_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=~\"$job\", grpc_type=\"server_stream\"}[$interval])) / ignoring (grpc_code) group_left() sum by (job) (rate(grpc_client_handled_total{job=~\"$job\", grpc_type=\"server_stream\"}[$interval]))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "error", "step": 10 } ], diff --git a/examples/dashboards/receive.json b/examples/dashboards/receive.json index 1f37c9d563..39246cb054 100644 --- a/examples/dashboards/receive.json +++ b/examples/dashboards/receive.json @@ -145,10 +145,9 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (job) (rate(http_requests_total{job=~\"$job\", handler=\"receive\",code=~\"5..\"}[$interval])) / sum by (job) (rate(http_requests_total{job=~\"$job\", handler=\"receive\"}[$interval]))", + "expr": "sum by (job, code) (rate(http_requests_total{job=~\"$job\", handler=\"receive\",code=~\"5..\"}[$interval])) / ignoring (code) group_left() sum by (job) (rate(http_requests_total{job=~\"$job\", handler=\"receive\"}[$interval]))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "error", "step": 10 } ], @@ -1632,10 +1631,9 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (job) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=~\"$job\", grpc_type=\"unary\", grpc_method=\"RemoteWrite\"}[$interval])) / sum by (job) (rate(grpc_server_handled_total{job=~\"$job\", grpc_type=\"unary\", grpc_method=\"RemoteWrite\"}[$interval]))", + "expr": "sum by (job, grpc_code) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=~\"$job\", grpc_type=\"unary\", grpc_method=\"RemoteWrite\"}[$interval])) / ignoring (grpc_code) group_left() sum by (job) (rate(grpc_server_handled_total{job=~\"$job\", grpc_type=\"unary\", grpc_method=\"RemoteWrite\"}[$interval]))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "error", "step": 10 } ], @@ -1989,10 +1987,9 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (job) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=~\"$job\", grpc_type=\"unary\", grpc_method!=\"RemoteWrite\"}[$interval])) / sum by (job) (rate(grpc_server_handled_total{job=~\"$job\", grpc_type=\"unary\", grpc_method!=\"RemoteWrite\"}[$interval]))", + "expr": "sum by (job, grpc_code) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=~\"$job\", grpc_type=\"unary\", grpc_method!=\"RemoteWrite\"}[$interval])) / ignoring (grpc_code) group_left() sum by (job) (rate(grpc_server_handled_total{job=~\"$job\", grpc_type=\"unary\", grpc_method!=\"RemoteWrite\"}[$interval]))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "error", "step": 10 } ], @@ -2346,10 +2343,9 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (job) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=~\"$job\", grpc_type=\"server_stream\"}[$interval])) / sum by (job) (rate(grpc_server_handled_total{job=~\"$job\", grpc_type=\"server_stream\"}[$interval]))", + "expr": "sum by (job, grpc_code) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=~\"$job\", grpc_type=\"server_stream\"}[$interval])) / ignoring (grpc_code) group_left() sum by (job) (rate(grpc_server_handled_total{job=~\"$job\", grpc_type=\"server_stream\"}[$interval]))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "error", "step": 10 } ], diff --git a/examples/dashboards/rule.json b/examples/dashboards/rule.json index 569a5aaeec..cb1250f7cc 100644 --- a/examples/dashboards/rule.json +++ b/examples/dashboards/rule.json @@ -966,10 +966,9 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (job) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=~\"$job\", grpc_type=\"unary\"}[$interval])) / sum by (job) (rate(grpc_server_handled_total{job=~\"$job\", grpc_type=\"unary\"}[$interval]))", + "expr": "sum by (job, grpc_code) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=~\"$job\", grpc_type=\"unary\"}[$interval])) / ignoring (grpc_code) group_left() sum by (job) (rate(grpc_server_handled_total{job=~\"$job\", grpc_type=\"unary\"}[$interval]))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "error", "step": 10 } ], @@ -1323,10 +1322,9 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (job) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=~\"$job\", grpc_type=\"server_stream\"}[$interval])) / sum by (job) (rate(grpc_server_handled_total{job=~\"$job\", grpc_type=\"server_stream\"}[$interval]))", + "expr": "sum by (job, grpc_code) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=~\"$job\", grpc_type=\"server_stream\"}[$interval])) / ignoring (grpc_code) group_left() sum by (job) (rate(grpc_server_handled_total{job=~\"$job\", grpc_type=\"server_stream\"}[$interval]))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "error", "step": 10 } ], diff --git a/examples/dashboards/sidecar.json b/examples/dashboards/sidecar.json index 26f3f5db05..0ab12060a1 100644 --- a/examples/dashboards/sidecar.json +++ b/examples/dashboards/sidecar.json @@ -197,10 +197,9 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (job) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=~\"$job\", grpc_type=\"unary\"}[$interval])) / sum by (job) (rate(grpc_server_handled_total{job=~\"$job\", grpc_type=\"unary\"}[$interval]))", + "expr": "sum by (job, grpc_code) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=~\"$job\", grpc_type=\"unary\"}[$interval])) / ignoring (grpc_code) group_left() sum by (job) (rate(grpc_server_handled_total{job=~\"$job\", grpc_type=\"unary\"}[$interval]))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "error", "step": 10 } ], @@ -553,10 +552,9 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (job) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=~\"$job\", grpc_type=\"server_stream\"}[$interval])) / sum by (job) (rate(grpc_server_handled_total{job=~\"$job\", grpc_type=\"server_stream\"}[$interval]))", + "expr": "sum by (job, grpc_code) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=~\"$job\", grpc_type=\"server_stream\"}[$interval])) / ignoring (grpc_code) group_left() sum by (job) (rate(grpc_server_handled_total{job=~\"$job\", grpc_type=\"server_stream\"}[$interval]))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "error", "step": 10 } ], diff --git a/examples/dashboards/store.json b/examples/dashboards/store.json index 7f3b614cfb..031c6b58c1 100644 --- a/examples/dashboards/store.json +++ b/examples/dashboards/store.json @@ -197,10 +197,9 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (job) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=~\"$job\", grpc_type=\"unary\"}[$interval])) / sum by (job) (rate(grpc_server_handled_total{job=~\"$job\", grpc_type=\"unary\"}[$interval]))", + "expr": "sum by (job, grpc_code) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=~\"$job\", grpc_type=\"unary\"}[$interval])) / ignoring (grpc_code) group_left() sum by (job) (rate(grpc_server_handled_total{job=~\"$job\", grpc_type=\"unary\"}[$interval]))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "error", "step": 10 } ], @@ -554,10 +553,9 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (job) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=~\"$job\", grpc_type=\"server_stream\"}[$interval])) / sum by (job) (rate(grpc_server_handled_total{job=~\"$job\", grpc_type=\"server_stream\"}[$interval]))", + "expr": "sum by (job, grpc_code) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss\",job=~\"$job\", grpc_type=\"server_stream\"}[$interval])) / ignoring (grpc_code) group_left() sum by (job) (rate(grpc_server_handled_total{job=~\"$job\", grpc_type=\"server_stream\"}[$interval]))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "error", "step": 10 } ], diff --git a/mixin/lib/thanos-grafana-builder/builder.libsonnet b/mixin/lib/thanos-grafana-builder/builder.libsonnet index ecbc61e69c..39cedb404d 100644 --- a/mixin/lib/thanos-grafana-builder/builder.libsonnet +++ b/mixin/lib/thanos-grafana-builder/builder.libsonnet @@ -78,6 +78,24 @@ local utils = import '../utils.libsonnet'; ], }, + qpsErrTotalPerLabelPanel(selectorErr, selectorTotal, dimensions, perLabel):: { + local errExpr = 'sum by (%s, %s) (rate(%s[$interval]))' % [dimensions, perLabel, selectorErr], + local totalExpr = 'sum by (%s) (rate(%s[$interval]))' % [dimensions, selectorTotal], + + aliasColors: { + 'error': '#E24D42', + }, + targets: [ + { + expr: '%s / ignoring (%s) group_left() %s' % [errExpr, perLabel, totalExpr], + format: 'time_series', + intervalFactor: 2, + step: 10, + }, + ], + yaxes: $.yaxes({ format: 'percentunit' }), + } + $.stack, + qpsErrTotalPanel(selectorErr, selectorTotal, dimensions):: { local expr(selector) = 'sum by (%s) (rate(%s[$interval]))' % [dimensions, selector], diff --git a/mixin/lib/thanos-grafana-builder/grpc.libsonnet b/mixin/lib/thanos-grafana-builder/grpc.libsonnet index 6c58ad262a..62d1ae0d41 100644 --- a/mixin/lib/thanos-grafana-builder/grpc.libsonnet +++ b/mixin/lib/thanos-grafana-builder/grpc.libsonnet @@ -37,9 +37,10 @@ local utils = import '../utils.libsonnet'; } + $.stack, grpcErrorsPanel(metric, selector, dimensions):: - $.qpsErrTotalPanel( + $.qpsErrTotalPerLabelPanel( '%s{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss",%s}' % [metric, selector], '%s{%s}' % [metric, selector], - dimensions + dimensions, + 'grpc_code', ), } diff --git a/mixin/lib/thanos-grafana-builder/http.libsonnet b/mixin/lib/thanos-grafana-builder/http.libsonnet index e21caf941d..f3554411f8 100644 --- a/mixin/lib/thanos-grafana-builder/http.libsonnet +++ b/mixin/lib/thanos-grafana-builder/http.libsonnet @@ -25,9 +25,10 @@ local utils = import '../utils.libsonnet'; } + $.stack, httpErrPanel(metric, selector, dimensions):: - $.qpsErrTotalPanel( + $.qpsErrTotalPerLabelPanel( '%s{%s,code=~"5.."}' % [metric, selector], '%s{%s}' % [metric, selector], - dimensions + dimensions, + 'code', ), }