diff --git a/CHANGELOG.md b/CHANGELOG.md
index a61ebcf7..e3bca8af 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -23,6 +23,7 @@
 * [CHANGE] Increased `CortexIngesterReachingSeriesLimit` critical alert threshold from 80% to 85%. #363
 * [CHANGE] Decreased `-server.grpc-max-concurrent-streams` from 100k to 10k. #369
 * [CHANGE] Decreased blocks storage ingesters graceful termination period from 80m to 20m. #369
+* [ENHANCEMENT] Writes dashboard: fix HA-tracker KV panels; add elections panel and ingester state panel. #371
 * [ENHANCEMENT] cortex-mixin: Make `cluster_namespace_deployment:kube_pod_container_resource_requests_{cpu_cores,memory_bytes}:sum` backwards compatible with `kube-state-metrics` v2.0.0. #317
 * [ENHANCEMENT] Cortex-mixin: Include `cortex-gw-internal` naming variation in default `gateway` job names. #328
 * [ENHANCEMENT] Ruler dashboard: added object storage metrics. #354
diff --git a/cortex-mixin/dashboards/writes.libsonnet b/cortex-mixin/dashboards/writes.libsonnet
index e99faee4..bf383ab2 100644
--- a/cortex-mixin/dashboards/writes.libsonnet
+++ b/cortex-mixin/dashboards/writes.libsonnet
@@ -104,11 +104,22 @@ local utils = import 'mixin-utils/utils.libsonnet';
       $.row('Key-value store for high-availability (HA) deduplication')
       .addPanel(
         $.panel('Requests / sec') +
-        $.qpsPanel('cortex_kv_request_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.distributor))
+        $.qpsPanel('cortex_kv_request_duration_seconds_count{%s,kv_name="distributor-hatracker"}' % $.jobMatcher($._config.job_names.distributor))
       )
       .addPanel(
         $.panel('Latency') +
-        utils.latencyRecordingRulePanel('cortex_kv_request_duration_seconds', $.jobSelector($._config.job_names.distributor))
+        utils.latencyRecordingRulePanel('cortex_kv_request_duration_seconds', $.jobSelector($._config.job_names.distributor) + [utils.selector.eq('kv_name', 'distributor-hatracker')])
+      )
+      .addPanel(
+        $.panel('Elected replica changes / min') +
+        $.queryPanel([
+          'max by(exported_cluster, user)(increase(cortex_ha_tracker_elected_replica_changes_total{%s}[1m])) >0' % $.jobMatcher($._config.job_names.distributor),
+        ], [
+          '{{user}}/{{exported_cluster}}',
+        ]) +
+        $.stack + {
+          yaxes: $.yaxes('cpm'),
+        },
       )
     )
     .addRow(
@@ -133,11 +144,20 @@ local utils = import 'mixin-utils/utils.libsonnet';
       $.row('Key-value store for the ingesters ring')
       .addPanel(
         $.panel('Requests / sec') +
-        $.qpsPanel('cortex_kv_request_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.ingester))
+        $.qpsPanel('cortex_kv_request_duration_seconds_count{%s,kv_name="ingester-lifecycler"}' % $.jobMatcher($._config.job_names.ingester))
       )
       .addPanel(
         $.panel('Latency') +
-        utils.latencyRecordingRulePanel('cortex_kv_request_duration_seconds', $.jobSelector($._config.job_names.ingester))
+        utils.latencyRecordingRulePanel('cortex_kv_request_duration_seconds', $.jobSelector($._config.job_names.ingester) + [utils.selector.eq('kv_name', 'ingester-lifecycler')])
+      )
+      .addPanel(
+        $.panel('Ingester status') +
+        $.queryPanel([
+          // Queried on the distributor jobs; name="ingester" keeps any other ring those jobs may export out of this panel.
+          'max by (state)(cortex_ring_members{%s,name="ingester"}) >0' % $.jobMatcher($._config.job_names.distributor),
+        ], [
+          '{{state}}',
+        ])
       )
     )
     .addRowIf(
diff --git a/cortex-mixin/recording_rules.libsonnet b/cortex-mixin/recording_rules.libsonnet
index 433fa8e6..041a099b 100644
--- a/cortex-mixin/recording_rules.libsonnet
+++ b/cortex-mixin/recording_rules.libsonnet
@@ -42,7 +42,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
           utils.histogramRules('cortex_chunk_store_chunks_per_query', ['cluster', 'job']) +
           utils.histogramRules('cortex_database_request_duration_seconds', ['cluster', 'job', 'method']) +
           utils.histogramRules('cortex_gcs_request_duration_seconds', ['cluster', 'job', 'operation']) +
-          utils.histogramRules('cortex_kv_request_duration_seconds', ['cluster', 'job']),
+          utils.histogramRules('cortex_kv_request_duration_seconds', ['cluster', 'job', 'kv_name']),
       },
       {
         name: 'cortex_queries',