From 798f40b5e7f7bffb5c9713ad72bc70ce2af7e411 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Krupa?= Date: Sun, 16 Aug 2020 08:56:14 +0200 Subject: [PATCH 1/6] mixin: Adhere to monitoring mixins annotation guidelines (#3035) * mixin: Adhere to monitoring mixins annotation guidelines * replaced `message` annotation field with `description` * added simple `summary` field Signed-off-by: paulfantom * examples/alerts: regenerate & adjust tests Signed-off-by: paulfantom --- examples/alerts/alerts.md | 174 +++++++++++++++--------- examples/alerts/alerts.yaml | 172 ++++++++++++++--------- examples/alerts/tests.yaml | 18 ++- mixin/alerts/absent.libsonnet | 3 +- mixin/alerts/bucket_replicate.libsonnet | 9 +- mixin/alerts/compact.libsonnet | 15 +- mixin/alerts/query.libsonnet | 21 ++- mixin/alerts/receive.libsonnet | 21 ++- mixin/alerts/rule.libsonnet | 33 +++-- mixin/alerts/sidecar.libsonnet | 6 +- mixin/alerts/store.libsonnet | 12 +- 11 files changed, 313 insertions(+), 171 deletions(-) diff --git a/examples/alerts/alerts.md b/examples/alerts/alerts.md index 89e567a07dd..dc83d3d35d6 100644 --- a/examples/alerts/alerts.md +++ b/examples/alerts/alerts.md @@ -10,23 +10,26 @@ name: thanos-compact.rules rules: - alert: ThanosCompactMultipleRunning annotations: - message: No more than one Thanos Compact instance should be running at once. There - are {{ $value }} + description: No more than one Thanos Compact instance should be running at once. + There are {{ $value }} + summary: Thanos Compact has multiple instances running. expr: sum(up{job=~"thanos-compact.*"}) > 1 for: 5m labels: severity: warning - alert: ThanosCompactHalted annotations: - message: Thanos Compact {{$labels.job}} has failed to run and now is halted. + description: Thanos Compact {{$labels.job}} has failed to run and now is halted. + summary: Thanos Compact has failed to run ans is now halted. expr: thanos_compactor_halted{job=~"thanos-compact.*"} == 1 for: 5m labels: severity: warning - alert: ThanosCompactHighCompactionFailures annotations: - message: Thanos Compact {{$labels.job}} is failing to execute {{ $value | humanize - }}% of compactions. + description: Thanos Compact {{$labels.job}} is failing to execute {{ $value | + humanize }}% of compactions. + summary: Thanos Compact is failing to execute compactions. expr: | ( sum by (job) (rate(thanos_compact_group_compactions_failures_total{job=~"thanos-compact.*"}[5m])) @@ -39,8 +42,9 @@ rules: severity: warning - alert: ThanosCompactBucketHighOperationFailures annotations: - message: Thanos Compact {{$labels.job}} Bucket is failing to execute {{ $value + description: Thanos Compact {{$labels.job}} Bucket is failing to execute {{ $value | humanize }}% of operations. + summary: Thanos Compact Bucket is having a high number of operation failures. expr: | ( sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~"thanos-compact.*"}[5m])) @@ -53,7 +57,8 @@ rules: severity: warning - alert: ThanosCompactHasNotRun annotations: - message: Thanos Compact {{$labels.job}} has not uploaded anything for 24 hours. + description: Thanos Compact {{$labels.job}} has not uploaded anything for 24 hours. + summary: Thanos Compact has not uploaded anything for last 24 hours. 
expr: (time() - max(max_over_time(thanos_objstore_bucket_last_successful_upload_time{job=~"thanos-compact.*"}[24h]))) / 60 / 60 > 24 labels: @@ -70,7 +75,8 @@ name: thanos-rule.rules rules: - alert: ThanosRuleQueueIsDroppingAlerts annotations: - message: Thanos Rule {{$labels.job}} {{$labels.pod}} is failing to queue alerts. + description: Thanos Rule {{$labels.job}} {{$labels.pod}} is failing to queue alerts. + summary: Thanos Rule is failing to queue alerts. expr: | sum by (job) (rate(thanos_alert_queue_alerts_dropped_total{job=~"thanos-rule.*"}[5m])) > 0 for: 5m @@ -78,8 +84,9 @@ rules: severity: critical - alert: ThanosRuleSenderIsFailingAlerts annotations: - message: Thanos Rule {{$labels.job}} {{$labels.pod}} is failing to send alerts + description: Thanos Rule {{$labels.job}} {{$labels.pod}} is failing to send alerts to alertmanager. + summary: Thanos Rule is failing to send alerts to alertmanager. expr: | sum by (job) (rate(thanos_alert_sender_alerts_dropped_total{job=~"thanos-rule.*"}[5m])) > 0 for: 5m @@ -87,7 +94,9 @@ rules: severity: critical - alert: ThanosRuleHighRuleEvaluationFailures annotations: - message: Thanos Rule {{$labels.job}} {{$labels.pod}} is failing to evaluate rules. + description: Thanos Rule {{$labels.job}} {{$labels.pod}} is failing to evaluate + rules. + summary: Thanos Rule is failing to evaluate rules. expr: | ( sum by (job) (rate(prometheus_rule_evaluation_failures_total{job=~"thanos-rule.*"}[5m])) @@ -100,8 +109,9 @@ rules: severity: critical - alert: ThanosRuleHighRuleEvaluationWarnings annotations: - message: Thanos Rule {{$labels.job}} {{$labels.pod}} has high number of evaluation + description: Thanos Rule {{$labels.job}} {{$labels.pod}} has high number of evaluation warnings. + summary: Thanos Rule has high number of evaluation warnings. expr: | sum by (job) (rate(thanos_rule_evaluation_with_warnings_total{job=~"thanos-rule.*"}[5m])) > 0 for: 15m @@ -109,8 +119,9 @@ rules: severity: info - alert: ThanosRuleRuleEvaluationLatencyHigh annotations: - message: Thanos Rule {{$labels.job}}/{{$labels.pod}} has higher evaluation latency - than interval for {{$labels.rule_group}}. + description: Thanos Rule {{$labels.job}}/{{$labels.pod}} has higher evaluation + latency than interval for {{$labels.rule_group}}. + summary: Thanos Rule has high rule evaluation latency. expr: | ( sum by (job, pod, rule_group) (prometheus_rule_group_last_duration_seconds{job=~"thanos-rule.*"}) @@ -122,8 +133,9 @@ rules: severity: warning - alert: ThanosRuleGrpcErrorRate annotations: - message: Thanos Rule {{$labels.job}} is failing to handle {{ $value | humanize + description: Thanos Rule {{$labels.job}} is failing to handle {{ $value | humanize }}% of requests. + summary: Thanos Rule is failing to handle grpc requests. expr: | ( sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-rule.*"}[5m])) @@ -136,7 +148,8 @@ rules: severity: warning - alert: ThanosRuleConfigReloadFailure annotations: - message: Thanos Rule {{$labels.job}} has not been able to reload its configuration. + description: Thanos Rule {{$labels.job}} has not been able to reload its configuration. + summary: Thanos Rule has not been able to reload configuration. 
expr: avg(thanos_rule_config_last_reload_successful{job=~"thanos-rule.*"}) by (job) != 1 for: 5m @@ -144,8 +157,9 @@ rules: severity: info - alert: ThanosRuleQueryHighDNSFailures annotations: - message: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% of failing DNS - queries for query endpoints. + description: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% of failing + DNS queries for query endpoints. + summary: Thanos Rule is having high number of DNS failures. expr: | ( sum by (job) (rate(thanos_ruler_query_apis_dns_failures_total{job=~"thanos-rule.*"}[5m])) @@ -158,8 +172,9 @@ rules: severity: warning - alert: ThanosRuleAlertmanagerHighDNSFailures annotations: - message: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% of failing DNS - queries for Alertmanager endpoints. + description: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% of failing + DNS queries for Alertmanager endpoints. + summary: Thanos Rule is having high number of DNS failures. expr: | ( sum by (job) (rate(thanos_ruler_alertmanagers_dns_failures_total{job=~"thanos-rule.*"}[5m])) @@ -172,8 +187,9 @@ rules: severity: warning - alert: ThanosRuleNoEvaluationFor10Intervals annotations: - message: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% rule groups + description: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% rule groups that did not evaluate for at least 10x of their expected interval. + summary: Thanos Rule has rule groups that did not evaluate for 10 intervals. expr: | time() - max by (job, group) (prometheus_rule_group_last_evaluation_timestamp_seconds{job=~"thanos-rule.*"}) > @@ -183,8 +199,9 @@ rules: severity: info - alert: ThanosNoRuleEvaluations annotations: - message: Thanos Rule {{$labels.job}} did not perform any rule evaluations in the - past 2 minutes. + description: Thanos Rule {{$labels.job}} did not perform any rule evaluations + in the past 2 minutes. + summary: Thanos Rule did not perform any rule evaluations. expr: | sum(rate(prometheus_rule_evaluations_total{job=~"thanos-rule.*"}[2m])) <= 0 and @@ -202,8 +219,9 @@ name: thanos-store.rules rules: - alert: ThanosStoreGrpcErrorRate annotations: - message: Thanos Store {{$labels.job}} is failing to handle {{ $value | humanize + description: Thanos Store {{$labels.job}} is failing to handle {{ $value | humanize }}% of requests. + summary: Thanos Store is failing to handle qrpcd requests. expr: | ( sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-store.*"}[5m])) @@ -216,8 +234,9 @@ rules: severity: warning - alert: ThanosStoreSeriesGateLatencyHigh annotations: - message: Thanos Store {{$labels.job}} has a 99th percentile latency of {{ $value - }} seconds for store series gate requests. + description: Thanos Store {{$labels.job}} has a 99th percentile latency of {{ + $value }} seconds for store series gate requests. + summary: Thanos Store has high latency for store series gate requests. expr: | ( histogram_quantile(0.9, sum by (job, le) (rate(thanos_bucket_store_series_gate_duration_seconds_bucket{job=~"thanos-store.*"}[5m]))) > 2 @@ -229,8 +248,9 @@ rules: severity: warning - alert: ThanosStoreBucketHighOperationFailures annotations: - message: Thanos Store {{$labels.job}} Bucket is failing to execute {{ $value | - humanize }}% of operations. + description: Thanos Store {{$labels.job}} Bucket is failing to execute {{ $value + | humanize }}% of operations. 
+ summary: Thanos Store Bucket is failing to execute operations. expr: | ( sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~"thanos-store.*"}[5m])) @@ -243,8 +263,9 @@ rules: severity: warning - alert: ThanosStoreObjstoreOperationLatencyHigh annotations: - message: Thanos Store {{$labels.job}} Bucket has a 99th percentile latency of - {{ $value }} seconds for the bucket operations. + description: Thanos Store {{$labels.job}} Bucket has a 99th percentile latency + of {{ $value }} seconds for the bucket operations. + summary: Thanos Store is having high latency for bucket operations. expr: | ( histogram_quantile(0.9, sum by (job, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=~"thanos-store.*"}[5m]))) > 2 @@ -264,7 +285,9 @@ name: thanos-sidecar.rules rules: - alert: ThanosSidecarPrometheusDown annotations: - message: Thanos Sidecar {{$labels.job}} {{$labels.pod}} cannot connect to Prometheus. + description: Thanos Sidecar {{$labels.job}} {{$labels.pod}} cannot connect to + Prometheus. + summary: Thanos Sidecar cannot connect to Prometheus expr: | sum by (job, pod) (thanos_sidecar_prometheus_up{job=~"thanos-sidecar.*"} == 0) for: 5m @@ -272,8 +295,9 @@ rules: severity: critical - alert: ThanosSidecarUnhealthy annotations: - message: Thanos Sidecar {{$labels.job}} {{$labels.pod}} is unhealthy for {{ $value - }} seconds. + description: Thanos Sidecar {{$labels.job}} {{$labels.pod}} is unhealthy for {{ + $value }} seconds. + summary: Thanos Sidecar is unhealthy. expr: | time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job=~"thanos-sidecar.*"}) by (job, pod) >= 600 labels: @@ -288,8 +312,9 @@ name: thanos-query.rules rules: - alert: ThanosQueryHttpRequestQueryErrorRateHigh annotations: - message: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize + description: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize }}% of "query" requests. + summary: Thanos Query is failing to handle requests. expr: | ( sum(rate(http_requests_total{code=~"5..", job=~"thanos-query.*", handler="query"}[5m])) @@ -301,8 +326,9 @@ rules: severity: critical - alert: ThanosQueryHttpRequestQueryRangeErrorRateHigh annotations: - message: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize + description: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize }}% of "query_range" requests. + summary: Thanos Query is failing to handle requests. expr: | ( sum(rate(http_requests_total{code=~"5..", job=~"thanos-query.*", handler="query_range"}[5m])) @@ -314,8 +340,9 @@ rules: severity: critical - alert: ThanosQueryGrpcServerErrorRate annotations: - message: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize + description: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize }}% of requests. + summary: Thanos Query is failing to handle requests. expr: | ( sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-query.*"}[5m])) @@ -328,8 +355,9 @@ rules: severity: warning - alert: ThanosQueryGrpcClientErrorRate annotations: - message: Thanos Query {{$labels.job}} is failing to send {{ $value | humanize + description: Thanos Query {{$labels.job}} is failing to send {{ $value | humanize }}% of requests. + summary: Thanos Query is failing to send requests. 
expr: | ( sum by (job) (rate(grpc_client_handled_total{grpc_code!="OK", job=~"thanos-query.*"}[5m])) @@ -341,8 +369,9 @@ rules: severity: warning - alert: ThanosQueryHighDNSFailures annotations: - message: Thanos Query {{$labels.job}} have {{ $value | humanize }}% of failing + description: Thanos Query {{$labels.job}} have {{ $value | humanize }}% of failing DNS queries for store endpoints. + summary: Thanos Query is having high number of DNS failures. expr: | ( sum by (job) (rate(thanos_querier_store_apis_dns_failures_total{job=~"thanos-query.*"}[5m])) @@ -354,8 +383,9 @@ rules: severity: warning - alert: ThanosQueryInstantLatencyHigh annotations: - message: Thanos Query {{$labels.job}} has a 99th percentile latency of {{ $value - }} seconds for instant queries. + description: Thanos Query {{$labels.job}} has a 99th percentile latency of {{ + $value }} seconds for instant queries. + summary: Thanos Query has high latency for queries. expr: | ( histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~"thanos-query.*", handler="query"}[5m]))) > 40 @@ -367,8 +397,9 @@ rules: severity: critical - alert: ThanosQueryRangeLatencyHigh annotations: - message: Thanos Query {{$labels.job}} has a 99th percentile latency of {{ $value - }} seconds for range queries. + description: Thanos Query {{$labels.job}} has a 99th percentile latency of {{ + $value }} seconds for range queries. + summary: Thanos Query has high latency for queries. expr: | ( histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~"thanos-query.*", handler="query_range"}[5m]))) > 90 @@ -388,8 +419,9 @@ name: thanos-receive.rules rules: - alert: ThanosReceiveHttpRequestErrorRateHigh annotations: - message: Thanos Receive {{$labels.job}} is failing to handle {{ $value | humanize + description: Thanos Receive {{$labels.job}} is failing to handle {{ $value | humanize }}% of requests. + summary: Thanos Receive is failing to handle requests. expr: | ( sum(rate(http_requests_total{code=~"5..", job=~"thanos-receive.*", handler="receive"}[5m])) @@ -401,8 +433,9 @@ rules: severity: critical - alert: ThanosReceiveHttpRequestLatencyHigh annotations: - message: Thanos Receive {{$labels.job}} has a 99th percentile latency of {{ $value - }} seconds for requests. + description: Thanos Receive {{$labels.job}} has a 99th percentile latency of {{ + $value }} seconds for requests. + summary: Thanos Receive has high HTTP requests latency. expr: | ( histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~"thanos-receive.*", handler="receive"}[5m]))) > 10 @@ -414,8 +447,9 @@ rules: severity: critical - alert: ThanosReceiveHighReplicationFailures annotations: - message: Thanos Receive {{$labels.job}} is failing to replicate {{ $value | humanize - }}% of requests. + description: Thanos Receive {{$labels.job}} is failing to replicate {{ $value + | humanize }}% of requests. + summary: Thanos Receive is having high number of replication failures. expr: | thanos_receive_replication_factor > 1 and @@ -437,8 +471,9 @@ rules: severity: warning - alert: ThanosReceiveHighForwardRequestFailures annotations: - message: Thanos Receive {{$labels.job}} is failing to forward {{ $value | humanize - }}% of requests. + description: Thanos Receive {{$labels.job}} is failing to forward {{ $value | + humanize }}% of requests. + summary: Thanos Receive is failing to forward requests. 
expr: | ( sum by (job) (rate(thanos_receive_forward_requests_total{result="error", job=~"thanos-receive.*"}[5m])) @@ -450,8 +485,9 @@ rules: severity: warning - alert: ThanosReceiveHighHashringFileRefreshFailures annotations: - message: Thanos Receive {{$labels.job}} is failing to refresh hashring file, {{ - $value | humanize }} of attempts failed. + description: Thanos Receive {{$labels.job}} is failing to refresh hashring file, + {{ $value | humanize }} of attempts failed. + summary: Thanos Receive is failing to refresh hasring file. expr: | ( sum by (job) (rate(thanos_receive_hashrings_file_errors_total{job=~"thanos-receive.*"}[5m])) @@ -464,7 +500,9 @@ rules: severity: warning - alert: ThanosReceiveConfigReloadFailure annotations: - message: Thanos Receive {{$labels.job}} has not been able to reload hashring configurations. + description: Thanos Receive {{$labels.job}} has not been able to reload hashring + configurations. + summary: Thanos Receive has not been able to reload configuration. expr: avg(thanos_receive_config_last_reload_successful{job=~"thanos-receive.*"}) by (job) != 1 for: 5m @@ -472,8 +510,9 @@ rules: severity: warning - alert: ThanosReceiveNoUpload annotations: - message: Thanos Receive {{ $labels.instance }} of {{$labels.job}} has not uploaded - latest data to object storage. + description: Thanos Receive {{ $labels.instance }} of {{$labels.job}} has not + uploaded latest data to object storage. + summary: Thanos Receive has not uploaded latest data to object storage. expr: | (up{job=~"thanos-receive.*"} - 1) + on (instance) # filters to only alert on current instance last 3h @@ -491,7 +530,8 @@ name: thanos-bucket-replicate.rules rules: - alert: ThanosBucketReplicateIsDown annotations: - message: Thanos Replicate has disappeared from Prometheus target discovery. + description: Thanos Replicate has disappeared from Prometheus target discovery. + summary: Thanos Replicate has disappeared from Prometheus target discovery. expr: | absent(up{job=~"thanos-bucket-replicate.*"}) for: 5m @@ -499,8 +539,9 @@ rules: severity: critical - alert: ThanosBucketReplicateErrorRate annotations: - message: Thanos Replicate failing to run, {{ $value | humanize }}% of attempts + description: Thanos Replicate failing to run, {{ $value | humanize }}% of attempts failed. + summary: Thanose Replicate is failing to run. expr: | ( sum(rate(thanos_replicate_replication_runs_total{result="error", job=~"thanos-bucket-replicate.*"}[5m])) @@ -512,8 +553,9 @@ rules: severity: critical - alert: ThanosBucketReplicateRunLatency annotations: - message: Thanos Replicate {{$labels.job}} has a 99th percentile latency of {{ - $value }} seconds for the replicate operations. + description: Thanos Replicate {{$labels.job}} has a 99th percentile latency of + {{ $value }} seconds for the replicate operations. + summary: Thanos Replicate has a high latency for replicate operations. expr: | ( histogram_quantile(0.9, sum by (job, le) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~"thanos-bucket-replicate.*"}[5m]))) > 20 @@ -535,7 +577,8 @@ name: thanos-component-absent.rules rules: - alert: ThanosCompactIsDown annotations: - message: ThanosCompact has disappeared from Prometheus target discovery. + description: ThanosCompact has disappeared from Prometheus target discovery. + summary: thanos component has disappeared from Prometheus target discovery. 
expr: | absent(up{job=~"thanos-compact.*"} == 1) for: 5m @@ -543,7 +586,8 @@ rules: severity: critical - alert: ThanosQueryIsDown annotations: - message: ThanosQuery has disappeared from Prometheus target discovery. + description: ThanosQuery has disappeared from Prometheus target discovery. + summary: thanos component has disappeared from Prometheus target discovery. expr: | absent(up{job=~"thanos-query.*"} == 1) for: 5m @@ -551,7 +595,8 @@ rules: severity: critical - alert: ThanosReceiveIsDown annotations: - message: ThanosReceive has disappeared from Prometheus target discovery. + description: ThanosReceive has disappeared from Prometheus target discovery. + summary: thanos component has disappeared from Prometheus target discovery. expr: | absent(up{job=~"thanos-receive.*"} == 1) for: 5m @@ -559,7 +604,8 @@ rules: severity: critical - alert: ThanosRuleIsDown annotations: - message: ThanosRule has disappeared from Prometheus target discovery. + description: ThanosRule has disappeared from Prometheus target discovery. + summary: thanos component has disappeared from Prometheus target discovery. expr: | absent(up{job=~"thanos-rule.*"} == 1) for: 5m @@ -567,7 +613,8 @@ rules: severity: critical - alert: ThanosSidecarIsDown annotations: - message: ThanosSidecar has disappeared from Prometheus target discovery. + description: ThanosSidecar has disappeared from Prometheus target discovery. + summary: thanos component has disappeared from Prometheus target discovery. expr: | absent(up{job=~"thanos-sidecar.*"} == 1) for: 5m @@ -575,7 +622,8 @@ rules: severity: critical - alert: ThanosStoreIsDown annotations: - message: ThanosStore has disappeared from Prometheus target discovery. + description: ThanosStore has disappeared from Prometheus target discovery. + summary: thanos component has disappeared from Prometheus target discovery. expr: | absent(up{job=~"thanos-store.*"} == 1) for: 5m diff --git a/examples/alerts/alerts.yaml b/examples/alerts/alerts.yaml index ad5f75301bc..98886a3fb67 100644 --- a/examples/alerts/alerts.yaml +++ b/examples/alerts/alerts.yaml @@ -3,23 +3,26 @@ groups: rules: - alert: ThanosCompactMultipleRunning annotations: - message: No more than one Thanos Compact instance should be running at once. + description: No more than one Thanos Compact instance should be running at once. There are {{ $value }} + summary: Thanos Compact has multiple instances running. expr: sum(up{job=~"thanos-compact.*"}) > 1 for: 5m labels: severity: warning - alert: ThanosCompactHalted annotations: - message: Thanos Compact {{$labels.job}} has failed to run and now is halted. + description: Thanos Compact {{$labels.job}} has failed to run and now is halted. + summary: Thanos Compact has failed to run ans is now halted. expr: thanos_compactor_halted{job=~"thanos-compact.*"} == 1 for: 5m labels: severity: warning - alert: ThanosCompactHighCompactionFailures annotations: - message: Thanos Compact {{$labels.job}} is failing to execute {{ $value | humanize - }}% of compactions. + description: Thanos Compact {{$labels.job}} is failing to execute {{ $value + | humanize }}% of compactions. + summary: Thanos Compact is failing to execute compactions. expr: | ( sum by (job) (rate(thanos_compact_group_compactions_failures_total{job=~"thanos-compact.*"}[5m])) @@ -32,8 +35,9 @@ groups: severity: warning - alert: ThanosCompactBucketHighOperationFailures annotations: - message: Thanos Compact {{$labels.job}} Bucket is failing to execute {{ $value - | humanize }}% of operations. 
+ description: Thanos Compact {{$labels.job}} Bucket is failing to execute {{ + $value | humanize }}% of operations. + summary: Thanos Compact Bucket is having a high number of operation failures. expr: | ( sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~"thanos-compact.*"}[5m])) @@ -46,7 +50,9 @@ groups: severity: warning - alert: ThanosCompactHasNotRun annotations: - message: Thanos Compact {{$labels.job}} has not uploaded anything for 24 hours. + description: Thanos Compact {{$labels.job}} has not uploaded anything for 24 + hours. + summary: Thanos Compact has not uploaded anything for last 24 hours. expr: (time() - max(max_over_time(thanos_objstore_bucket_last_successful_upload_time{job=~"thanos-compact.*"}[24h]))) / 60 / 60 > 24 labels: @@ -55,8 +61,9 @@ groups: rules: - alert: ThanosQueryHttpRequestQueryErrorRateHigh annotations: - message: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize + description: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize }}% of "query" requests. + summary: Thanos Query is failing to handle requests. expr: | ( sum(rate(http_requests_total{code=~"5..", job=~"thanos-query.*", handler="query"}[5m])) @@ -68,8 +75,9 @@ groups: severity: critical - alert: ThanosQueryHttpRequestQueryRangeErrorRateHigh annotations: - message: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize + description: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize }}% of "query_range" requests. + summary: Thanos Query is failing to handle requests. expr: | ( sum(rate(http_requests_total{code=~"5..", job=~"thanos-query.*", handler="query_range"}[5m])) @@ -81,8 +89,9 @@ groups: severity: critical - alert: ThanosQueryGrpcServerErrorRate annotations: - message: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize + description: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize }}% of requests. + summary: Thanos Query is failing to handle requests. expr: | ( sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-query.*"}[5m])) @@ -95,8 +104,9 @@ groups: severity: warning - alert: ThanosQueryGrpcClientErrorRate annotations: - message: Thanos Query {{$labels.job}} is failing to send {{ $value | humanize + description: Thanos Query {{$labels.job}} is failing to send {{ $value | humanize }}% of requests. + summary: Thanos Query is failing to send requests. expr: | ( sum by (job) (rate(grpc_client_handled_total{grpc_code!="OK", job=~"thanos-query.*"}[5m])) @@ -108,8 +118,9 @@ groups: severity: warning - alert: ThanosQueryHighDNSFailures annotations: - message: Thanos Query {{$labels.job}} have {{ $value | humanize }}% of failing + description: Thanos Query {{$labels.job}} have {{ $value | humanize }}% of failing DNS queries for store endpoints. + summary: Thanos Query is having high number of DNS failures. expr: | ( sum by (job) (rate(thanos_querier_store_apis_dns_failures_total{job=~"thanos-query.*"}[5m])) @@ -121,8 +132,9 @@ groups: severity: warning - alert: ThanosQueryInstantLatencyHigh annotations: - message: Thanos Query {{$labels.job}} has a 99th percentile latency of {{ $value - }} seconds for instant queries. + description: Thanos Query {{$labels.job}} has a 99th percentile latency of {{ + $value }} seconds for instant queries. + summary: Thanos Query has high latency for queries. 
expr: | ( histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~"thanos-query.*", handler="query"}[5m]))) > 40 @@ -134,8 +146,9 @@ groups: severity: critical - alert: ThanosQueryRangeLatencyHigh annotations: - message: Thanos Query {{$labels.job}} has a 99th percentile latency of {{ $value - }} seconds for range queries. + description: Thanos Query {{$labels.job}} has a 99th percentile latency of {{ + $value }} seconds for range queries. + summary: Thanos Query has high latency for queries. expr: | ( histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~"thanos-query.*", handler="query_range"}[5m]))) > 90 @@ -149,8 +162,9 @@ groups: rules: - alert: ThanosReceiveHttpRequestErrorRateHigh annotations: - message: Thanos Receive {{$labels.job}} is failing to handle {{ $value | humanize - }}% of requests. + description: Thanos Receive {{$labels.job}} is failing to handle {{ $value | + humanize }}% of requests. + summary: Thanos Receive is failing to handle requests. expr: | ( sum(rate(http_requests_total{code=~"5..", job=~"thanos-receive.*", handler="receive"}[5m])) @@ -162,8 +176,9 @@ groups: severity: critical - alert: ThanosReceiveHttpRequestLatencyHigh annotations: - message: Thanos Receive {{$labels.job}} has a 99th percentile latency of {{ - $value }} seconds for requests. + description: Thanos Receive {{$labels.job}} has a 99th percentile latency of + {{ $value }} seconds for requests. + summary: Thanos Receive has high HTTP requests latency. expr: | ( histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~"thanos-receive.*", handler="receive"}[5m]))) > 10 @@ -175,8 +190,9 @@ groups: severity: critical - alert: ThanosReceiveHighReplicationFailures annotations: - message: Thanos Receive {{$labels.job}} is failing to replicate {{ $value | - humanize }}% of requests. + description: Thanos Receive {{$labels.job}} is failing to replicate {{ $value + | humanize }}% of requests. + summary: Thanos Receive is having high number of replication failures. expr: | thanos_receive_replication_factor > 1 and @@ -198,8 +214,9 @@ groups: severity: warning - alert: ThanosReceiveHighForwardRequestFailures annotations: - message: Thanos Receive {{$labels.job}} is failing to forward {{ $value | humanize - }}% of requests. + description: Thanos Receive {{$labels.job}} is failing to forward {{ $value + | humanize }}% of requests. + summary: Thanos Receive is failing to forward requests. expr: | ( sum by (job) (rate(thanos_receive_forward_requests_total{result="error", job=~"thanos-receive.*"}[5m])) @@ -211,8 +228,9 @@ groups: severity: warning - alert: ThanosReceiveHighHashringFileRefreshFailures annotations: - message: Thanos Receive {{$labels.job}} is failing to refresh hashring file, + description: Thanos Receive {{$labels.job}} is failing to refresh hashring file, {{ $value | humanize }} of attempts failed. + summary: Thanos Receive is failing to refresh hasring file. expr: | ( sum by (job) (rate(thanos_receive_hashrings_file_errors_total{job=~"thanos-receive.*"}[5m])) @@ -225,8 +243,9 @@ groups: severity: warning - alert: ThanosReceiveConfigReloadFailure annotations: - message: Thanos Receive {{$labels.job}} has not been able to reload hashring + description: Thanos Receive {{$labels.job}} has not been able to reload hashring configurations. + summary: Thanos Receive has not been able to reload configuration. 
expr: avg(thanos_receive_config_last_reload_successful{job=~"thanos-receive.*"}) by (job) != 1 for: 5m @@ -234,8 +253,9 @@ groups: severity: warning - alert: ThanosReceiveNoUpload annotations: - message: Thanos Receive {{ $labels.instance }} of {{$labels.job}} has not uploaded - latest data to object storage. + description: Thanos Receive {{ $labels.instance }} of {{$labels.job}} has not + uploaded latest data to object storage. + summary: Thanos Receive has not uploaded latest data to object storage. expr: | (up{job=~"thanos-receive.*"} - 1) + on (instance) # filters to only alert on current instance last 3h @@ -247,7 +267,9 @@ groups: rules: - alert: ThanosSidecarPrometheusDown annotations: - message: Thanos Sidecar {{$labels.job}} {{$labels.pod}} cannot connect to Prometheus. + description: Thanos Sidecar {{$labels.job}} {{$labels.pod}} cannot connect to + Prometheus. + summary: Thanos Sidecar cannot connect to Prometheus expr: | sum by (job, pod) (thanos_sidecar_prometheus_up{job=~"thanos-sidecar.*"} == 0) for: 5m @@ -255,8 +277,9 @@ groups: severity: critical - alert: ThanosSidecarUnhealthy annotations: - message: Thanos Sidecar {{$labels.job}} {{$labels.pod}} is unhealthy for {{ - $value }} seconds. + description: Thanos Sidecar {{$labels.job}} {{$labels.pod}} is unhealthy for + {{ $value }} seconds. + summary: Thanos Sidecar is unhealthy. expr: | time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job=~"thanos-sidecar.*"}) by (job, pod) >= 600 labels: @@ -265,8 +288,9 @@ groups: rules: - alert: ThanosStoreGrpcErrorRate annotations: - message: Thanos Store {{$labels.job}} is failing to handle {{ $value | humanize + description: Thanos Store {{$labels.job}} is failing to handle {{ $value | humanize }}% of requests. + summary: Thanos Store is failing to handle qrpcd requests. expr: | ( sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-store.*"}[5m])) @@ -279,8 +303,9 @@ groups: severity: warning - alert: ThanosStoreSeriesGateLatencyHigh annotations: - message: Thanos Store {{$labels.job}} has a 99th percentile latency of {{ $value - }} seconds for store series gate requests. + description: Thanos Store {{$labels.job}} has a 99th percentile latency of {{ + $value }} seconds for store series gate requests. + summary: Thanos Store has high latency for store series gate requests. expr: | ( histogram_quantile(0.9, sum by (job, le) (rate(thanos_bucket_store_series_gate_duration_seconds_bucket{job=~"thanos-store.*"}[5m]))) > 2 @@ -292,8 +317,9 @@ groups: severity: warning - alert: ThanosStoreBucketHighOperationFailures annotations: - message: Thanos Store {{$labels.job}} Bucket is failing to execute {{ $value + description: Thanos Store {{$labels.job}} Bucket is failing to execute {{ $value | humanize }}% of operations. + summary: Thanos Store Bucket is failing to execute operations. expr: | ( sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~"thanos-store.*"}[5m])) @@ -306,8 +332,9 @@ groups: severity: warning - alert: ThanosStoreObjstoreOperationLatencyHigh annotations: - message: Thanos Store {{$labels.job}} Bucket has a 99th percentile latency of - {{ $value }} seconds for the bucket operations. + description: Thanos Store {{$labels.job}} Bucket has a 99th percentile latency + of {{ $value }} seconds for the bucket operations. + summary: Thanos Store is having high latency for bucket operations. 
expr: | ( histogram_quantile(0.9, sum by (job, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=~"thanos-store.*"}[5m]))) > 2 @@ -321,7 +348,9 @@ groups: rules: - alert: ThanosRuleQueueIsDroppingAlerts annotations: - message: Thanos Rule {{$labels.job}} {{$labels.pod}} is failing to queue alerts. + description: Thanos Rule {{$labels.job}} {{$labels.pod}} is failing to queue + alerts. + summary: Thanos Rule is failing to queue alerts. expr: | sum by (job) (rate(thanos_alert_queue_alerts_dropped_total{job=~"thanos-rule.*"}[5m])) > 0 for: 5m @@ -329,8 +358,9 @@ groups: severity: critical - alert: ThanosRuleSenderIsFailingAlerts annotations: - message: Thanos Rule {{$labels.job}} {{$labels.pod}} is failing to send alerts - to alertmanager. + description: Thanos Rule {{$labels.job}} {{$labels.pod}} is failing to send + alerts to alertmanager. + summary: Thanos Rule is failing to send alerts to alertmanager. expr: | sum by (job) (rate(thanos_alert_sender_alerts_dropped_total{job=~"thanos-rule.*"}[5m])) > 0 for: 5m @@ -338,8 +368,9 @@ groups: severity: critical - alert: ThanosRuleHighRuleEvaluationFailures annotations: - message: Thanos Rule {{$labels.job}} {{$labels.pod}} is failing to evaluate + description: Thanos Rule {{$labels.job}} {{$labels.pod}} is failing to evaluate rules. + summary: Thanos Rule is failing to evaluate rules. expr: | ( sum by (job) (rate(prometheus_rule_evaluation_failures_total{job=~"thanos-rule.*"}[5m])) @@ -352,8 +383,9 @@ groups: severity: critical - alert: ThanosRuleHighRuleEvaluationWarnings annotations: - message: Thanos Rule {{$labels.job}} {{$labels.pod}} has high number of evaluation - warnings. + description: Thanos Rule {{$labels.job}} {{$labels.pod}} has high number of + evaluation warnings. + summary: Thanos Rule has high number of evaluation warnings. expr: | sum by (job) (rate(thanos_rule_evaluation_with_warnings_total{job=~"thanos-rule.*"}[5m])) > 0 for: 15m @@ -361,8 +393,9 @@ groups: severity: info - alert: ThanosRuleRuleEvaluationLatencyHigh annotations: - message: Thanos Rule {{$labels.job}}/{{$labels.pod}} has higher evaluation latency - than interval for {{$labels.rule_group}}. + description: Thanos Rule {{$labels.job}}/{{$labels.pod}} has higher evaluation + latency than interval for {{$labels.rule_group}}. + summary: Thanos Rule has high rule evaluation latency. expr: | ( sum by (job, pod, rule_group) (prometheus_rule_group_last_duration_seconds{job=~"thanos-rule.*"}) @@ -374,8 +407,9 @@ groups: severity: warning - alert: ThanosRuleGrpcErrorRate annotations: - message: Thanos Rule {{$labels.job}} is failing to handle {{ $value | humanize + description: Thanos Rule {{$labels.job}} is failing to handle {{ $value | humanize }}% of requests. + summary: Thanos Rule is failing to handle grpc requests. expr: | ( sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-rule.*"}[5m])) @@ -388,7 +422,8 @@ groups: severity: warning - alert: ThanosRuleConfigReloadFailure annotations: - message: Thanos Rule {{$labels.job}} has not been able to reload its configuration. + description: Thanos Rule {{$labels.job}} has not been able to reload its configuration. + summary: Thanos Rule has not been able to reload configuration. 
expr: avg(thanos_rule_config_last_reload_successful{job=~"thanos-rule.*"}) by (job) != 1 for: 5m @@ -396,8 +431,9 @@ groups: severity: info - alert: ThanosRuleQueryHighDNSFailures annotations: - message: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% of failing + description: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% of failing DNS queries for query endpoints. + summary: Thanos Rule is having high number of DNS failures. expr: | ( sum by (job) (rate(thanos_ruler_query_apis_dns_failures_total{job=~"thanos-rule.*"}[5m])) @@ -410,8 +446,9 @@ groups: severity: warning - alert: ThanosRuleAlertmanagerHighDNSFailures annotations: - message: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% of failing + description: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% of failing DNS queries for Alertmanager endpoints. + summary: Thanos Rule is having high number of DNS failures. expr: | ( sum by (job) (rate(thanos_ruler_alertmanagers_dns_failures_total{job=~"thanos-rule.*"}[5m])) @@ -424,8 +461,9 @@ groups: severity: warning - alert: ThanosRuleNoEvaluationFor10Intervals annotations: - message: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% rule groups + description: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% rule groups that did not evaluate for at least 10x of their expected interval. + summary: Thanos Rule has rule groups that did not evaluate for 10 intervals. expr: | time() - max by (job, group) (prometheus_rule_group_last_evaluation_timestamp_seconds{job=~"thanos-rule.*"}) > @@ -435,8 +473,9 @@ groups: severity: info - alert: ThanosNoRuleEvaluations annotations: - message: Thanos Rule {{$labels.job}} did not perform any rule evaluations in - the past 2 minutes. + description: Thanos Rule {{$labels.job}} did not perform any rule evaluations + in the past 2 minutes. + summary: Thanos Rule did not perform any rule evaluations. expr: | sum(rate(prometheus_rule_evaluations_total{job=~"thanos-rule.*"}[2m])) <= 0 and @@ -448,7 +487,8 @@ groups: rules: - alert: ThanosCompactIsDown annotations: - message: ThanosCompact has disappeared from Prometheus target discovery. + description: ThanosCompact has disappeared from Prometheus target discovery. + summary: thanos component has disappeared from Prometheus target discovery. expr: | absent(up{job=~"thanos-compact.*"} == 1) for: 5m @@ -456,7 +496,8 @@ groups: severity: critical - alert: ThanosQueryIsDown annotations: - message: ThanosQuery has disappeared from Prometheus target discovery. + description: ThanosQuery has disappeared from Prometheus target discovery. + summary: thanos component has disappeared from Prometheus target discovery. expr: | absent(up{job=~"thanos-query.*"} == 1) for: 5m @@ -464,7 +505,8 @@ groups: severity: critical - alert: ThanosReceiveIsDown annotations: - message: ThanosReceive has disappeared from Prometheus target discovery. + description: ThanosReceive has disappeared from Prometheus target discovery. + summary: thanos component has disappeared from Prometheus target discovery. expr: | absent(up{job=~"thanos-receive.*"} == 1) for: 5m @@ -472,7 +514,8 @@ groups: severity: critical - alert: ThanosRuleIsDown annotations: - message: ThanosRule has disappeared from Prometheus target discovery. + description: ThanosRule has disappeared from Prometheus target discovery. + summary: thanos component has disappeared from Prometheus target discovery. 
expr: | absent(up{job=~"thanos-rule.*"} == 1) for: 5m @@ -480,7 +523,8 @@ groups: severity: critical - alert: ThanosSidecarIsDown annotations: - message: ThanosSidecar has disappeared from Prometheus target discovery. + description: ThanosSidecar has disappeared from Prometheus target discovery. + summary: thanos component has disappeared from Prometheus target discovery. expr: | absent(up{job=~"thanos-sidecar.*"} == 1) for: 5m @@ -488,7 +532,8 @@ groups: severity: critical - alert: ThanosStoreIsDown annotations: - message: ThanosStore has disappeared from Prometheus target discovery. + description: ThanosStore has disappeared from Prometheus target discovery. + summary: thanos component has disappeared from Prometheus target discovery. expr: | absent(up{job=~"thanos-store.*"} == 1) for: 5m @@ -498,7 +543,8 @@ groups: rules: - alert: ThanosBucketReplicateIsDown annotations: - message: Thanos Replicate has disappeared from Prometheus target discovery. + description: Thanos Replicate has disappeared from Prometheus target discovery. + summary: Thanos Replicate has disappeared from Prometheus target discovery. expr: | absent(up{job=~"thanos-bucket-replicate.*"}) for: 5m @@ -506,8 +552,9 @@ groups: severity: critical - alert: ThanosBucketReplicateErrorRate annotations: - message: Thanos Replicate failing to run, {{ $value | humanize }}% of attempts + description: Thanos Replicate failing to run, {{ $value | humanize }}% of attempts failed. + summary: Thanose Replicate is failing to run. expr: | ( sum(rate(thanos_replicate_replication_runs_total{result="error", job=~"thanos-bucket-replicate.*"}[5m])) @@ -519,8 +566,9 @@ groups: severity: critical - alert: ThanosBucketReplicateRunLatency annotations: - message: Thanos Replicate {{$labels.job}} has a 99th percentile latency of {{ - $value }} seconds for the replicate operations. + description: Thanos Replicate {{$labels.job}} has a 99th percentile latency + of {{ $value }} seconds for the replicate operations. + summary: Thanos Replicate has a high latency for replicate operations. expr: | ( histogram_quantile(0.9, sum by (job, le) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~"thanos-bucket-replicate.*"}[5m]))) > 20 diff --git a/examples/alerts/tests.yaml b/examples/alerts/tests.yaml index adac87b9a4a..3d75fd7353f 100644 --- a/examples/alerts/tests.yaml +++ b/examples/alerts/tests.yaml @@ -79,13 +79,15 @@ tests: job: thanos-sidecar pod: thanos-sidecar-pod-0 exp_annotations: - message: 'Thanos Sidecar thanos-sidecar thanos-sidecar-pod-0 is unhealthy for 600 seconds.' + description: 'Thanos Sidecar thanos-sidecar thanos-sidecar-pod-0 is unhealthy for 600 seconds.' + summary: 'Thanos Sidecar is unhealthy.' - exp_labels: severity: critical job: thanos-sidecar pod: thanos-sidecar-pod-1 exp_annotations: - message: 'Thanos Sidecar thanos-sidecar thanos-sidecar-pod-1 is unhealthy for 600 seconds.' + description: 'Thanos Sidecar thanos-sidecar thanos-sidecar-pod-1 is unhealthy for 600 seconds.' + summary: 'Thanos Sidecar is unhealthy.' - eval_time: 11m alertname: ThanosSidecarUnhealthy exp_alerts: @@ -94,13 +96,15 @@ tests: job: thanos-sidecar pod: thanos-sidecar-pod-0 exp_annotations: - message: 'Thanos Sidecar thanos-sidecar thanos-sidecar-pod-0 is unhealthy for 660 seconds.' + description: 'Thanos Sidecar thanos-sidecar thanos-sidecar-pod-0 is unhealthy for 660 seconds.' + summary: 'Thanos Sidecar is unhealthy.' 
- exp_labels: severity: critical job: thanos-sidecar pod: thanos-sidecar-pod-1 exp_annotations: - message: 'Thanos Sidecar thanos-sidecar thanos-sidecar-pod-1 is unhealthy for 660 seconds.' + description: 'Thanos Sidecar thanos-sidecar thanos-sidecar-pod-1 is unhealthy for 660 seconds.' + summary: 'Thanos Sidecar is unhealthy.' - eval_time: 12m alertname: ThanosSidecarUnhealthy exp_alerts: @@ -109,10 +113,12 @@ tests: job: thanos-sidecar pod: thanos-sidecar-pod-0 exp_annotations: - message: 'Thanos Sidecar thanos-sidecar thanos-sidecar-pod-0 is unhealthy for 720 seconds.' + description: 'Thanos Sidecar thanos-sidecar thanos-sidecar-pod-0 is unhealthy for 720 seconds.' + summary: 'Thanos Sidecar is unhealthy.' - exp_labels: severity: critical job: thanos-sidecar pod: thanos-sidecar-pod-1 exp_annotations: - message: 'Thanos Sidecar thanos-sidecar thanos-sidecar-pod-1 is unhealthy for 720 seconds.' + description: 'Thanos Sidecar thanos-sidecar thanos-sidecar-pod-1 is unhealthy for 720 seconds.' + summary: 'Thanos Sidecar is unhealthy.' diff --git a/mixin/alerts/absent.libsonnet b/mixin/alerts/absent.libsonnet index 4c7bf6bdff1..ba1e56c3ff4 100644 --- a/mixin/alerts/absent.libsonnet +++ b/mixin/alerts/absent.libsonnet @@ -26,7 +26,8 @@ severity: 'critical', }, annotations: { - message: '%s has disappeared from Prometheus target discovery.' % name, + description: '%s has disappeared from Prometheus target discovery.' % name, + summary: 'thanos component has disappeared from Prometheus target discovery.', }, } for name in std.objectFields(thanos.jobs) diff --git a/mixin/alerts/bucket_replicate.libsonnet b/mixin/alerts/bucket_replicate.libsonnet index 7517187c4e2..6235f133761 100644 --- a/mixin/alerts/bucket_replicate.libsonnet +++ b/mixin/alerts/bucket_replicate.libsonnet @@ -20,13 +20,15 @@ severity: 'critical', }, annotations: { - message: 'Thanos Replicate has disappeared from Prometheus target discovery.', + description: 'Thanos Replicate has disappeared from Prometheus target discovery.', + summary: 'Thanos Replicate has disappeared from Prometheus target discovery.', }, }, { alert: 'ThanosBucketReplicateErrorRate', annotations: { - message: 'Thanos Replicate failing to run, {{ $value | humanize }}% of attempts failed.', + description: 'Thanos Replicate failing to run, {{ $value | humanize }}% of attempts failed.', + summary: 'Thanose Replicate is failing to run.', }, expr: ||| ( @@ -43,7 +45,8 @@ { alert: 'ThanosBucketReplicateRunLatency', annotations: { - message: 'Thanos Replicate {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for the replicate operations.', + description: 'Thanos Replicate {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for the replicate operations.', + summary: 'Thanos Replicate has a high latency for replicate operations.', }, expr: ||| ( diff --git a/mixin/alerts/compact.libsonnet b/mixin/alerts/compact.libsonnet index 3fb8f474a5d..decc6d184b2 100644 --- a/mixin/alerts/compact.libsonnet +++ b/mixin/alerts/compact.libsonnet @@ -13,7 +13,8 @@ { alert: 'ThanosCompactMultipleRunning', annotations: { - message: 'No more than one Thanos Compact instance should be running at once. There are {{ $value }}', + description: 'No more than one Thanos Compact instance should be running at once. 
There are {{ $value }}', + summary: 'Thanos Compact has multiple instances running.', }, expr: 'sum(up{%(selector)s}) > 1' % thanos.compact, 'for': '5m', @@ -24,7 +25,8 @@ { alert: 'ThanosCompactHalted', annotations: { - message: 'Thanos Compact {{$labels.job}} has failed to run and now is halted.', + description: 'Thanos Compact {{$labels.job}} has failed to run and now is halted.', + summary: 'Thanos Compact has failed to run ans is now halted.', }, expr: 'thanos_compactor_halted{%(selector)s} == 1' % thanos.compact, 'for': '5m', @@ -35,7 +37,8 @@ { alert: 'ThanosCompactHighCompactionFailures', annotations: { - message: 'Thanos Compact {{$labels.job}} is failing to execute {{ $value | humanize }}% of compactions.', + description: 'Thanos Compact {{$labels.job}} is failing to execute {{ $value | humanize }}% of compactions.', + summary: 'Thanos Compact is failing to execute compactions.', }, expr: ||| ( @@ -53,7 +56,8 @@ { alert: 'ThanosCompactBucketHighOperationFailures', annotations: { - message: 'Thanos Compact {{$labels.job}} Bucket is failing to execute {{ $value | humanize }}% of operations.', + description: 'Thanos Compact {{$labels.job}} Bucket is failing to execute {{ $value | humanize }}% of operations.', + summary: 'Thanos Compact Bucket is having a high number of operation failures.', }, expr: ||| ( @@ -71,7 +75,8 @@ { alert: 'ThanosCompactHasNotRun', annotations: { - message: 'Thanos Compact {{$labels.job}} has not uploaded anything for 24 hours.', + description: 'Thanos Compact {{$labels.job}} has not uploaded anything for 24 hours.', + summary: 'Thanos Compact has not uploaded anything for last 24 hours.', }, expr: '(time() - max(max_over_time(thanos_objstore_bucket_last_successful_upload_time{%(selector)s}[24h]))) / 60 / 60 > 24' % thanos.compact, labels: { diff --git a/mixin/alerts/query.libsonnet b/mixin/alerts/query.libsonnet index 6326c846a7c..fdfd5f611a1 100644 --- a/mixin/alerts/query.libsonnet +++ b/mixin/alerts/query.libsonnet @@ -16,7 +16,8 @@ { alert: 'ThanosQueryHttpRequestQueryErrorRateHigh', annotations: { - message: 'Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize }}% of "query" requests.', + description: 'Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize }}% of "query" requests.', + summary: 'Thanos Query is failing to handle requests.', }, expr: ||| ( @@ -33,7 +34,8 @@ { alert: 'ThanosQueryHttpRequestQueryRangeErrorRateHigh', annotations: { - message: 'Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize }}% of "query_range" requests.', + description: 'Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize }}% of "query_range" requests.', + summary: 'Thanos Query is failing to handle requests.', }, expr: ||| ( @@ -50,7 +52,8 @@ { alert: 'ThanosQueryGrpcServerErrorRate', annotations: { - message: 'Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize }}% of requests.', + description: 'Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize }}% of requests.', + summary: 'Thanos Query is failing to handle requests.', }, expr: ||| ( @@ -68,7 +71,8 @@ { alert: 'ThanosQueryGrpcClientErrorRate', annotations: { - message: 'Thanos Query {{$labels.job}} is failing to send {{ $value | humanize }}% of requests.', + description: 'Thanos Query {{$labels.job}} is failing to send {{ $value | humanize }}% of requests.', + summary: 'Thanos Query is failing to send requests.', }, expr: ||| ( @@ -85,7 +89,8 @@ { alert: 'ThanosQueryHighDNSFailures', annotations: { - 
message: 'Thanos Query {{$labels.job}} have {{ $value | humanize }}% of failing DNS queries for store endpoints.', + description: 'Thanos Query {{$labels.job}} have {{ $value | humanize }}% of failing DNS queries for store endpoints.', + summary: 'Thanos Query is having high number of DNS failures.', }, expr: ||| ( @@ -102,7 +107,8 @@ { alert: 'ThanosQueryInstantLatencyHigh', annotations: { - message: 'Thanos Query {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for instant queries.', + description: 'Thanos Query {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for instant queries.', + summary: 'Thanos Query has high latency for queries.', }, expr: ||| ( @@ -119,7 +125,8 @@ { alert: 'ThanosQueryRangeLatencyHigh', annotations: { - message: 'Thanos Query {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for range queries.', + description: 'Thanos Query {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for range queries.', + summary: 'Thanos Query has high latency for queries.', }, expr: ||| ( diff --git a/mixin/alerts/receive.libsonnet b/mixin/alerts/receive.libsonnet index 7c3dbee5e0e..e07bf84dbbb 100644 --- a/mixin/alerts/receive.libsonnet +++ b/mixin/alerts/receive.libsonnet @@ -15,7 +15,8 @@ { alert: 'ThanosReceiveHttpRequestErrorRateHigh', annotations: { - message: 'Thanos Receive {{$labels.job}} is failing to handle {{ $value | humanize }}% of requests.', + description: 'Thanos Receive {{$labels.job}} is failing to handle {{ $value | humanize }}% of requests.', + summary: 'Thanos Receive is failing to handle requests.', }, expr: ||| ( @@ -32,7 +33,8 @@ { alert: 'ThanosReceiveHttpRequestLatencyHigh', annotations: { - message: 'Thanos Receive {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for requests.', + description: 'Thanos Receive {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for requests.', + summary: 'Thanos Receive has high HTTP requests latency.', }, expr: ||| ( @@ -49,7 +51,8 @@ { alert: 'ThanosReceiveHighReplicationFailures', annotations: { - message: 'Thanos Receive {{$labels.job}} is failing to replicate {{ $value | humanize }}% of requests.', + description: 'Thanos Receive {{$labels.job}} is failing to replicate {{ $value | humanize }}% of requests.', + summary: 'Thanos Receive is having high number of replication failures.', }, expr: ||| thanos_receive_replication_factor > 1 @@ -76,7 +79,8 @@ { alert: 'ThanosReceiveHighForwardRequestFailures', annotations: { - message: 'Thanos Receive {{$labels.job}} is failing to forward {{ $value | humanize }}% of requests.', + description: 'Thanos Receive {{$labels.job}} is failing to forward {{ $value | humanize }}% of requests.', + summary: 'Thanos Receive is failing to forward requests.', }, expr: ||| ( @@ -93,7 +97,8 @@ { alert: 'ThanosReceiveHighHashringFileRefreshFailures', annotations: { - message: 'Thanos Receive {{$labels.job}} is failing to refresh hashring file, {{ $value | humanize }} of attempts failed.', + description: 'Thanos Receive {{$labels.job}} is failing to refresh hashring file, {{ $value | humanize }} of attempts failed.', + summary: 'Thanos Receive is failing to refresh hasring file.', }, expr: ||| ( @@ -111,7 +116,8 @@ { alert: 'ThanosReceiveConfigReloadFailure', annotations: { - message: 'Thanos Receive {{$labels.job}} has not been able to reload hashring configurations.', + description: 'Thanos Receive {{$labels.job}} has not been able to reload hashring configurations.', + summary: 
'Thanos Receive has not been able to reload configuration.', }, expr: 'avg(thanos_receive_config_last_reload_successful{%(selector)s}) by (job) != 1' % thanos.receive, 'for': '5m', @@ -122,7 +128,8 @@ { alert: 'ThanosReceiveNoUpload', annotations: { - message: 'Thanos Receive {{ $labels.instance }} of {{$labels.job}} has not uploaded latest data to object storage.', + description: 'Thanos Receive {{ $labels.instance }} of {{$labels.job}} has not uploaded latest data to object storage.', + summary: 'Thanos Receive has not uploaded latest data to object storage.', }, expr: ||| (up{%(selector)s} - 1) diff --git a/mixin/alerts/rule.libsonnet b/mixin/alerts/rule.libsonnet index 80f6ebd58b7..552625bd201 100644 --- a/mixin/alerts/rule.libsonnet +++ b/mixin/alerts/rule.libsonnet @@ -15,7 +15,8 @@ { alert: 'ThanosRuleQueueIsDroppingAlerts', annotations: { - message: 'Thanos Rule {{$labels.job}} {{$labels.pod}} is failing to queue alerts.', + description: 'Thanos Rule {{$labels.job}} {{$labels.pod}} is failing to queue alerts.', + summary: 'Thanos Rule is failing to queue alerts.', }, expr: ||| sum by (job) (rate(thanos_alert_queue_alerts_dropped_total{%(selector)s}[5m])) > 0 @@ -28,7 +29,8 @@ { alert: 'ThanosRuleSenderIsFailingAlerts', annotations: { - message: 'Thanos Rule {{$labels.job}} {{$labels.pod}} is failing to send alerts to alertmanager.', + description: 'Thanos Rule {{$labels.job}} {{$labels.pod}} is failing to send alerts to alertmanager.', + summary: 'Thanos Rule is failing to send alerts to alertmanager.', }, expr: ||| sum by (job) (rate(thanos_alert_sender_alerts_dropped_total{%(selector)s}[5m])) > 0 @@ -41,7 +43,8 @@ { alert: 'ThanosRuleHighRuleEvaluationFailures', annotations: { - message: 'Thanos Rule {{$labels.job}} {{$labels.pod}} is failing to evaluate rules.', + description: 'Thanos Rule {{$labels.job}} {{$labels.pod}} is failing to evaluate rules.', + summary: 'Thanos Rule is failing to evaluate rules.', }, expr: ||| ( @@ -60,7 +63,8 @@ { alert: 'ThanosRuleHighRuleEvaluationWarnings', annotations: { - message: 'Thanos Rule {{$labels.job}} {{$labels.pod}} has high number of evaluation warnings.', + description: 'Thanos Rule {{$labels.job}} {{$labels.pod}} has high number of evaluation warnings.', + summary: 'Thanos Rule has high number of evaluation warnings.', }, expr: ||| sum by (job) (rate(thanos_rule_evaluation_with_warnings_total{%(selector)s}[5m])) > 0 @@ -74,7 +78,8 @@ { alert: 'ThanosRuleRuleEvaluationLatencyHigh', annotations: { - message: 'Thanos Rule {{$labels.job}}/{{$labels.pod}} has higher evaluation latency than interval for {{$labels.rule_group}}.', + description: 'Thanos Rule {{$labels.job}}/{{$labels.pod}} has higher evaluation latency than interval for {{$labels.rule_group}}.', + summary: 'Thanos Rule has high rule evaluation latency.', }, expr: ||| ( @@ -91,7 +96,8 @@ { alert: 'ThanosRuleGrpcErrorRate', annotations: { - message: 'Thanos Rule {{$labels.job}} is failing to handle {{ $value | humanize }}% of requests.', + description: 'Thanos Rule {{$labels.job}} is failing to handle {{ $value | humanize }}% of requests.', + summary: 'Thanos Rule is failing to handle grpc requests.', }, expr: ||| ( @@ -109,7 +115,8 @@ { alert: 'ThanosRuleConfigReloadFailure', annotations: { - message: 'Thanos Rule {{$labels.job}} has not been able to reload its configuration.', + description: 'Thanos Rule {{$labels.job}} has not been able to reload its configuration.', + summary: 'Thanos Rule has not been able to reload configuration.', }, expr: 
'avg(thanos_rule_config_last_reload_successful{%(selector)s}) by (job) != 1' % thanos.rule, 'for': '5m', @@ -120,7 +127,8 @@ { alert: 'ThanosRuleQueryHighDNSFailures', annotations: { - message: 'Thanos Rule {{$labels.job}} has {{ $value | humanize }}% of failing DNS queries for query endpoints.', + description: 'Thanos Rule {{$labels.job}} has {{ $value | humanize }}% of failing DNS queries for query endpoints.', + summary: 'Thanos Rule is having a high number of DNS failures.', }, expr: ||| ( @@ -138,7 +146,8 @@ { alert: 'ThanosRuleAlertmanagerHighDNSFailures', annotations: { - message: 'Thanos Rule {{$labels.job}} has {{ $value | humanize }}% of failing DNS queries for Alertmanager endpoints.', + description: 'Thanos Rule {{$labels.job}} has {{ $value | humanize }}% of failing DNS queries for Alertmanager endpoints.', + summary: 'Thanos Rule is having a high number of DNS failures.', }, expr: ||| ( @@ -157,7 +166,8 @@ // NOTE: This alert will give false positive if no rules are configured. alert: 'ThanosRuleNoEvaluationFor10Intervals', annotations: { - message: 'Thanos Rule {{$labels.job}} has {{ $value | humanize }}% rule groups that did not evaluate for at least 10x of their expected interval.', + description: 'Thanos Rule {{$labels.job}} has {{ $value | humanize }}% rule groups that did not evaluate for at least 10x of their expected interval.', + summary: 'Thanos Rule has rule groups that did not evaluate for 10 intervals.', }, expr: ||| time() - max by (job, group) (prometheus_rule_group_last_evaluation_timestamp_seconds{%(selector)s}) @@ -173,7 +183,8 @@ { alert: 'ThanosNoRuleEvaluations', annotations: { - message: 'Thanos Rule {{$labels.job}} did not perform any rule evaluations in the past 2 minutes.', + description: 'Thanos Rule {{$labels.job}} did not perform any rule evaluations in the past 2 minutes.', + summary: 'Thanos Rule did not perform any rule evaluations.', }, expr: ||| sum(rate(prometheus_rule_evaluations_total{%(selector)s}[2m])) <= 0 diff --git a/mixin/alerts/sidecar.libsonnet b/mixin/alerts/sidecar.libsonnet index e1790dbac63..20f217ce15e 100644 --- a/mixin/alerts/sidecar.libsonnet +++ b/mixin/alerts/sidecar.libsonnet @@ -11,7 +11,8 @@ { alert: 'ThanosSidecarPrometheusDown', annotations: { - message: 'Thanos Sidecar {{$labels.job}} {{$labels.pod}} cannot connect to Prometheus.', + description: 'Thanos Sidecar {{$labels.job}} {{$labels.pod}} cannot connect to Prometheus.', + summary: 'Thanos Sidecar cannot connect to Prometheus.', }, expr: ||| sum by (job, pod) (thanos_sidecar_prometheus_up{%(selector)s} == 0) @@ -24,7 +25,8 @@ { alert: 'ThanosSidecarUnhealthy', annotations: { - message: 'Thanos Sidecar {{$labels.job}} {{$labels.pod}} is unhealthy for {{ $value }} seconds.', + description: 'Thanos Sidecar {{$labels.job}} {{$labels.pod}} is unhealthy for {{ $value }} seconds.', + summary: 'Thanos Sidecar is unhealthy.', }, expr: ||| time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{%(selector)s}) by (job, pod) >= 600 diff --git a/mixin/alerts/store.libsonnet b/mixin/alerts/store.libsonnet index 04d723ffaa5..f895b4602f5 100644 --- a/mixin/alerts/store.libsonnet +++ b/mixin/alerts/store.libsonnet @@ -16,7 +16,8 @@ { alert: 'ThanosStoreGrpcErrorRate', annotations: { - message: 'Thanos Store {{$labels.job}} is failing to handle {{ $value | humanize }}% of requests.', + description: 'Thanos Store {{$labels.job}} is failing to handle {{ $value | humanize }}% of requests.', + summary: 'Thanos Store is failing to handle grpc requests.', }, expr: ||| ( @@ -34,7
+35,8 @@ { alert: 'ThanosStoreSeriesGateLatencyHigh', annotations: { - message: 'Thanos Store {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for store series gate requests.', + description: 'Thanos Store {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for store series gate requests.', + summary: 'Thanos Store has high latency for store series gate requests.', }, expr: ||| ( @@ -51,7 +53,8 @@ { alert: 'ThanosStoreBucketHighOperationFailures', annotations: { - message: 'Thanos Store {{$labels.job}} Bucket is failing to execute {{ $value | humanize }}% of operations.', + description: 'Thanos Store {{$labels.job}} Bucket is failing to execute {{ $value | humanize }}% of operations.', + summary: 'Thanos Store Bucket is failing to execute operations.', }, expr: ||| ( @@ -69,7 +72,8 @@ { alert: 'ThanosStoreObjstoreOperationLatencyHigh', annotations: { - message: 'Thanos Store {{$labels.job}} Bucket has a 99th percentile latency of {{ $value }} seconds for the bucket operations.', + description: 'Thanos Store {{$labels.job}} Bucket has a 99th percentile latency of {{ $value }} seconds for the bucket operations.', + summary: 'Thanos Store is having high latency for bucket operations.', }, expr: ||| ( From d17cd0e89c6ce696d86813026c8278a5759da507 Mon Sep 17 00:00:00 2001 From: Ben Ye Date: Mon, 17 Aug 2020 09:19:13 -0400 Subject: [PATCH 2/6] add instrumentation middlewares to query frontend (#3030) Signed-off-by: Ben Ye --- cmd/thanos/query-frontend.go | 41 ++++++++++++++++++++++++++----- docs/components/query-frontend.md | 7 ++++++ 2 files changed, 42 insertions(+), 6 deletions(-) diff --git a/cmd/thanos/query-frontend.go b/cmd/thanos/query-frontend.go index f7e376d97cb..2e05cb62b4b 100644 --- a/cmd/thanos/query-frontend.go +++ b/cmd/thanos/query-frontend.go @@ -7,6 +7,7 @@ import ( "net/http" "time" + "github.com/NYTimes/gziphandler" "github.com/cortexproject/cortex/pkg/querier/frontend" "github.com/cortexproject/cortex/pkg/querier/queryrange" "github.com/go-kit/kit/log" @@ -22,10 +23,14 @@ import ( "github.com/thanos-io/thanos/pkg/component" "github.com/thanos-io/thanos/pkg/extflag" "github.com/thanos-io/thanos/pkg/extprom" + extpromhttp "github.com/thanos-io/thanos/pkg/extprom/http" + "github.com/thanos-io/thanos/pkg/logging" "github.com/thanos-io/thanos/pkg/prober" "github.com/thanos-io/thanos/pkg/queryfrontend" "github.com/thanos-io/thanos/pkg/queryfrontend/cache" httpserver "github.com/thanos-io/thanos/pkg/server/http" + "github.com/thanos-io/thanos/pkg/server/http/middleware" + "github.com/thanos-io/thanos/pkg/tracing" ) type queryFrontendConfig struct { @@ -35,6 +40,8 @@ type queryFrontendConfig struct { downstreamURL string compressResponses bool LogQueriesLongerThan time.Duration + + requestLoggingDecision string } type queryRangeConfig struct { @@ -77,6 +84,8 @@ func (c *queryFrontendConfig) registerFlag(cmd *kingpin.CmdClause) { cmd.Flag("query-frontend.log_queries_longer_than", "Log queries that are slower than the specified duration. "+ "Set to 0 to disable. Set to < 0 to enable on all queries.").Default("0").DurationVar(&c.LogQueriesLongerThan) + + cmd.Flag("log.request.decision", "Request Logging for logging the start and end of requests. LogFinishCall is enabled by default. LogFinishCall : Logs the finish call of the requests. LogStartAndFinishCall : Logs the start and finish call of the requests. 
NoLogCall : Disable request logging.").Default("LogFinishCall").EnumVar(&c.requestLoggingDecision, "NoLogCall", "LogFinishCall", "LogStartAndFinishCall") } func registerQueryFrontend(m map[string]setupFunc, app *kingpin.Application) { @@ -85,8 +94,8 @@ func registerQueryFrontend(m map[string]setupFunc, app *kingpin.Application) { conf := &queryFrontendConfig{} conf.registerFlag(cmd) - m[comp.String()] = func(g *run.Group, logger log.Logger, reg *prometheus.Registry, _ opentracing.Tracer, _ <-chan struct{}, _ bool) error { - return runQueryFrontend(g, logger, reg, conf, comp) + m[comp.String()] = func(g *run.Group, logger log.Logger, reg *prometheus.Registry, tracer opentracing.Tracer, _ <-chan struct{}, _ bool) error { + return runQueryFrontend(g, logger, reg, tracer, conf, comp) } } @@ -94,6 +103,7 @@ func runQueryFrontend( g *run.Group, logger log.Logger, reg *prometheus.Registry, + tracer opentracing.Tracer, conf *queryFrontendConfig, comp component.Component, ) error { @@ -153,6 +163,13 @@ func runQueryFrontend( prober.NewInstrumentation(comp, logger, extprom.WrapRegistererWithPrefix("thanos_", reg)), ) + // Configure Request Logging for HTTP calls. + opts := []logging.Option{logging.WithDecider(func() logging.Decision { + return logging.LogDecision[conf.requestLoggingDecision] + })} + logMiddleware := logging.NewHTTPServerMiddleware(logger, opts...) + ins := extpromhttp.NewInstrumentationMiddleware(reg) + // Start metrics HTTP server. { srv := httpserver.New(logger, reg, comp, httpProbe, @@ -160,14 +177,26 @@ func runQueryFrontend( httpserver.WithGracePeriod(time.Duration(conf.http.gracePeriod)), ) - injectf := func(f http.HandlerFunc) http.HandlerFunc { + instr := func(f http.HandlerFunc) http.HandlerFunc { hf := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - // Cortex frontend middlewares require orgID. - f.ServeHTTP(w, r.WithContext(user.InjectOrgID(r.Context(), "fake"))) + name := "query-frontend" + ins.NewHandler( + name, + logMiddleware.HTTPMiddleware( + name, + tracing.HTTPMiddleware( + tracer, + name, + logger, + gziphandler.GzipHandler(middleware.RequestID(f)), + ), + ), + // Cortex frontend middlewares require orgID. + ).ServeHTTP(w, r.WithContext(user.InjectOrgID(r.Context(), "fake"))) }) return hf } - srv.Handle("/", injectf(fe.Handler().ServeHTTP)) + srv.Handle("/", instr(fe.Handler().ServeHTTP)) g.Add(func() error { statusProber.Healthy() diff --git a/docs/components/query-frontend.md b/docs/components/query-frontend.md index 23ffa9eb743..80211e71e63 100644 --- a/docs/components/query-frontend.md +++ b/docs/components/query-frontend.md @@ -125,5 +125,12 @@ Flags: Log queries that are slower than the specified duration. Set to 0 to disable. Set to < 0 to enable on all queries. + --log.request.decision=LogFinishCall + Request Logging for logging the start and end of + requests. LogFinishCall is enabled by default. + LogFinishCall : Logs the finish call of the + requests. LogStartAndFinishCall : Logs the start + and finish call of the requests. NoLogCall : + Disable request logging. 
``` From ef3bbaef313e4d2e77f30e7d76ad18466b48d1f1 Mon Sep 17 00:00:00 2001 From: Ben Ye Date: Mon, 17 Aug 2020 12:27:41 -0400 Subject: [PATCH 3/6] Add more validations for memcached client config (#3034) * Add more rules when validting memcached config Signed-off-by: Ben Ye * update error message description Co-authored-by: Marco Pracucci Signed-off-by: Ben Ye Co-authored-by: Marco Pracucci --- pkg/cacheutil/memcached_client.go | 16 ++++++++++++++-- pkg/cacheutil/memcached_client_test.go | 23 +++++++++++++++++++++-- 2 files changed, 35 insertions(+), 4 deletions(-) diff --git a/pkg/cacheutil/memcached_client.go b/pkg/cacheutil/memcached_client.go index 90431d1251d..1ac20d436b6 100644 --- a/pkg/cacheutil/memcached_client.go +++ b/pkg/cacheutil/memcached_client.go @@ -38,8 +38,10 @@ const ( ) var ( - errMemcachedAsyncBufferFull = errors.New("the async buffer is full") - errMemcachedConfigNoAddrs = errors.New("no memcached addresses provided") + errMemcachedAsyncBufferFull = errors.New("the async buffer is full") + errMemcachedConfigNoAddrs = errors.New("no memcached addresses provided") + errMemcachedDNSUpdateIntervalNotPositive = errors.New("DNS provider update interval must be positive") + errMemcachedMaxAsyncConcurrencyNotPositive = errors.New("max async concurrency must be positive") defaultMemcachedClientConfig = MemcachedClientConfig{ Timeout: 500 * time.Millisecond, @@ -120,6 +122,16 @@ func (c *MemcachedClientConfig) validate() error { return errMemcachedConfigNoAddrs } + // Avoid panic in time ticker. + if c.DNSProviderUpdateInterval <= 0 { + return errMemcachedDNSUpdateIntervalNotPositive + } + + // Set async only available when MaxAsyncConcurrency > 0. + if c.MaxAsyncConcurrency <= 0 { + return errMemcachedMaxAsyncConcurrencyNotPositive + } + return nil } diff --git a/pkg/cacheutil/memcached_client_test.go b/pkg/cacheutil/memcached_client_test.go index 02a96bf61f4..61729e7a575 100644 --- a/pkg/cacheutil/memcached_client_test.go +++ b/pkg/cacheutil/memcached_client_test.go @@ -27,16 +27,35 @@ func TestMemcachedClientConfig_validate(t *testing.T) { }{ "should pass on valid config": { config: MemcachedClientConfig{ - Addresses: []string{"127.0.0.1:11211"}, + Addresses: []string{"127.0.0.1:11211"}, + MaxAsyncConcurrency: 1, + DNSProviderUpdateInterval: time.Second, }, expected: nil, }, "should fail on no addresses": { config: MemcachedClientConfig{ - Addresses: []string{}, + Addresses: []string{}, + MaxAsyncConcurrency: 1, + DNSProviderUpdateInterval: time.Second, }, expected: errMemcachedConfigNoAddrs, }, + "should fail on max_async_concurrency <= 0": { + config: MemcachedClientConfig{ + Addresses: []string{"127.0.0.1:11211"}, + MaxAsyncConcurrency: 0, + DNSProviderUpdateInterval: time.Second, + }, + expected: errMemcachedMaxAsyncConcurrencyNotPositive, + }, + "should fail on dns_provider_update_interval <= 0": { + config: MemcachedClientConfig{ + Addresses: []string{"127.0.0.1:11211"}, + MaxAsyncConcurrency: 1, + }, + expected: errMemcachedDNSUpdateIntervalNotPositive, + }, } for testName, testData := range tests { From 032771b278722f7970f1e768f349f259bdeb56b5 Mon Sep 17 00:00:00 2001 From: Ben Ye Date: Tue, 18 Aug 2020 04:26:30 -0400 Subject: [PATCH 4/6] update storeMatch[] usage (#3044) Signed-off-by: Ben Ye --- CHANGELOG.md | 2 +- docs/components/query.md | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9bed66cb5e1..56aefcc1e8d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -47,7 +47,7 @@ We use *breaking* word for marking 
changes that are not backward compatible (rel - [#2893](https://github.com/thanos-io/thanos/pull/2893) Store: Rename metric `thanos_bucket_store_cached_postings_compression_time_seconds` to `thanos_bucket_store_cached_postings_compression_time_seconds_total`. - [#2915](https://github.com/thanos-io/thanos/pull/2915) Receive,Ruler: Enable TSDB directory locking by default. Add a new flag (`--tsdb.no-lockfile`) to override behavior. - [#2902](https://github.com/thanos-io/thanos/pull/2902) ui: React: Separate dedupe and partial response checkboxes per panel. -- [#2931](https://github.com/thanos-io/thanos/pull/2931) Query: Allow passing a `storeMatcher[]` to select matching stores when debugging the querier. See [documentation](https://thanos.io/components/query.md/#store-filtering) +- [#2931](https://github.com/thanos-io/thanos/pull/2931) Query: Allow passing a `storeMatch[]` to select matching stores when debugging the querier. See [documentation](https://thanos.io/components/query.md/#store-filtering) - [#2991](https://github.com/thanos-io/thanos/pull/2991) store: `operation` label value `getrange` changed to `get_range` for `thanos_store_bucket_cache_operation_requests_total` and `thanos_store_bucket_cache_operation_hits_total` to be consistent with bucket operation metrics. - [#2876](https://github.com/thanos-io/thanos/pull/2876) Receive,Ruler: Updated TSDB and switched to ChunkIterators instead of sample one, which avoids unnecessary decoding / encoding. diff --git a/docs/components/query.md b/docs/components/query.md index 94d7fe2fa7b..bf7808ac1b3 100644 --- a/docs/components/query.md +++ b/docs/components/query.md @@ -226,7 +226,7 @@ Keep in mind that the maximum number of concurrent queries that are handled by q It's possible to provide a set of matchers to the Querier api to select specific stores to be used during the query using the `storeMatch[]` parameter. It is useful when debugging a slow/broken store. It uses the same format as the matcher of [Prometheus' federate api](https://prometheus.io/docs/prometheus/latest/querying/api/#finding-series-by-label-matchers). -Note that at the moment the querier only supports the `__address__` which contain the address of the store as it is shown on the `/stores` endoint of the UI. +Note that at the moment the querier only supports the `__address__` which contain the address of the store as it is shown on the `/stores` endpoint of the UI. Example: ``` @@ -236,7 +236,7 @@ Example: ``` ``` -http://localhost:10901/api/v1/query?query=up&dedup=true&partial_response=true&storeMatch={__address__=~"prometheus-foo.*"} +http://localhost:10901/api/v1/query?query=up&dedup=true&partial_response=true&storeMatch[]={__address__=~"prometheus-foo.*"} ``` Will only return metrics from `prometheus-foo.thanos-sidecar:10901` From cd704d07f07a7fed2420ee697beb66ce7ec4dc80 Mon Sep 17 00:00:00 2001 From: Prem Kumar Date: Tue, 18 Aug 2020 15:14:24 +0530 Subject: [PATCH 5/6] ui: Fix too many redirects when using query frontend (#3049) Signed-off-by: Prem Kumar --- pkg/ui/query.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/ui/query.go b/pkg/ui/query.go index a7440ced4d9..53f7bd13396 100644 --- a/pkg/ui/query.go +++ b/pkg/ui/query.go @@ -84,7 +84,7 @@ func (q *Query) Register(r *route.Router, ins extpromhttp.InstrumentationMiddlew // and which breaks users with a --web.route-prefix that deviates from the path derived // from the external URL. 
r.Get("/new", func(w http.ResponseWriter, r *http.Request) { - http.Redirect(w, r, path.Join(GetWebPrefix(q.logger, q.externalPrefix, q.prefixHeader, r), "new")+"/", http.StatusFound) + http.Redirect(w, r, path.Join(GetWebPrefix(q.logger, q.externalPrefix, q.prefixHeader, r), "new")+"/graph", http.StatusFound) }) r.Get("/new/*filepath", instrf("react-static", q.serveReactUI)) From 73a75e638596d314ab922c244d8dfc43a37fd3df Mon Sep 17 00:00:00 2001 From: Max Neverov <1296281+mneverov@users.noreply.github.com> Date: Tue, 18 Aug 2020 20:46:58 +0200 Subject: [PATCH 6/6] Replace leaktest with goleak (#3029) Signed-off-by: Max Neverov --- cmd/thanos/receive.go | 1 + go.mod | 2 +- go.sum | 3 +- pkg/api/query/v1_test.go | 7 +-- pkg/block/block_test.go | 12 ++--- pkg/block/metadata/deletionmark_test.go | 9 ++-- pkg/cacheutil/cacheutil_test.go | 14 ++++++ pkg/cacheutil/memcached_client_test.go | 9 ---- .../memcached_server_selector_test.go | 13 +----- pkg/compact/downsample/aggr_test.go | 5 +- pkg/compact/downsample/downsample_test.go | 13 +++--- pkg/objstore/objtesting/foreach.go | 6 +-- pkg/pool/pool_test.go | 13 +++--- .../test-storeset-pre-v0.8.0/storeset_test.go | 16 +++---- pkg/query/querier_test.go | 21 --------- pkg/query/query_test.go | 14 ++++++ pkg/query/storeset_test.go | 8 +--- pkg/receive/config.go | 6 +-- pkg/receive/config_test.go | 46 +++++++++---------- pkg/receive/handler_test.go | 6 +-- pkg/receive/multitsdb_test.go | 5 +- pkg/receive/receive_test.go | 14 ++++++ pkg/reloader/reloader_test.go | 13 +++--- pkg/rules/manager_test.go | 12 +++-- pkg/rules/prometheus_test.go | 5 +- pkg/rules/rules_test.go | 4 ++ pkg/store/bucket_test.go | 23 ++++++---- pkg/store/cache/cache_test.go | 5 ++ pkg/store/cache/inmemory_test.go | 12 ----- pkg/store/cache/memcached_test.go | 2 - pkg/store/multitsdb_test.go | 13 +++--- pkg/store/prometheus_test.go | 27 ++++------- pkg/store/proxy_test.go | 28 +++++------ pkg/store/tsdb_test.go | 16 +++---- pkg/testutil/testutil.go | 25 ++++++++++ pkg/tracing/stackdriver/tracer_test.go | 8 ++-- 36 files changed, 222 insertions(+), 214 deletions(-) create mode 100644 pkg/cacheutil/cacheutil_test.go create mode 100644 pkg/query/query_test.go create mode 100644 pkg/receive/receive_test.go diff --git a/cmd/thanos/receive.go b/cmd/thanos/receive.go index f150faa7115..ef56d0a0441 100644 --- a/cmd/thanos/receive.go +++ b/cmd/thanos/receive.go @@ -373,6 +373,7 @@ func runReceive( if cw != nil { // Check the hashring configuration on before running the watcher. 
if err := cw.ValidateConfig(); err != nil { + cw.Stop() close(updates) return errors.Wrap(err, "failed to validate hashring configuration file") } diff --git a/go.mod b/go.mod index f93287a7ed9..63b24c27e39 100644 --- a/go.mod +++ b/go.mod @@ -16,7 +16,6 @@ require ( github.com/davecgh/go-spew v1.1.1 github.com/facette/natsort v0.0.0-20181210072756-2cd4dd1e2dcb github.com/fatih/structtag v1.1.0 - github.com/fortytw2/leaktest v1.3.0 github.com/fsnotify/fsnotify v1.4.7 github.com/go-kit/kit v0.10.0 github.com/go-openapi/strfmt v0.19.5 @@ -56,6 +55,7 @@ require ( go.elastic.co/apm/module/apmot v1.5.0 go.uber.org/atomic v1.6.0 go.uber.org/automaxprocs v1.2.0 + go.uber.org/goleak v1.1.0 golang.org/x/crypto v0.0.0-20200728195943-123391ffb6de golang.org/x/oauth2 v0.0.0-20200107190931-bf48bf16ab8d golang.org/x/sync v0.0.0-20200625203802-6e8e738ad208 diff --git a/go.sum b/go.sum index 69bb84b59aa..d99b5acb175 100644 --- a/go.sum +++ b/go.sum @@ -272,7 +272,6 @@ github.com/fatih/color v1.9.0/go.mod h1:eQcE1qtQxscV5RaZvpXrrb8Drkc3/DdQ+uUYCNjL github.com/fatih/structtag v1.1.0 h1:6j4mUV/ES2duvnAzKMFkN6/A5mCaNYPD3xfbAkLLOF8= github.com/fatih/structtag v1.1.0/go.mod h1:mBJUNpUnHmRKrKlQQlmCrh5PuhftFbNv8Ys4/aAZl94= github.com/fogleman/gg v1.2.1-0.20190220221249-0403632d5b90/go.mod h1:R/bRT+9gY/C5z7JzPU0zXsXHKM4/ayA+zqcVNZzPa1k= -github.com/fortytw2/leaktest v1.3.0 h1:u8491cBMTQ8ft8aeV+adlcytMZylmA5nnwwkRZjI8vw= github.com/fortytw2/leaktest v1.3.0/go.mod h1:jDsjWgpAGjm2CA7WthBh/CdZYEPF31XHquHwclZch5g= github.com/franela/goblin v0.0.0-20200105215937-c9ffbefa60db/go.mod h1:7dvUGVsVBjqR7JHJk0brhHOZYGmfBYOrK0ZhYMEtBr4= github.com/franela/goreq v0.0.0-20171204163338-bcd34c9993f8/go.mod h1:ZhphrRTfi2rbfLwlschooIH4+wKKDR4Pdxhh+TRoA20= @@ -1042,6 +1041,8 @@ go.uber.org/automaxprocs v1.2.0 h1:+RUihKM+nmYUoB9w0D0Ov5TJ2PpFO2FgenTxMJiZBZA= go.uber.org/automaxprocs v1.2.0/go.mod h1:YfO3fm683kQpzETxlTGZhGIVmXAhaw3gxeBADbpZtnU= go.uber.org/goleak v1.0.0 h1:qsup4IcBdlmsnGfqyLl4Ntn3C2XCCuKAE7DwHpScyUo= go.uber.org/goleak v1.0.0/go.mod h1:8a7PlsEVH3e/a/GLqe5IIrQx6GzcnRmZEufDUTk4A7A= +go.uber.org/goleak v1.1.0 h1:MJDxhkyAAWXEJf/y4NSOPYD/bBx7JAzIjUbv12/4FFs= +go.uber.org/goleak v1.1.0/go.mod h1:8a7PlsEVH3e/a/GLqe5IIrQx6GzcnRmZEufDUTk4A7A= go.uber.org/multierr v1.1.0/go.mod h1:wR5kodmAFQ0UK8QlbwjlSNy0Z68gJhDJUG5sjR94q/0= go.uber.org/multierr v1.3.0/go.mod h1:VgVr7evmIr6uPjLBxg28wmKNXyqE9akIJ5XnfpiKl+4= go.uber.org/multierr v1.4.0/go.mod h1:VgVr7evmIr6uPjLBxg28wmKNXyqE9akIJ5XnfpiKl+4= diff --git a/pkg/api/query/v1_test.go b/pkg/api/query/v1_test.go index 91c222a3d88..05a91dcc15d 100644 --- a/pkg/api/query/v1_test.go +++ b/pkg/api/query/v1_test.go @@ -29,7 +29,6 @@ import ( "testing" "time" - "github.com/fortytw2/leaktest" "github.com/prometheus/common/route" "github.com/prometheus/prometheus/pkg/labels" "github.com/prometheus/prometheus/pkg/timestamp" @@ -51,9 +50,11 @@ import ( "github.com/thanos-io/thanos/pkg/testutil/testpromcompatibility" ) -func TestEndpoints(t *testing.T) { - defer leaktest.CheckTimeout(t, 10*time.Second)() +func TestMain(m *testing.M) { + testutil.TolerantVerifyLeakMain(m) +} +func TestEndpoints(t *testing.T) { lbls := []labels.Labels{ { labels.Label{Name: "__name__", Value: "test_metric1"}, diff --git a/pkg/block/block_test.go b/pkg/block/block_test.go index 2c35197e7c9..1df38bce718 100644 --- a/pkg/block/block_test.go +++ b/pkg/block/block_test.go @@ -14,8 +14,8 @@ import ( "testing" "time" - "github.com/fortytw2/leaktest" "github.com/go-kit/kit/log" + "github.com/oklog/ulid" 
"github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promauto" promtest "github.com/prometheus/client_golang/prometheus/testutil" @@ -24,8 +24,6 @@ import ( "github.com/thanos-io/thanos/pkg/objstore" "github.com/thanos-io/thanos/pkg/testutil" "github.com/thanos-io/thanos/pkg/testutil/e2eutil" - - "github.com/oklog/ulid" ) func TestIsBlockDir(t *testing.T) { @@ -75,7 +73,7 @@ func TestIsBlockDir(t *testing.T) { } func TestUpload(t *testing.T) { - defer leaktest.CheckTimeout(t, 10*time.Second)() + defer testutil.TolerantVerifyLeak(t) ctx := context.Background() @@ -179,8 +177,7 @@ func TestUpload(t *testing.T) { } func TestDelete(t *testing.T) { - defer leaktest.CheckTimeout(t, 10*time.Second)() - + defer testutil.TolerantVerifyLeak(t) ctx := context.Background() tmpDir, err := ioutil.TempDir("", "test-block-delete") @@ -226,8 +223,7 @@ func TestDelete(t *testing.T) { } func TestMarkForDeletion(t *testing.T) { - defer leaktest.CheckTimeout(t, 10*time.Second)() - + defer testutil.TolerantVerifyLeak(t) ctx := context.Background() tmpDir, err := ioutil.TempDir("", "test-block-mark-for-delete") diff --git a/pkg/block/metadata/deletionmark_test.go b/pkg/block/metadata/deletionmark_test.go index 02f259f856d..220c9dc1718 100644 --- a/pkg/block/metadata/deletionmark_test.go +++ b/pkg/block/metadata/deletionmark_test.go @@ -13,16 +13,19 @@ import ( "testing" "time" - "github.com/fortytw2/leaktest" "github.com/oklog/ulid" "github.com/pkg/errors" + "go.uber.org/goleak" + "github.com/thanos-io/thanos/pkg/objstore" "github.com/thanos-io/thanos/pkg/testutil" ) -func TestReadDeletionMark(t *testing.T) { - defer leaktest.CheckTimeout(t, 10*time.Second)() +func TestMain(m *testing.M) { + goleak.VerifyTestMain(m) +} +func TestReadDeletionMark(t *testing.T) { ctx := context.Background() tmpDir, err := ioutil.TempDir("", "test-read-deletion-mark") diff --git a/pkg/cacheutil/cacheutil_test.go b/pkg/cacheutil/cacheutil_test.go new file mode 100644 index 00000000000..4c422fa4822 --- /dev/null +++ b/pkg/cacheutil/cacheutil_test.go @@ -0,0 +1,14 @@ +// Copyright (c) The Thanos Authors. +// Licensed under the Apache License 2.0. + +package cacheutil + +import ( + "testing" + + "go.uber.org/goleak" +) + +func TestMain(m *testing.M) { + goleak.VerifyTestMain(m) +} diff --git a/pkg/cacheutil/memcached_client_test.go b/pkg/cacheutil/memcached_client_test.go index 61729e7a575..209de5dfb76 100644 --- a/pkg/cacheutil/memcached_client_test.go +++ b/pkg/cacheutil/memcached_client_test.go @@ -10,7 +10,6 @@ import ( "time" "github.com/bradfitz/gomemcache/memcache" - "github.com/fortytw2/leaktest" "github.com/go-kit/kit/log" "github.com/pkg/errors" "github.com/prometheus/client_golang/prometheus" @@ -66,8 +65,6 @@ func TestMemcachedClientConfig_validate(t *testing.T) { } func TestNewMemcachedClient(t *testing.T) { - defer leaktest.CheckTimeout(t, 10*time.Second)() - // Should return error on empty YAML config. 
conf := []byte{} cache, err := NewMemcachedClient(log.NewNopLogger(), "test", conf, nil) @@ -130,8 +127,6 @@ dns_provider_update_interval: 1s } func TestMemcachedClient_SetAsync(t *testing.T) { - defer leaktest.CheckTimeout(t, 10*time.Second)() - ctx := context.Background() config := defaultMemcachedClientConfig config.Addresses = []string{"127.0.0.1:11211"} @@ -157,8 +152,6 @@ func TestMemcachedClient_SetAsync(t *testing.T) { } func TestMemcachedClient_SetAsyncWithCustomMaxItemSize(t *testing.T) { - defer leaktest.CheckTimeout(t, 10*time.Second)() - ctx := context.Background() config := defaultMemcachedClientConfig config.Addresses = []string{"127.0.0.1:11211"} @@ -185,8 +178,6 @@ func TestMemcachedClient_SetAsyncWithCustomMaxItemSize(t *testing.T) { } func TestMemcachedClient_GetMulti(t *testing.T) { - defer leaktest.CheckTimeout(t, 10*time.Second)() - tests := map[string]struct { maxBatchSize int maxConcurrency int diff --git a/pkg/cacheutil/memcached_server_selector_test.go b/pkg/cacheutil/memcached_server_selector_test.go index ab5848bb077..a827d2c1290 100644 --- a/pkg/cacheutil/memcached_server_selector_test.go +++ b/pkg/cacheutil/memcached_server_selector_test.go @@ -7,12 +7,11 @@ import ( "fmt" "net" "testing" - "time" "github.com/bradfitz/gomemcache/memcache" "github.com/facette/natsort" - "github.com/fortytw2/leaktest" "github.com/pkg/errors" + "github.com/thanos-io/thanos/pkg/testutil" ) @@ -42,8 +41,6 @@ func TestNatSort(t *testing.T) { } func TestMemcachedJumpHashSelector_PickServer(t *testing.T) { - defer leaktest.CheckTimeout(t, 10*time.Second)() - tests := []struct { addrs []string key string @@ -90,8 +87,6 @@ func TestMemcachedJumpHashSelector_PickServer(t *testing.T) { } func TestMemcachedJumpHashSelector_Each_ShouldRespectServersOrdering(t *testing.T) { - defer leaktest.CheckTimeout(t, 10*time.Second)() - tests := []struct { input []string expected []string @@ -123,8 +118,6 @@ func TestMemcachedJumpHashSelector_Each_ShouldRespectServersOrdering(t *testing. 
} func TestMemcachedJumpHashSelector_PickServer_ShouldEvenlyDistributeKeysToServers(t *testing.T) { - defer leaktest.CheckTimeout(t, 10*time.Second)() - servers := []string{"127.0.0.1:11211", "127.0.0.2:11211", "127.0.0.3:11211"} selector := MemcachedJumpHashSelector{} testutil.Ok(t, selector.SetServers(servers...)) @@ -151,8 +144,6 @@ func TestMemcachedJumpHashSelector_PickServer_ShouldEvenlyDistributeKeysToServer } func TestMemcachedJumpHashSelector_PickServer_ShouldUseConsistentHashing(t *testing.T) { - defer leaktest.CheckTimeout(t, 10*time.Second)() - servers := []string{ "127.0.0.1:11211", "127.0.0.2:11211", @@ -205,8 +196,6 @@ func TestMemcachedJumpHashSelector_PickServer_ShouldUseConsistentHashing(t *test } func TestMemcachedJumpHashSelector_PickServer_ShouldReturnErrNoServersOnNoServers(t *testing.T) { - defer leaktest.CheckTimeout(t, 10*time.Second)() - s := MemcachedJumpHashSelector{} _, err := s.PickServer("foo") testutil.Equals(t, memcache.ErrNoServers, err) diff --git a/pkg/compact/downsample/aggr_test.go b/pkg/compact/downsample/aggr_test.go index 62a1b7fe1ab..52f92b65df2 100644 --- a/pkg/compact/downsample/aggr_test.go +++ b/pkg/compact/downsample/aggr_test.go @@ -5,16 +5,13 @@ package downsample import ( "testing" - "time" - "github.com/fortytw2/leaktest" "github.com/prometheus/prometheus/tsdb/chunkenc" + "github.com/thanos-io/thanos/pkg/testutil" ) func TestAggrChunk(t *testing.T) { - defer leaktest.CheckTimeout(t, 10*time.Second)() - var input [5][]sample input[AggrCount] = []sample{{100, 30}, {200, 50}, {300, 60}, {400, 67}} diff --git a/pkg/compact/downsample/downsample_test.go b/pkg/compact/downsample/downsample_test.go index 15360a29987..bfbd32ac75e 100644 --- a/pkg/compact/downsample/downsample_test.go +++ b/pkg/compact/downsample/downsample_test.go @@ -10,9 +10,7 @@ import ( "path/filepath" "sort" "testing" - "time" - "github.com/fortytw2/leaktest" "github.com/go-kit/kit/log" "github.com/pkg/errors" "github.com/prometheus/prometheus/pkg/labels" @@ -23,13 +21,18 @@ import ( "github.com/prometheus/prometheus/tsdb/chunks" "github.com/prometheus/prometheus/tsdb/index" "github.com/prometheus/prometheus/tsdb/tombstones" + "go.uber.org/goleak" + "github.com/thanos-io/thanos/pkg/block" "github.com/thanos-io/thanos/pkg/block/metadata" "github.com/thanos-io/thanos/pkg/testutil" ) -func TestDownsampleCounterBoundaryReset(t *testing.T) { +func TestMain(m *testing.M) { + goleak.VerifyTestMain(m) +} +func TestDownsampleCounterBoundaryReset(t *testing.T) { toAggrChunks := func(t *testing.T, cm []chunks.Meta) (res []*AggrChunk) { for i := range cm { achk, ok := cm[i].Chunk.(*AggrChunk) @@ -207,8 +210,6 @@ var ( ) func TestDownsample(t *testing.T) { - defer leaktest.CheckTimeout(t, 10*time.Second)() - type downsampleTestCase struct { name string @@ -594,8 +595,6 @@ var ( ) func TestApplyCounterResetsIterator(t *testing.T) { - defer leaktest.CheckTimeout(t, 10*time.Second)() - for _, tcase := range []struct { name string diff --git a/pkg/objstore/objtesting/foreach.go b/pkg/objstore/objtesting/foreach.go index a4eb15d5211..3cc54a27e02 100644 --- a/pkg/objstore/objtesting/foreach.go +++ b/pkg/objstore/objtesting/foreach.go @@ -74,7 +74,7 @@ func ForeachStore(t *testing.T, testFn func(t *testing.T, bkt objstore.Bucket)) t.Parallel() defer closeFn() - // TODO(bwplotka): Add leaktest when https://github.com/GoogleCloudPlatform/google-cloud-go/issues/1025 is resolved. + // TODO(bwplotka): Add goleak when https://github.com/GoogleCloudPlatform/google-cloud-go/issues/1025 is resolved. 
testFn(t, bkt) }) } @@ -89,8 +89,8 @@ func ForeachStore(t *testing.T, testFn func(t *testing.T, bkt objstore.Bucket)) t.Parallel() defer closeFn() - // TODO(bwplotka): Add leaktest when we fix potential leak in minio library. - // We cannot use leaktest for detecting our own potential leaks, when leaktest detects leaks in minio itself. + // TODO(bwplotka): Add goleak when we fix potential leak in minio library. + // We cannot use goleak for detecting our own potential leaks, when goleak detects leaks in minio itself. // This needs to be investigated more. testFn(t, bkt) diff --git a/pkg/pool/pool_test.go b/pkg/pool/pool_test.go index 8bf3c302c3b..6e530ac0ef3 100644 --- a/pkg/pool/pool_test.go +++ b/pkg/pool/pool_test.go @@ -10,11 +10,16 @@ import ( "testing" "time" - "github.com/fortytw2/leaktest" "github.com/pkg/errors" + "go.uber.org/goleak" + "github.com/thanos-io/thanos/pkg/testutil" ) +func TestMain(m *testing.M) { + goleak.VerifyTestMain(m) +} + func TestBytesPool(t *testing.T) { chunkPool, err := NewBucketedBytesPool(10, 100, 2, 1000) testutil.Ok(t, err) @@ -63,7 +68,6 @@ func TestBytesPool(t *testing.T) { func TestRacePutGet(t *testing.T) { chunkPool, err := NewBucketedBytesPool(3, 100, 2, 5000) testutil.Ok(t, err) - defer leaktest.CheckTimeout(t, 10*time.Second)() s := sync.WaitGroup{} @@ -74,16 +78,15 @@ func TestRacePutGet(t *testing.T) { stop := make(chan bool, 2) f := func(txt string) { + defer s.Done() for { select { case <-stop: - s.Done() return default: c, err := chunkPool.Get(3) if err != nil { errs <- errors.Wrapf(err, "goroutine %s", txt) - s.Done() return } @@ -92,13 +95,11 @@ func TestRacePutGet(t *testing.T) { _, err = fmt.Fprintf(buf, "%s", txt) if err != nil { errs <- errors.Wrapf(err, "goroutine %s", txt) - s.Done() return } if buf.String() != txt { errs <- errors.New("expected to get the data just written") - s.Done() return } diff --git a/pkg/query/internal/test-storeset-pre-v0.8.0/storeset_test.go b/pkg/query/internal/test-storeset-pre-v0.8.0/storeset_test.go index 14b3635a804..706758957ae 100644 --- a/pkg/query/internal/test-storeset-pre-v0.8.0/storeset_test.go +++ b/pkg/query/internal/test-storeset-pre-v0.8.0/storeset_test.go @@ -9,19 +9,17 @@ import ( "math" "net" "os" + "sort" "testing" "time" + "github.com/thanos-io/thanos/pkg/component" "github.com/thanos-io/thanos/pkg/store" + "github.com/thanos-io/thanos/pkg/store/storepb" + "github.com/thanos-io/thanos/pkg/testutil" - "sort" - - "github.com/fortytw2/leaktest" "github.com/go-kit/kit/log" "github.com/go-kit/kit/log/level" - "github.com/thanos-io/thanos/pkg/component" - "github.com/thanos-io/thanos/pkg/store/storepb" - "github.com/thanos-io/thanos/pkg/testutil" "google.golang.org/grpc" "google.golang.org/grpc/codes" "google.golang.org/grpc/status" @@ -124,9 +122,11 @@ func specsFromAddrFunc(addrs []string) func() []StoreSpec { } } -func TestPre0_8_0_StoreSet_AgainstNewStoreGW(t *testing.T) { - defer leaktest.CheckTimeout(t, 10*time.Second)() +func TestMain(m *testing.M) { + testutil.TolerantVerifyLeakMain(m) +} +func TestPre0_8_0_StoreSet_AgainstNewStoreGW(t *testing.T) { st, err := startTestStores([]testStoreMeta{ { storeType: component.Sidecar, diff --git a/pkg/query/querier_test.go b/pkg/query/querier_test.go index 9ac6196c4e3..18d24f6efa6 100644 --- a/pkg/query/querier_test.go +++ b/pkg/query/querier_test.go @@ -16,7 +16,6 @@ import ( "testing" "time" - "github.com/fortytw2/leaktest" "github.com/go-kit/kit/log" "github.com/pkg/errors" "github.com/prometheus/prometheus/pkg/gate" @@ -39,7 +38,6 @@ type 
sample struct { } func TestQueryableCreator_MaxResolution(t *testing.T) { - t.Cleanup(leaktest.CheckTimeout(t, 10*time.Second)) testProxy := &storeServer{resps: []*storepb.SeriesResponse{}} queryableCreator := NewQueryableCreator(nil, nil, testProxy, 2, 5*time.Second) @@ -59,7 +57,6 @@ func TestQueryableCreator_MaxResolution(t *testing.T) { // Tests E2E how PromQL works with downsampled data. func TestQuerier_DownsampledData(t *testing.T) { - t.Cleanup(leaktest.CheckTimeout(t, 10*time.Second)) testProxy := &storeServer{ resps: []*storepb.SeriesResponse{ storeSeriesResponse(t, labels.FromStrings("__name__", "a", "zzz", "a", "aaa", "bbb"), []sample{{99, 1}, {199, 5}}), // Downsampled chunk from Store. @@ -515,8 +512,6 @@ func TestQuerier_Select(t *testing.T) { t.Run(fmt.Sprintf("dedup=%v", sc.dedup), func(t *testing.T) { t.Run("querier.Select", func(t *testing.T) { - t.Cleanup(leaktest.CheckTimeout(t, 10*time.Second)) - res := q.Select(false, tcase.hints, tcase.matchers...) testSelectResponse(t, sc.expected, res) @@ -528,8 +523,6 @@ func TestQuerier_Select(t *testing.T) { }) // Integration test: Make sure the PromQL would select exactly the same. t.Run("through PromQL with 100s step", func(t *testing.T) { - t.Cleanup(leaktest.CheckTimeout(t, 10*time.Second)) - catcher := &querierResponseCatcher{t: t, Querier: q} q, err := e.NewRangeQuery(&mockedQueryable{catcher}, tcase.equivalentQuery, timestamp.Time(tcase.mint), timestamp.Time(tcase.maxt), 100*time.Second) testutil.Ok(t, err) @@ -691,8 +684,6 @@ func TestQuerierWithDedupUnderstoodByPromQL_Rate(t *testing.T) { MaxSamples: math.MaxInt64, }) t.Run("Rate=5mStep=100s", func(t *testing.T) { - t.Cleanup(leaktest.CheckTimeout(t, 10*time.Second)) - q, err := e.NewRangeQuery(&mockedQueryable{q}, `rate(gitlab_transaction_cache_read_hit_count_total[5m])`, timestamp.Time(realSeriesWithStaleMarkerMint).Add(5*time.Minute), timestamp.Time(realSeriesWithStaleMarkerMaxt), 100*time.Second) testutil.Ok(t, err) @@ -722,8 +713,6 @@ func TestQuerierWithDedupUnderstoodByPromQL_Rate(t *testing.T) { }, vec) }) t.Run("Rate=30mStep=500s", func(t *testing.T) { - t.Cleanup(leaktest.CheckTimeout(t, 10*time.Second)) - q, err := e.NewRangeQuery(&mockedQueryable{q}, `rate(gitlab_transaction_cache_read_hit_count_total[30m])`, timestamp.Time(realSeriesWithStaleMarkerMint).Add(30*time.Minute), timestamp.Time(realSeriesWithStaleMarkerMaxt), 500*time.Second) testutil.Ok(t, err) @@ -765,8 +754,6 @@ func TestQuerierWithDedupUnderstoodByPromQL_Rate(t *testing.T) { MaxSamples: math.MaxInt64, }) t.Run("Rate=5mStep=100s", func(t *testing.T) { - t.Cleanup(leaktest.CheckTimeout(t, 10*time.Second)) - q, err := e.NewRangeQuery(&mockedQueryable{q}, `rate(gitlab_transaction_cache_read_hit_count_total[5m])`, timestamp.Time(realSeriesWithStaleMarkerMint).Add(5*time.Minute), timestamp.Time(realSeriesWithStaleMarkerMaxt), 100*time.Second) testutil.Ok(t, err) @@ -791,8 +778,6 @@ func TestQuerierWithDedupUnderstoodByPromQL_Rate(t *testing.T) { }, vec) }) t.Run("Rate=30mStep=500s", func(t *testing.T) { - t.Cleanup(leaktest.CheckTimeout(t, 10*time.Second)) - q, err := e.NewRangeQuery(&mockedQueryable{q}, `rate(gitlab_transaction_cache_read_hit_count_total[30m])`, timestamp.Time(realSeriesWithStaleMarkerMint).Add(30*time.Minute), timestamp.Time(realSeriesWithStaleMarkerMaxt), 500*time.Second) testutil.Ok(t, err) @@ -815,8 +800,6 @@ func TestQuerierWithDedupUnderstoodByPromQL_Rate(t *testing.T) { } func TestSortReplicaLabel(t *testing.T) { - t.Cleanup(leaktest.CheckTimeout(t, 10*time.Second)) - 
tests := []struct { input []storepb.Series exp []storepb.Series @@ -882,8 +865,6 @@ func expandSeries(t testing.TB, it chunkenc.Iterator) (res []sample) { } func TestDedupSeriesSet(t *testing.T) { - t.Cleanup(leaktest.CheckTimeout(t, 10*time.Second)) - tests := []struct { input []series exp []series @@ -1213,8 +1194,6 @@ func TestDedupSeriesSet(t *testing.T) { } func TestDedupSeriesIterator(t *testing.T) { - t.Cleanup(leaktest.CheckTimeout(t, 10*time.Second)) - // The deltas between timestamps should be at least 10000 to not be affected // by the initial penalty of 5000, that will cause the second iterator to seek // ahead this far at least once. diff --git a/pkg/query/query_test.go b/pkg/query/query_test.go new file mode 100644 index 00000000000..d72a6d72dde --- /dev/null +++ b/pkg/query/query_test.go @@ -0,0 +1,14 @@ +// Copyright (c) The Thanos Authors. +// Licensed under the Apache License 2.0. + +package query + +import ( + "testing" + + "github.com/thanos-io/thanos/pkg/testutil" +) + +func TestMain(m *testing.M) { + testutil.TolerantVerifyLeakMain(m) +} diff --git a/pkg/query/storeset_test.go b/pkg/query/storeset_test.go index 739c5434f74..c9eaf8a7556 100644 --- a/pkg/query/storeset_test.go +++ b/pkg/query/storeset_test.go @@ -12,7 +12,6 @@ import ( "testing" "time" - "github.com/fortytw2/leaktest" "github.com/pkg/errors" "google.golang.org/grpc" "google.golang.org/grpc/codes" @@ -131,8 +130,6 @@ func (s *testStores) CloseOne(addr string) { } func TestStoreSet_Update(t *testing.T) { - defer leaktest.CheckTimeout(t, 10*time.Second)() - stores, err := startTestStores([]testStoreMeta{ { storeType: component.Sidecar, @@ -500,8 +497,6 @@ func TestStoreSet_Update(t *testing.T) { } func TestStoreSet_Update_NoneAvailable(t *testing.T) { - defer leaktest.CheckTimeout(t, 10*time.Second)() - st, err := startTestStores([]testStoreMeta{ { extlsetFn: func(addr string) []storepb.LabelSet { @@ -565,8 +560,6 @@ func TestStoreSet_Update_NoneAvailable(t *testing.T) { // TestQuerierStrict tests what happens when the strict mode is enabled/disabled. func TestQuerierStrict(t *testing.T) { - defer leaktest.CheckTimeout(t, 5*time.Second)() - st, err := startTestStores([]testStoreMeta{ { minTime: 12345, @@ -767,6 +760,7 @@ func TestStoreSet_Update_Rules(t *testing.T) { testGRPCOpts, time.Minute) t.Run(tc.name, func(t *testing.T) { + defer storeSet.Close() storeSet.Update(context.Background()) testutil.Equals(t, tc.expectedStores, len(storeSet.stores)) diff --git a/pkg/receive/config.go b/pkg/receive/config.go index f03ffbfced6..1eb198cace6 100644 --- a/pkg/receive/config.go +++ b/pkg/receive/config.go @@ -126,7 +126,7 @@ func NewConfigWatcher(logger log.Logger, reg prometheus.Registerer, path string, // Run starts the ConfigWatcher until the given context is canceled. func (cw *ConfigWatcher) Run(ctx context.Context) { - defer cw.stop() + defer cw.Stop() cw.refresh(ctx) @@ -238,8 +238,8 @@ func (cw *ConfigWatcher) refresh(ctx context.Context) { } } -// stop shuts down the config watcher. -func (cw *ConfigWatcher) stop() { +// Stop shuts down the config watcher. 
+func (cw *ConfigWatcher) Stop() { level.Debug(cw.logger).Log("msg", "stopping hashring configuration watcher...", "path", cw.path) done := make(chan struct{}) diff --git a/pkg/receive/config_test.go b/pkg/receive/config_test.go index 1fca1c8ab85..bf5bf05c0ea 100644 --- a/pkg/receive/config_test.go +++ b/pkg/receive/config_test.go @@ -10,6 +10,8 @@ import ( "testing" "github.com/pkg/errors" + + "github.com/thanos-io/thanos/pkg/testutil" ) func TestValidateConfig(t *testing.T) { @@ -43,34 +45,30 @@ func TestValidateConfig(t *testing.T) { err: nil, // means it's valid. }, } { - var content []byte - var err error - if content, err = json.Marshal(tc.cfg); err != nil { - t.Error(err) - } + t.Run(tc.name, func(t *testing.T) { + content, err := json.Marshal(tc.cfg) + testutil.Ok(t, err) + + tmpfile, err := ioutil.TempFile("", "configwatcher_test.*.json") + testutil.Ok(t, err) - tmpfile, err := ioutil.TempFile("", "configwatcher_test.*.json") - if err != nil { - t.Fatalf("case %q: unexpectedly failed creating the temp file: %v", tc.name, err) - } - defer os.Remove(tmpfile.Name()) + defer func() { + testutil.Ok(t, os.Remove(tmpfile.Name())) + }() - if _, err := tmpfile.Write(content); err != nil { - t.Fatalf("case %q: unexpectedly failed writing to the temp file: %v", tc.name, err) - } + _, err = tmpfile.Write(content) + testutil.Ok(t, err) - if err := tmpfile.Close(); err != nil { - t.Fatalf("case %q: unexpectedly failed closing the temp file: %v", tc.name, err) - } + err = tmpfile.Close() + testutil.Ok(t, err) - cw, err := NewConfigWatcher(nil, nil, tmpfile.Name(), 1) - if err != nil { - t.Fatalf("case %q: unexpectedly failed creating config watcher: %v", tc.name, err) - } + cw, err := NewConfigWatcher(nil, nil, tmpfile.Name(), 1) + testutil.Ok(t, err) + defer cw.Stop() - if err := cw.ValidateConfig(); err != nil && !errors.Is(err, tc.err) { - t.Errorf("case %q: got unexpected error: %v", tc.name, err) - continue - } + if err := cw.ValidateConfig(); err != nil && !errors.Is(err, tc.err) { + t.Errorf("case %q: got unexpected error: %v", tc.name, err) + } + }) } } diff --git a/pkg/receive/handler_test.go b/pkg/receive/handler_test.go index 7cefb844ff1..efd12ce7ce2 100644 --- a/pkg/receive/handler_test.go +++ b/pkg/receive/handler_test.go @@ -15,7 +15,6 @@ import ( "testing" "time" - "github.com/fortytw2/leaktest" "github.com/go-kit/kit/log" "github.com/gogo/protobuf/proto" "github.com/golang/snappy" @@ -23,9 +22,10 @@ import ( "github.com/prometheus/prometheus/pkg/labels" "github.com/prometheus/prometheus/storage" terrors "github.com/prometheus/prometheus/tsdb/errors" + "google.golang.org/grpc" + "github.com/thanos-io/thanos/pkg/store/storepb" "github.com/thanos-io/thanos/pkg/store/storepb/prompb" - "google.golang.org/grpc" ) func TestCountCause(t *testing.T) { @@ -183,7 +183,6 @@ func newHandlerHashring(appendables []*fakeAppendable, replicationFactor uint64) } func TestReceiveQuorum(t *testing.T) { - defer leaktest.CheckTimeout(t, 10*time.Second)() appenderErrFn := func() error { return errors.New("failed to get appender") } conflictErrFn := func() error { return storage.ErrOutOfBounds } commitErrFn := func() error { return errors.New("failed to commit") } @@ -520,7 +519,6 @@ func TestReceiveQuorum(t *testing.T) { } func TestReceiveWithConsistencyDelay(t *testing.T) { - defer leaktest.CheckTimeout(t, 10*time.Second)() appenderErrFn := func() error { return errors.New("failed to get appender") } conflictErrFn := func() error { return storage.ErrOutOfBounds } commitErrFn := func() error { return 
errors.New("failed to commit") } diff --git a/pkg/receive/multitsdb_test.go b/pkg/receive/multitsdb_test.go index ff21ac788c7..eb3274e6af4 100644 --- a/pkg/receive/multitsdb_test.go +++ b/pkg/receive/multitsdb_test.go @@ -10,21 +10,20 @@ import ( "testing" "time" - "github.com/fortytw2/leaktest" "github.com/go-kit/kit/log" "github.com/gogo/protobuf/types" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/prometheus/pkg/labels" "github.com/prometheus/prometheus/storage" "github.com/prometheus/prometheus/tsdb" + "golang.org/x/sync/errgroup" + "github.com/thanos-io/thanos/pkg/runutil" "github.com/thanos-io/thanos/pkg/store/storepb" "github.com/thanos-io/thanos/pkg/testutil" - "golang.org/x/sync/errgroup" ) func TestMultiTSDB(t *testing.T) { - defer leaktest.CheckTimeout(t, 10*time.Second)() dir, err := ioutil.TempDir("", "test") testutil.Ok(t, err) defer func() { testutil.Ok(t, os.RemoveAll(dir)) }() diff --git a/pkg/receive/receive_test.go b/pkg/receive/receive_test.go new file mode 100644 index 00000000000..8bc7dfa0034 --- /dev/null +++ b/pkg/receive/receive_test.go @@ -0,0 +1,14 @@ +// Copyright (c) The Thanos Authors. +// Licensed under the Apache License 2.0. + +package receive + +import ( + "testing" + + "github.com/thanos-io/thanos/pkg/testutil" +) + +func TestMain(m *testing.M) { + testutil.TolerantVerifyLeakMain(m) +} diff --git a/pkg/reloader/reloader_test.go b/pkg/reloader/reloader_test.go index 5d4cf01f551..952ed47e01e 100644 --- a/pkg/reloader/reloader_test.go +++ b/pkg/reloader/reloader_test.go @@ -18,14 +18,17 @@ import ( "testing" "time" - "github.com/fortytw2/leaktest" - "github.com/thanos-io/thanos/pkg/testutil" "go.uber.org/atomic" + "go.uber.org/goleak" + + "github.com/thanos-io/thanos/pkg/testutil" ) -func TestReloader_ConfigApply(t *testing.T) { - defer leaktest.CheckTimeout(t, 10*time.Second)() +func TestMain(m *testing.M) { + goleak.VerifyTestMain(m) +} +func TestReloader_ConfigApply(t *testing.T) { ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute) defer cancel() @@ -159,8 +162,6 @@ config: } func TestReloader_RuleApply(t *testing.T) { - defer leaktest.CheckTimeout(t, 10*time.Second)() - l, err := net.Listen("tcp", "localhost:0") testutil.Ok(t, err) diff --git a/pkg/rules/manager_test.go b/pkg/rules/manager_test.go index 5cebfd2d902..231c33a9b38 100644 --- a/pkg/rules/manager_test.go +++ b/pkg/rules/manager_test.go @@ -14,15 +14,15 @@ import ( "testing" "time" - "github.com/fortytw2/leaktest" "github.com/go-kit/kit/log" "github.com/prometheus/prometheus/pkg/labels" "github.com/prometheus/prometheus/promql" "github.com/prometheus/prometheus/rules" "github.com/prometheus/prometheus/storage" + "gopkg.in/yaml.v3" + "github.com/thanos-io/thanos/pkg/store/storepb" "github.com/thanos-io/thanos/pkg/testutil" - "gopkg.in/yaml.v3" ) type nopAppendable struct{} @@ -249,6 +249,12 @@ groups: testutil.Equals(t, exp[i].file, p.File) }) } + defer func() { + // Update creates go routines. We don't need rules mngrs to run, just to parse things, but let it start and stop + // at the end to correctly test leaked go routines. 
+ thanosRuleMgr.Run() + thanosRuleMgr.Stop() + }() } func TestConfigRuleAdapterUnmarshalMarshalYAML(t *testing.T) { @@ -280,8 +286,6 @@ func TestConfigRuleAdapterUnmarshalMarshalYAML(t *testing.T) { } func TestManager_Rules(t *testing.T) { - defer leaktest.CheckTimeout(t, 10*time.Second)() - dir, err := ioutil.TempDir("", "test_rule_run") testutil.Ok(t, err) defer func() { testutil.Ok(t, os.RemoveAll(dir)) }() diff --git a/pkg/rules/prometheus_test.go b/pkg/rules/prometheus_test.go index 49b7134d09c..726313fe366 100644 --- a/pkg/rules/prometheus_test.go +++ b/pkg/rules/prometheus_test.go @@ -9,18 +9,15 @@ import ( "os" "path/filepath" "testing" - "time" - "github.com/fortytw2/leaktest" "github.com/prometheus/prometheus/pkg/labels" + "github.com/thanos-io/thanos/pkg/promclient" "github.com/thanos-io/thanos/pkg/testutil" "github.com/thanos-io/thanos/pkg/testutil/e2eutil" ) func TestPrometheus_Rules_e2e(t *testing.T) { - defer leaktest.CheckTimeout(t, 10*time.Second)() - p, err := e2eutil.NewPrometheus() testutil.Ok(t, err) defer func() { testutil.Ok(t, p.Stop()) }() diff --git a/pkg/rules/rules_test.go b/pkg/rules/rules_test.go index e3c7f580a1c..e5178eddec8 100644 --- a/pkg/rules/rules_test.go +++ b/pkg/rules/rules_test.go @@ -18,6 +18,10 @@ import ( "github.com/thanos-io/thanos/pkg/testutil" ) +func TestMain(m *testing.M) { + testutil.TolerantVerifyLeakMain(m) +} + // testRulesAgainstExamples tests against alerts.yaml and rules.yaml examples. func testRulesAgainstExamples(t *testing.T, dir string, server rulespb.RulesServer) { t.Helper() diff --git a/pkg/store/bucket_test.go b/pkg/store/bucket_test.go index 7ea52149125..521f33adbc0 100644 --- a/pkg/store/bucket_test.go +++ b/pkg/store/bucket_test.go @@ -20,9 +20,7 @@ import ( "strconv" "sync" "testing" - "time" - "github.com/fortytw2/leaktest" "github.com/go-kit/kit/log" "github.com/gogo/protobuf/proto" "github.com/gogo/protobuf/types" @@ -36,7 +34,6 @@ import ( "github.com/prometheus/prometheus/storage" "github.com/prometheus/prometheus/tsdb" "github.com/prometheus/prometheus/tsdb/encoding" - "go.uber.org/atomic" "github.com/thanos-io/thanos/pkg/block" @@ -187,7 +184,7 @@ func TestBucketBlock_Property(t *testing.T) { } func TestBucketBlock_matchLabels(t *testing.T) { - defer leaktest.CheckTimeout(t, 10*time.Second)() + defer testutil.TolerantVerifyLeak(t) dir, err := ioutil.TempDir("", "bucketblock-test") testutil.Ok(t, err) @@ -285,7 +282,7 @@ func TestBucketBlock_matchLabels(t *testing.T) { } func TestBucketBlockSet_addGet(t *testing.T) { - defer leaktest.CheckTimeout(t, 10*time.Second)() + defer testutil.TolerantVerifyLeak(t) set := newBucketBlockSet(labels.Labels{}) @@ -396,7 +393,7 @@ func TestBucketBlockSet_addGet(t *testing.T) { } func TestBucketBlockSet_remove(t *testing.T) { - defer leaktest.CheckTimeout(t, 10*time.Second)() + defer testutil.TolerantVerifyLeak(t) set := newBucketBlockSet(labels.Labels{}) @@ -426,7 +423,7 @@ func TestBucketBlockSet_remove(t *testing.T) { } func TestBucketBlockSet_labelMatchers(t *testing.T) { - defer leaktest.CheckTimeout(t, 10*time.Second)() + defer testutil.TolerantVerifyLeak(t) set := newBucketBlockSet(labels.FromStrings("a", "b", "c", "d")) @@ -495,7 +492,7 @@ func TestBucketBlockSet_labelMatchers(t *testing.T) { } func TestGapBasedPartitioner_Partition(t *testing.T) { - defer leaktest.CheckTimeout(t, 10*time.Second)() + defer testutil.TolerantVerifyLeak(t) const maxGapSize = 1024 * 512 @@ -555,7 +552,7 @@ func TestGapBasedPartitioner_Partition(t *testing.T) { } func TestBucketStore_Info(t 
*testing.T) { - defer leaktest.CheckTimeout(t, 10*time.Second)() + defer testutil.TolerantVerifyLeak(t) ctx, cancel := context.WithCancel(context.Background()) defer cancel() @@ -1735,6 +1732,8 @@ func TestBigEndianPostingsCount(t *testing.T) { } func TestBlockWithLargeChunks(t *testing.T) { + defer testutil.TolerantVerifyLeak(t) + tmpDir, err := ioutil.TempDir(os.TempDir(), "large-chunk-test") testutil.Ok(t, err) t.Cleanup(func() { @@ -1830,10 +1829,16 @@ func createBlockWithLargeChunk(t testutil.TB, dir string, lbls labels.Labels, ra } db, err := tsdb.Open(dir, nil, nil, tsdb.DefaultOptions()) + defer func() { + testutil.Ok(t, db.Close()) + }() testutil.Ok(t, err) bs := db.Blocks() testutil.Equals(t, 1, len(bs)) cr, err := bs[0].Chunks() + defer func() { + testutil.Ok(t, cr.Close()) + }() testutil.Ok(t, err) // Ref is ( << 32 + offset in the file). In TSDB v1 first chunk is always at offset 8. c, err := cr.Chunk(8) diff --git a/pkg/store/cache/cache_test.go b/pkg/store/cache/cache_test.go index 670fd4d1333..9b59f75b95b 100644 --- a/pkg/store/cache/cache_test.go +++ b/pkg/store/cache/cache_test.go @@ -13,9 +13,14 @@ import ( "github.com/oklog/ulid" "github.com/prometheus/prometheus/pkg/labels" "github.com/thanos-io/thanos/pkg/testutil" + "go.uber.org/goleak" "golang.org/x/crypto/blake2b" ) +func TestMain(m *testing.M) { + goleak.VerifyTestMain(m) +} + func TestCacheKey_string(t *testing.T) { t.Parallel() diff --git a/pkg/store/cache/inmemory_test.go b/pkg/store/cache/inmemory_test.go index c10cb2dcf6b..119a5a32c4a 100644 --- a/pkg/store/cache/inmemory_test.go +++ b/pkg/store/cache/inmemory_test.go @@ -10,9 +10,7 @@ import ( "fmt" "math" "testing" - "time" - "github.com/fortytw2/leaktest" "github.com/go-kit/kit/log" "github.com/hashicorp/golang-lru/simplelru" "github.com/oklog/ulid" @@ -23,8 +21,6 @@ import ( ) func TestNewInMemoryIndexCache(t *testing.T) { - defer leaktest.CheckTimeout(t, 10*time.Second)() - // Should return error on invalid YAML config. conf := []byte("invalid") cache, err := NewInMemoryIndexCache(log.NewNopLogger(), nil, conf) @@ -51,8 +47,6 @@ max_item_size: 2KB } func TestInMemoryIndexCache_AvoidsDeadlock(t *testing.T) { - defer leaktest.CheckTimeout(t, 10*time.Second)() - metrics := prometheus.NewRegistry() cache, err := NewInMemoryIndexCacheWithConfig(log.NewNopLogger(), metrics, InMemoryIndexCacheConfig{ MaxItemSize: sliceHeaderSize + 5, @@ -85,8 +79,6 @@ func TestInMemoryIndexCache_AvoidsDeadlock(t *testing.T) { } func TestInMemoryIndexCache_UpdateItem(t *testing.T) { - defer leaktest.CheckTimeout(t, 10*time.Second)() - const maxSize = 2 * (sliceHeaderSize + 1) var errorLogs []string @@ -190,8 +182,6 @@ func TestInMemoryIndexCache_UpdateItem(t *testing.T) { // This should not happen as we hardcode math.MaxInt, but we still add test to check this out. 
func TestInMemoryIndexCache_MaxNumberOfItemsHit(t *testing.T) { - defer leaktest.CheckTimeout(t, 10*time.Second)() - metrics := prometheus.NewRegistry() cache, err := NewInMemoryIndexCacheWithConfig(log.NewNopLogger(), metrics, InMemoryIndexCacheConfig{ MaxItemSize: 2*sliceHeaderSize + 10, @@ -224,8 +214,6 @@ func TestInMemoryIndexCache_MaxNumberOfItemsHit(t *testing.T) { } func TestInMemoryIndexCache_Eviction_WithMetrics(t *testing.T) { - defer leaktest.CheckTimeout(t, 10*time.Second)() - metrics := prometheus.NewRegistry() cache, err := NewInMemoryIndexCacheWithConfig(log.NewNopLogger(), metrics, InMemoryIndexCacheConfig{ MaxItemSize: 2*sliceHeaderSize + 5, diff --git a/pkg/store/cache/memcached_test.go b/pkg/store/cache/memcached_test.go index 7710a2d543a..5daacc4be0f 100644 --- a/pkg/store/cache/memcached_test.go +++ b/pkg/store/cache/memcached_test.go @@ -8,7 +8,6 @@ import ( "testing" "time" - "github.com/fortytw2/leaktest" "github.com/go-kit/kit/log" "github.com/oklog/ulid" "github.com/pkg/errors" @@ -110,7 +109,6 @@ func TestMemcachedIndexCache_FetchMultiPostings(t *testing.T) { func TestMemcachedIndexCache_FetchMultiSeries(t *testing.T) { t.Parallel() - defer leaktest.CheckTimeout(t, 10*time.Second)() // Init some data to conveniently define test cases later one. block1 := ulid.MustNew(1, nil) diff --git a/pkg/store/multitsdb_test.go b/pkg/store/multitsdb_test.go index b4fa2767aa0..0c04490dc5d 100644 --- a/pkg/store/multitsdb_test.go +++ b/pkg/store/multitsdb_test.go @@ -12,12 +12,11 @@ import ( "os" "path/filepath" "testing" - "time" - "github.com/fortytw2/leaktest" "github.com/go-kit/kit/log" "github.com/prometheus/prometheus/pkg/labels" "github.com/prometheus/prometheus/tsdb" + "github.com/thanos-io/thanos/pkg/component" "github.com/thanos-io/thanos/pkg/store/storepb" storetestutil "github.com/thanos-io/thanos/pkg/store/storepb/testutil" @@ -25,7 +24,7 @@ import ( ) func TestMultiTSDBSeries(t *testing.T) { - defer leaktest.CheckTimeout(t, 10*time.Second)() + defer testutil.TolerantVerifyLeak(t) tb := testutil.NewTB(t) storetestutil.RunSeriesInterestingCases(tb, 200e3, 200e3, func(t testutil.TB, samplesPerSeries, series int) { @@ -178,9 +177,9 @@ func (m *mockedStoreServer) Series(_ *storepb.SeriesRequest, server storepb.Stor // Regression test against https://github.com/thanos-io/thanos/issues/2823. func TestTenantSeriesSetServert_NotLeakingIfNotExhausted(t *testing.T) { - t.Run("exhausted StoreSet", func(t *testing.T) { - defer leaktest.CheckTimeout(t, 10*time.Second)() + defer testutil.TolerantVerifyLeak(t) + t.Run("exhausted StoreSet", func(t *testing.T) { s := newTenantSeriesSetServer(context.Background(), "a", nil) resps := []*storepb.SeriesResponse{ @@ -210,7 +209,7 @@ func TestTenantSeriesSetServert_NotLeakingIfNotExhausted(t *testing.T) { }) t.Run("canceled, not exhausted StoreSet", func(t *testing.T) { - defer leaktest.CheckTimeout(t, 10*time.Second)() + defer testutil.TolerantVerifyLeak(t) ctx, cancel := context.WithCancel(context.Background()) s := newTenantSeriesSetServer(ctx, "a", nil) @@ -245,7 +244,7 @@ func (s *mockedSeriesServer) Context() context.Context { return s.ctx } // Regression test against https://github.com/thanos-io/thanos/issues/2823. // This is different leak than in TestTenantSeriesSetServert_NotLeakingIfNotExhausted. 
func TestMultiTSDBStore_NotLeakingOnPrematureFinish(t *testing.T) { - defer leaktest.CheckTimeout(t, 10*time.Second)() + defer testutil.TolerantVerifyLeak(t) m := NewMultiTSDBStore(log.NewNopLogger(), nil, component.Receive, func() map[string]storepb.StoreServer { return map[string]storepb.StoreServer{ diff --git a/pkg/store/prometheus_test.go b/pkg/store/prometheus_test.go index d8ee68ce024..dd853e9b908 100644 --- a/pkg/store/prometheus_test.go +++ b/pkg/store/prometheus_test.go @@ -11,12 +11,12 @@ import ( "testing" "time" - "github.com/fortytw2/leaktest" "github.com/pkg/errors" "github.com/prometheus/prometheus/pkg/labels" "github.com/prometheus/prometheus/pkg/timestamp" "github.com/prometheus/prometheus/storage" "github.com/prometheus/prometheus/tsdb/chunkenc" + "github.com/thanos-io/thanos/pkg/component" "github.com/thanos-io/thanos/pkg/promclient" "github.com/thanos-io/thanos/pkg/store/storepb" @@ -37,7 +37,7 @@ func TestPrometheusStore_Series_promOnPath_e2e(t *testing.T) { func testPrometheusStoreSeriesE2e(t *testing.T, prefix string) { t.Helper() - defer leaktest.CheckTimeout(t, 10*time.Second)() + defer testutil.TolerantVerifyLeak(t) p, err := e2eutil.NewPrometheusOnPath(prefix) testutil.Ok(t, err) @@ -171,7 +171,7 @@ func getExternalLabels() labels.Labels { func TestPrometheusStore_SeriesLabels_e2e(t *testing.T) { t.Helper() - defer leaktest.CheckTimeout(t, 10*time.Second)() + defer testutil.TolerantVerifyLeak(t) p, err := e2eutil.NewPrometheus() testutil.Ok(t, err) @@ -351,8 +351,9 @@ func TestPrometheusStore_SeriesLabels_e2e(t *testing.T) { }) } } + func TestPrometheusStore_LabelNames_e2e(t *testing.T) { - defer leaktest.CheckTimeout(t, 10*time.Second)() + defer testutil.TolerantVerifyLeak(t) p, err := e2eutil.NewPrometheus() testutil.Ok(t, err) @@ -397,7 +398,7 @@ func TestPrometheusStore_LabelNames_e2e(t *testing.T) { } func TestPrometheusStore_LabelValues_e2e(t *testing.T) { - defer leaktest.CheckTimeout(t, 10*time.Second)() + defer testutil.TolerantVerifyLeak(t) p, err := e2eutil.NewPrometheus() testutil.Ok(t, err) @@ -445,7 +446,7 @@ func TestPrometheusStore_LabelValues_e2e(t *testing.T) { // Test to check external label values retrieve. func TestPrometheusStore_ExternalLabelValues_e2e(t *testing.T) { - defer leaktest.CheckTimeout(t, 10*time.Second)() + defer testutil.TolerantVerifyLeak(t) p, err := e2eutil.NewPrometheus() testutil.Ok(t, err) @@ -485,7 +486,7 @@ func TestPrometheusStore_ExternalLabelValues_e2e(t *testing.T) { } func TestPrometheusStore_Series_MatchExternalLabel_e2e(t *testing.T) { - defer leaktest.CheckTimeout(t, 10*time.Second)() + defer testutil.TolerantVerifyLeak(t) p, err := e2eutil.NewPrometheus() testutil.Ok(t, err) @@ -550,7 +551,7 @@ func TestPrometheusStore_Series_MatchExternalLabel_e2e(t *testing.T) { } func TestPrometheusStore_Info(t *testing.T) { - defer leaktest.CheckTimeout(t, 10*time.Second)() + defer testutil.TolerantVerifyLeak(t) ctx, cancel := context.WithCancel(context.Background()) defer cancel() @@ -621,7 +622,7 @@ func testSeries_SplitSamplesIntoChunksWithMaxSizeOf120(t *testing.T, appender st // Regression test for https://github.com/thanos-io/thanos/issues/396. 
func TestPrometheusStore_Series_SplitSamplesIntoChunksWithMaxSizeOf120(t *testing.T) { - defer leaktest.CheckTimeout(t, 10*time.Second)() + defer testutil.TolerantVerifyLeak(t) p, err := e2eutil.NewPrometheus() testutil.Ok(t, err) @@ -644,11 +645,3 @@ func TestPrometheusStore_Series_SplitSamplesIntoChunksWithMaxSizeOf120(t *testin return proxy }) } - -func TestRuleGroupToProto(t *testing.T) { - -} - -func TestRuleGroupFromProto(t *testing.T) { - -} diff --git a/pkg/store/proxy_test.go b/pkg/store/proxy_test.go index 3da59567734..ff53f79bfda 100644 --- a/pkg/store/proxy_test.go +++ b/pkg/store/proxy_test.go @@ -15,7 +15,6 @@ import ( "testing" "time" - "github.com/fortytw2/leaktest" "github.com/go-kit/kit/log" "github.com/gogo/protobuf/proto" "github.com/gogo/protobuf/types" @@ -23,13 +22,14 @@ import ( "github.com/prometheus/prometheus/pkg/labels" "github.com/prometheus/prometheus/pkg/timestamp" "github.com/prometheus/prometheus/tsdb/chunkenc" + "google.golang.org/grpc" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/status" + "github.com/thanos-io/thanos/pkg/component" "github.com/thanos-io/thanos/pkg/store/storepb" storetestutil "github.com/thanos-io/thanos/pkg/store/storepb/testutil" "github.com/thanos-io/thanos/pkg/testutil" - "google.golang.org/grpc" - "google.golang.org/grpc/codes" - "google.golang.org/grpc/status" ) type testClient struct { @@ -58,7 +58,7 @@ func (c testClient) Addr() string { } func TestProxyStore_Info(t *testing.T) { - defer leaktest.CheckTimeout(t, 10*time.Second)() + defer testutil.TolerantVerifyLeak(t) ctx, cancel := context.WithCancel(context.Background()) defer cancel() @@ -79,7 +79,7 @@ func TestProxyStore_Info(t *testing.T) { } func TestProxyStore_Series(t *testing.T) { - defer leaktest.CheckTimeout(t, 10*time.Second)() + defer testutil.TolerantVerifyLeak(t) for _, tc := range []struct { title string @@ -452,7 +452,7 @@ func TestProxyStore_SeriesSlowStores(t *testing.T) { t.Skip("enable THANOS_ENABLE_STORE_READ_TIMEOUT_TESTS to run store-read-timeout tests") } - defer leaktest.CheckTimeout(t, 20*time.Second)() + defer testutil.TolerantVerifyLeak(t) for _, tc := range []struct { title string @@ -973,7 +973,7 @@ func TestProxyStore_SeriesSlowStores(t *testing.T) { } func TestProxyStore_Series_RequestParamsProxied(t *testing.T) { - defer leaktest.CheckTimeout(t, 10*time.Second)() + defer testutil.TolerantVerifyLeak(t) m := &mockedStoreAPI{ RespSeries: []*storepb.SeriesResponse{ @@ -1016,7 +1016,7 @@ func TestProxyStore_Series_RequestParamsProxied(t *testing.T) { } func TestProxyStore_Series_RegressionFillResponseChannel(t *testing.T) { - defer leaktest.CheckTimeout(t, 10*time.Second)() + defer testutil.TolerantVerifyLeak(t) var cls []Client for i := 0; i < 10; i++ { @@ -1071,7 +1071,7 @@ func TestProxyStore_Series_RegressionFillResponseChannel(t *testing.T) { } func TestProxyStore_LabelValues(t *testing.T) { - defer leaktest.CheckTimeout(t, 10*time.Second)() + defer testutil.TolerantVerifyLeak(t) m1 := &mockedStoreAPI{ RespLabelValues: &storepb.LabelValuesResponse{ @@ -1111,7 +1111,7 @@ func TestProxyStore_LabelValues(t *testing.T) { } func TestProxyStore_LabelNames(t *testing.T) { - defer leaktest.CheckTimeout(t, 10*time.Second)() + defer testutil.TolerantVerifyLeak(t) for _, tc := range []struct { title string @@ -1225,6 +1225,8 @@ func TestProxyStore_LabelNames(t *testing.T) { } func TestProxyStore_storeMatch(t *testing.T) { + defer testutil.TolerantVerifyLeak(t) + storeAPIs := []Client{ &testClient{ StoreClient: &mockedStoreAPI{ @@ -1310,8 
+1312,6 @@ func seriesEquals(t *testing.T, expected []rawSeries, got []storepb.Series) { } func TestStoreMatches(t *testing.T) { - defer leaktest.CheckTimeout(t, 10*time.Second)() - cases := []struct { s Client mint, maxt int64 @@ -1718,7 +1718,7 @@ func benchProxySeries(t testutil.TB, totalSamples, totalSeries int) { } func TestProxyStore_NotLeakingOnPrematureFinish(t *testing.T) { - defer leaktest.CheckTimeout(t, 10*time.Second)() + defer testutil.TolerantVerifyLeak(t) clients := []Client{ &testClient{ diff --git a/pkg/store/tsdb_test.go b/pkg/store/tsdb_test.go index 468d9cec032..cf4413492d5 100644 --- a/pkg/store/tsdb_test.go +++ b/pkg/store/tsdb_test.go @@ -9,9 +9,9 @@ import ( "testing" "time" - "github.com/fortytw2/leaktest" "github.com/prometheus/prometheus/pkg/labels" "github.com/prometheus/prometheus/pkg/timestamp" + "github.com/thanos-io/thanos/pkg/component" "github.com/thanos-io/thanos/pkg/store/storepb" "github.com/thanos-io/thanos/pkg/testutil" @@ -19,7 +19,7 @@ import ( ) func TestTSDBStore_Info(t *testing.T) { - defer leaktest.CheckTimeout(t, 10*time.Second)() + defer testutil.TolerantVerifyLeak(t) ctx, cancel := context.WithCancel(context.Background()) defer cancel() @@ -53,7 +53,7 @@ func TestTSDBStore_Info(t *testing.T) { } func TestTSDBStore_Series(t *testing.T) { - defer leaktest.CheckTimeout(t, 10*time.Second)() + defer testutil.TolerantVerifyLeak(t) ctx, cancel := context.WithCancel(context.Background()) defer cancel() @@ -178,9 +178,9 @@ func TestTSDBStore_Series(t *testing.T) { } func TestTSDBStore_LabelNames(t *testing.T) { - var err error - defer leaktest.CheckTimeout(t, 10*time.Second)() + defer testutil.TolerantVerifyLeak(t) + var err error ctx, cancel := context.WithCancel(context.Background()) defer cancel() @@ -284,9 +284,9 @@ func TestTSDBStore_LabelNames(t *testing.T) { } func TestTSDBStore_LabelValues(t *testing.T) { - var err error - defer leaktest.CheckTimeout(t, 10*time.Second)() + defer testutil.TolerantVerifyLeak(t) + var err error ctx, cancel := context.WithCancel(context.Background()) defer cancel() @@ -383,7 +383,7 @@ func TestTSDBStore_LabelValues(t *testing.T) { // Regression test for https://github.com/thanos-io/thanos/issues/1038. func TestTSDBStore_Series_SplitSamplesIntoChunksWithMaxSizeOf120(t *testing.T) { - defer leaktest.CheckTimeout(t, 10*time.Second)() + defer testutil.TolerantVerifyLeak(t) db, err := e2eutil.NewTSDB() defer func() { testutil.Ok(t, db.Close()) }() diff --git a/pkg/testutil/testutil.go b/pkg/testutil/testutil.go index 8b1e8e1d4f1..d98e02f60f1 100644 --- a/pkg/testutil/testutil.go +++ b/pkg/testutil/testutil.go @@ -14,6 +14,7 @@ import ( "github.com/pmezard/go-difflib/difflib" "github.com/prometheus/client_golang/prometheus" dto "github.com/prometheus/client_model/go" + "go.uber.org/goleak" ) // Assert fails the test if the condition is false. @@ -152,3 +153,27 @@ func GatherAndCompare(t *testing.T, g1 prometheus.Gatherer, g2 prometheus.Gather } Equals(t, m1.String(), m2.String()) } + +// TolerantVerifyLeakMain verifies go leaks but excludes the go routines that are +// launched as side effects of some of our dependencies. 
+func TolerantVerifyLeakMain(m *testing.M) { + goleak.VerifyTestMain(m, + // https://github.com/census-instrumentation/opencensus-go/blob/d7677d6af5953e0506ac4c08f349c62b917a443a/stats/view/worker.go#L34 + goleak.IgnoreTopFunction("go.opencensus.io/stats/view.(*worker).start"), + // https://github.com/kubernetes/klog/blob/c85d02d1c76a9ebafa81eb6d35c980734f2c4727/klog.go#L417 + goleak.IgnoreTopFunction("k8s.io/klog/v2.(*loggingT).flushDaemon"), + goleak.IgnoreTopFunction("k8s.io/klog.(*loggingT).flushDaemon"), + ) +} + +// TolerantVerifyLeak verifies go leaks but excludes the go routines that are +// launched as side effects of some of our dependencies. +func TolerantVerifyLeak(t *testing.T) { + goleak.VerifyNone(t, + // https://github.com/census-instrumentation/opencensus-go/blob/d7677d6af5953e0506ac4c08f349c62b917a443a/stats/view/worker.go#L34 + goleak.IgnoreTopFunction("go.opencensus.io/stats/view.(*worker).start"), + // https://github.com/kubernetes/klog/blob/c85d02d1c76a9ebafa81eb6d35c980734f2c4727/klog.go#L417 + goleak.IgnoreTopFunction("k8s.io/klog/v2.(*loggingT).flushDaemon"), + goleak.IgnoreTopFunction("k8s.io/klog.(*loggingT).flushDaemon"), + ) +} diff --git a/pkg/tracing/stackdriver/tracer_test.go b/pkg/tracing/stackdriver/tracer_test.go index b0d4790475e..f869e090fdd 100644 --- a/pkg/tracing/stackdriver/tracer_test.go +++ b/pkg/tracing/stackdriver/tracer_test.go @@ -9,20 +9,20 @@ package stackdriver import ( "context" "testing" - "time" "github.com/thanos-io/thanos/pkg/testutil" "github.com/thanos-io/thanos/pkg/tracing" - "github.com/fortytw2/leaktest" "github.com/opentracing/basictracer-go" ) +func TestMain(m *testing.M) { + testutil.TolerantVerifyLeakMain(m) +} + // This test shows that if sample factor will enable tracing on client process, even when it would be disabled on server // it will be still enabled for all spans within this span. func TestContextTracing_ClientEnablesTracing(t *testing.T) { - defer leaktest.CheckTimeout(t, 10*time.Second)() - m := &basictracer.InMemorySpanRecorder{} r := &forceRecorder{wrapped: m}