diff --git a/cmd/pint/scan.go b/cmd/pint/scan.go index f05b7cff..87fa4de8 100644 --- a/cmd/pint/scan.go +++ b/cmd/pint/scan.go @@ -20,6 +20,8 @@ import ( var yamlErrRe = regexp.MustCompile("^yaml: line (.+): (.+)$") +const yamlParseReporter = "yaml/parse" + func tryDecodingYamlError(e string) (int, string) { parts := yamlErrRe.FindStringSubmatch(e) if len(parts) > 2 { @@ -38,7 +40,7 @@ func scanProblem(path string, err error) reporter.Report { Path: path, Problem: checks.Problem{ Lines: []int{line}, - Reporter: "yaml/parse", + Reporter: yamlParseReporter, Text: e, Severity: checks.Fatal, }, @@ -174,7 +176,7 @@ func scanWorker(ctx context.Context, jobs <-chan scanJob, results chan<- reporte results <- reporter.Report{Path: job.path, Rule: job.rule, Problem: checks.Problem{ Fragment: job.rule.Error.Fragment, Lines: []int{job.rule.Error.Line}, - Reporter: "yaml/parse", + Reporter: yamlParseReporter, Text: job.rule.Error.Err.Error(), Severity: checks.Fatal, }} diff --git a/cmd/pint/tests/0023_enabled_checks.txt b/cmd/pint/tests/0023_enabled_checks.txt index 89757b71..0fb6ef00 100644 --- a/cmd/pint/tests/0023_enabled_checks.txt +++ b/cmd/pint/tests/0023_enabled_checks.txt @@ -18,8 +18,9 @@ stderr 'level=debug msg="Configured checks for rule" enabled=\["promql/syntax"," -- .pint.hcl -- prometheus "prom" { - uri = "https://" - timeout = "2m" + uri = "https://" + timeout = "2m" + required = true } rule{} diff --git a/cmd/pint/tests/0025_config.txt b/cmd/pint/tests/0025_config.txt index f9ba2907..721b6465 100644 --- a/cmd/pint/tests/0025_config.txt +++ b/cmd/pint/tests/0025_config.txt @@ -13,7 +13,8 @@ level=info msg="Loading configuration file" path=.pint.hcl { "name": "prom", "uri": "https://", - "timeout": "2m" + "timeout": "2m", + "required": true } ], "checks": { @@ -114,6 +115,7 @@ level=info msg="Loading configuration file" path=.pint.hcl prometheus "prom" { uri = "https://" timeout = "2m" + required = true } rule{ } diff --git a/cmd/pint/tests/0039_prom_selected_path.txt b/cmd/pint/tests/0039_prom_selected_path.txt index db6203c0..3eb26b0b 100644 --- a/cmd/pint/tests/0039_prom_selected_path.txt +++ b/cmd/pint/tests/0039_prom_selected_path.txt @@ -29,7 +29,7 @@ rules/0001.yml:6: job label is required and should be preserved when aggregating prometheus "disabled" { uri = "http://127.0.0.1:123" timeout = "5s" - + required = true paths = ["invalid/.+"] } rule { diff --git a/cmd/pint/tests/0043_watch_cancel.txt b/cmd/pint/tests/0043_watch_cancel.txt index 672e74a5..541599c2 100644 --- a/cmd/pint/tests/0043_watch_cancel.txt +++ b/cmd/pint/tests/0043_watch_cancel.txt @@ -24,6 +24,7 @@ cat prometheus.pid | xargs kill prometheus "slow" { uri = "http://127.0.0.1:7043" timeout = "2m" + required = true } -- prometheus.go -- diff --git a/cmd/pint/tests/0054_watch_metrics_prometheus.txt b/cmd/pint/tests/0054_watch_metrics_prometheus.txt index c8e78814..4e29f3d3 100644 --- a/cmd/pint/tests/0054_watch_metrics_prometheus.txt +++ b/cmd/pint/tests/0054_watch_metrics_prometheus.txt @@ -26,11 +26,13 @@ cat prometheus.pid | xargs kill prometheus "prom1" { uri = "http://127.0.0.1:7054" timeout = "5s" + required = true } prometheus "prom2" { uri = "http://127.0.0.1:1054" timeout = "5s" + required = true } -- metrics.txt -- @@ -62,14 +64,14 @@ pint_check_iterations_total pint_last_run_time_seconds # HELP pint_problem Prometheus rule problem reported by pint # TYPE pint_problem gauge -pint_problem{kind="alerting",name="comparison",problem="failed to query prom1 prometheus config: failed to query Prometheus config: server_error: server error: 500",reporter="promql/rate",severity="bug"} -pint_problem{kind="alerting",name="comparison",problem="failed to query prom2 prometheus config: failed to query Prometheus config: Get \"http://127.0.0.1:1054/api/v1/status/config\": dial tcp 127.0.0.1:1054: connect: connection refused",reporter="promql/rate",severity="bug"} -pint_problem{kind="alerting",name="comparison",problem="query using prom1 failed with: bad_response: Unmarshal: there are bytes left after unmarshal, error found in #10 byte of ...|ry\"\n }Fatal error|..., bigger context ...|pe\":\"bad_data\",\n \"error\":\"bogus query\"\n }Fatal error|...",reporter="promql/series",severity="bug"} -pint_problem{kind="alerting",name="comparison",problem="query using prom2 failed with: Post \"http://127.0.0.1:1054/api/v1/query\": dial tcp 127.0.0.1:1054: connect: connection refused",reporter="promql/series",severity="bug"} -pint_problem{kind="recording",name="aggregate",problem="failed to query prom1 prometheus config: failed to query Prometheus config: server_error: server error: 500",reporter="promql/rate",severity="bug"} -pint_problem{kind="recording",name="aggregate",problem="failed to query prom2 prometheus config: failed to query Prometheus config: Get \"http://127.0.0.1:1054/api/v1/status/config\": dial tcp 127.0.0.1:1054: connect: connection refused",reporter="promql/rate",severity="bug"} -pint_problem{kind="recording",name="aggregate",problem="query using prom1 failed with: bad_response: Unmarshal: there are bytes left after unmarshal, error found in #10 byte of ...|ry\"\n }Fatal error|..., bigger context ...|pe\":\"bad_data\",\n \"error\":\"bogus query\"\n }Fatal error|...",reporter="promql/series",severity="bug"} -pint_problem{kind="recording",name="aggregate",problem="query using prom2 failed with: Post \"http://127.0.0.1:1054/api/v1/query\": dial tcp 127.0.0.1:1054: connect: connection refused",reporter="promql/series",severity="bug"} +pint_problem{kind="alerting",name="comparison",problem="cound't run \"promql/rate\" checks due to \"prom1\" prometheus connection error: failed to query Prometheus config: server_error: server error: 500",reporter="promql/rate",severity="bug"} +pint_problem{kind="alerting",name="comparison",problem="cound't run \"promql/rate\" checks due to \"prom2\" prometheus connection error: failed to query Prometheus config: Get \"http://127.0.0.1:1054/api/v1/status/config\": dial tcp 127.0.0.1:1054: connect: connection refused",reporter="promql/rate",severity="bug"} +pint_problem{kind="alerting",name="comparison",problem="cound't run \"promql/series\" checks due to \"prom2\" prometheus connection error: Post \"http://127.0.0.1:1054/api/v1/query\": dial tcp 127.0.0.1:1054: connect: connection refused",reporter="promql/series",severity="bug"} +pint_problem{kind="alerting",name="comparison",problem="query using prom1 failed with: bad_response: Unmarshal: there are bytes left after unmarshal, error found in #10 byte of ...|y\"\n }Fatal error|..., bigger context ...|:\"bad_data\",\n \"error\":\"bogus query\"\n }Fatal error|...",reporter="promql/series",severity="bug"} +pint_problem{kind="recording",name="aggregate",problem="cound't run \"promql/rate\" checks due to \"prom1\" prometheus connection error: failed to query Prometheus config: server_error: server error: 500",reporter="promql/rate",severity="bug"} +pint_problem{kind="recording",name="aggregate",problem="cound't run \"promql/rate\" checks due to \"prom2\" prometheus connection error: failed to query Prometheus config: Get \"http://127.0.0.1:1054/api/v1/status/config\": dial tcp 127.0.0.1:1054: connect: connection refused",reporter="promql/rate",severity="bug"} +pint_problem{kind="recording",name="aggregate",problem="cound't run \"promql/series\" checks due to \"prom2\" prometheus connection error: Post \"http://127.0.0.1:1054/api/v1/query\": dial tcp 127.0.0.1:1054: connect: connection refused",reporter="promql/series",severity="bug"} +pint_problem{kind="recording",name="aggregate",problem="query using prom1 failed with: bad_response: Unmarshal: there are bytes left after unmarshal, error found in #10 byte of ...|y\"\n }Fatal error|..., bigger context ...|:\"bad_data\",\n \"error\":\"bogus query\"\n }Fatal error|...",reporter="promql/series",severity="bug"} pint_problem{kind="recording",name="broken",problem="syntax error: no arguments for aggregate expression provided",reporter="promql/syntax",severity="fatal"} # HELP pint_problems Total number of problems reported by pint # TYPE pint_problems gauge @@ -112,18 +114,20 @@ import ( func main() { http.HandleFunc("/api/v1/status/config", func(w http.ResponseWriter, r *http.Request) { - w.WriteHeader(500) + w.WriteHeader(500) + time.Sleep(time.Millisecond * 100) io.WriteString(w, "Fatal error") }) http.HandleFunc("/api/v1/query", func(w http.ResponseWriter, r *http.Request) { - w.WriteHeader(400) - w.Header().Set("Content-Type", "application/json") - _, _ = w.Write([]byte(`{ - "status":"error", - "errorType":"bad_data", - "error":"bogus query" - }`)) + w.WriteHeader(400) + time.Sleep(time.Millisecond * 200) + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`{ + "status":"error", + "errorType":"bad_data", + "error":"bogus query" + }`)) io.WriteString(w, "Fatal error") }) diff --git a/cmd/pint/tests/0055_prometheus_failover.txt b/cmd/pint/tests/0055_prometheus_failover.txt index 567fff1c..94c8b0c5 100644 --- a/cmd/pint/tests/0055_prometheus_failover.txt +++ b/cmd/pint/tests/0055_prometheus_failover.txt @@ -18,6 +18,7 @@ prometheus "prom" { uri = "http://127.0.0.1:1055" failover = ["http://127.0.0.1:7055"] timeout = "5s" + required = true } -- prometheus.go -- diff --git a/cmd/pint/tests/0056_prometheus_required.txt b/cmd/pint/tests/0056_prometheus_required.txt new file mode 100644 index 00000000..e4d2fbd0 --- /dev/null +++ b/cmd/pint/tests/0056_prometheus_required.txt @@ -0,0 +1,25 @@ +pint.ok -l debug --no-color lint rules +! stdout . +stderr 'level=error msg=\"Query failed" error=\"Post \\"https:///api/v1/query\\": http: no Host in request URL\" query=count\(up\) uri=https://' +stderr 'level=error msg=\"Failed to query Prometheus configuration\" error=\"Get \\"https:///api/v1/status/config\\": http: no Host in request URL\" uri=https://' +! stderr 'level=info msg="Problems found"' + +-- rules/1.yaml -- +- record: one + expr: up == 0 +- record: two + expr: up == 0 +-- rules/2.yaml -- +- record: one + expr: up == 0 +- record: two + expr: up == 0 + +-- .pint.hcl -- +prometheus "prom" { + uri = "https://" + timeout = "2m" + required = false +} + +rule{} diff --git a/cmd/pint/tests/0057_watch_metrics_prometheus_ignore.txt b/cmd/pint/tests/0057_watch_metrics_prometheus_ignore.txt new file mode 100644 index 00000000..53cb8c6a --- /dev/null +++ b/cmd/pint/tests/0057_watch_metrics_prometheus_ignore.txt @@ -0,0 +1,160 @@ +exec bash -x ./prometheus.sh & +exec bash -c 'I=0 ; while [ ! -f prometheus.pid ] && [ $I -lt 30 ]; do sleep 1; I=$((I+1)); done' + +exec bash -x ./test.sh & + +pint.ok watch --listen=:6057 --pidfile=pint.pid rules +cmp curl.txt metrics.txt + +-- test.sh -- +sleep 15 +curl -s http://127.0.0.1:6057/metrics | grep 'pint_' | perl -pe "s/^([a-zA-Z].+)[ ]([0-9\.\-\+eE]+)$/\1/g" > curl.txt +cat pint.pid | xargs kill +cat prometheus.pid | xargs kill + +-- rules/1.yml -- +- record: broken + expr: foo / count()) + +- record: aggregate + expr: sum(foo) without(job) + +- alert: comparison + expr: foo + +-- .pint.hcl -- +prometheus "prom1" { + uri = "http://127.0.0.1:7057" + timeout = "5s" + required = false +} + +prometheus "prom2" { + uri = "http://127.0.0.1:1057" + timeout = "5s" + required = false +} + +-- metrics.txt -- +# HELP pint_check_duration_seconds How long did a check took to complete +# TYPE pint_check_duration_seconds summary +pint_check_duration_seconds_sum{check="alerts/comparison"} +pint_check_duration_seconds_count{check="alerts/comparison"} +pint_check_duration_seconds_sum{check="alerts/for"} +pint_check_duration_seconds_count{check="alerts/for"} +pint_check_duration_seconds_sum{check="alerts/template"} +pint_check_duration_seconds_count{check="alerts/template"} +pint_check_duration_seconds_sum{check="promql/fragile"} +pint_check_duration_seconds_count{check="promql/fragile"} +pint_check_duration_seconds_sum{check="promql/rate"} +pint_check_duration_seconds_count{check="promql/rate"} +pint_check_duration_seconds_sum{check="promql/regexp"} +pint_check_duration_seconds_count{check="promql/regexp"} +pint_check_duration_seconds_sum{check="promql/series"} +pint_check_duration_seconds_count{check="promql/series"} +pint_check_duration_seconds_sum{check="promql/syntax"} +pint_check_duration_seconds_count{check="promql/syntax"} +pint_check_duration_seconds_sum{check="promql/vector_matching"} +pint_check_duration_seconds_count{check="promql/vector_matching"} +# HELP pint_check_iterations_total Total number of completed check iterations since pint start +# TYPE pint_check_iterations_total counter +pint_check_iterations_total +# HELP pint_last_run_time_seconds Last checks run completion time since unix epoch in seconds +# TYPE pint_last_run_time_seconds gauge +pint_last_run_time_seconds +# HELP pint_problem Prometheus rule problem reported by pint +# TYPE pint_problem gauge +pint_problem{kind="alerting",name="comparison",problem="query using prom1 failed with: bad_response: Unmarshal: there are bytes left after unmarshal, error found in #10 byte of ...|y\"\n }Fatal error|..., bigger context ...|:\"bad_data\",\n \"error\":\"bogus query\"\n }Fatal error|...",reporter="promql/series",severity="bug"} +pint_problem{kind="recording",name="aggregate",problem="query using prom1 failed with: bad_response: Unmarshal: there are bytes left after unmarshal, error found in #10 byte of ...|y\"\n }Fatal error|..., bigger context ...|:\"bad_data\",\n \"error\":\"bogus query\"\n }Fatal error|...",reporter="promql/series",severity="bug"} +pint_problem{kind="recording",name="broken",problem="syntax error: no arguments for aggregate expression provided",reporter="promql/syntax",severity="fatal"} +# HELP pint_problems Total number of problems reported by pint +# TYPE pint_problems gauge +pint_problems +# HELP pint_prometheus_queries_total Total number of all prometheus queries +# TYPE pint_prometheus_queries_total counter +pint_prometheus_queries_total{endpoint="/api/v1/query",name="prom1"} +pint_prometheus_queries_total{endpoint="/api/v1/query",name="prom2"} +pint_prometheus_queries_total{endpoint="/api/v1/status/config",name="prom1"} +pint_prometheus_queries_total{endpoint="/api/v1/status/config",name="prom2"} +# HELP pint_prometheus_query_errors_total Total number of failed prometheus queries +# TYPE pint_prometheus_query_errors_total counter +pint_prometheus_query_errors_total{endpoint="/api/v1/query",name="prom1",reason="api/bad_response"} +pint_prometheus_query_errors_total{endpoint="/api/v1/query",name="prom2",reason="connection/error"} +pint_prometheus_query_errors_total{endpoint="/api/v1/status/config",name="prom1",reason="api/server_error"} +pint_prometheus_query_errors_total{endpoint="/api/v1/status/config",name="prom2",reason="connection/error"} +# HELP pint_rules_parsed_total Total number of rules parsed since startup +# TYPE pint_rules_parsed_total counter +pint_rules_parsed_total{kind="alerting"} +pint_rules_parsed_total{kind="invalid"} +pint_rules_parsed_total{kind="recording"} +# HELP pint_version Version information +# TYPE pint_version gauge +pint_version{version="unknown"} +-- prometheus.go -- +package main + +import ( + "context" + "io" + "log" + "net" + "net/http" + "os" + "os/signal" + "strconv" + "syscall" + "time" +) + +func main() { + http.HandleFunc("/api/v1/status/config", func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(500) + time.Sleep(time.Millisecond * 100) + io.WriteString(w, "Fatal error") + }) + + http.HandleFunc("/api/v1/query", func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(400) + time.Sleep(time.Millisecond * 200) + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`{ + "status":"error", + "errorType":"bad_data", + "error":"bogus query" + }`)) + io.WriteString(w, "Fatal error") + }) + + listener, err := net.Listen("tcp", "127.0.0.1:7057") + if err != nil { + log.Fatal(err) + } + + server := &http.Server{ + Addr: "127.0.0.1:7057", + } + + go func() { + _ = server.Serve(listener) + }() + + pid := os.Getpid() + err = os.WriteFile("prometheus.pid", []byte(strconv.Itoa(pid)), 0644) + if err != nil { + log.Fatal(err) + } + + stop := make(chan os.Signal, 1) + signal.Notify(stop, os.Interrupt, syscall.SIGINT, syscall.SIGTERM) + go func() { + time.Sleep(time.Minute*2) + stop <- syscall.SIGTERM + }() + <-stop + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + server.Shutdown(ctx) +} + +-- prometheus.sh -- +env GOCACHE=$TMPDIR go run prometheus.go diff --git a/docs/changelog.md b/docs/changelog.md index cffe2708..0d0b77d1 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -7,6 +7,13 @@ - `prometheus` config block now allows to specify failover URIs using `failover` field. If failover URIs are set and main URI fails to respond pint will attempt to use them in the order specified until one of them works. +- `prometheus` config block now allows to define how upstream errors are handled using + `required` field. If `required` is set to `true` any check that depends on remote + Prometheus server will be reported as `bug` if it's unable to talk to it. + If `required` is set to `false` pint will only emit `warning` level results. + Default value for `required` is `false`. Set it to `true` if you want to hard fail + in case of remote Prometheus issues. Note that setting it to `true` might block + PRs when running `pint ci` until pint is able to talk to Prometheus again. - Renamed `pint/parse` to `yaml/parse` and added missing documentation for it. ## v0.12.0 diff --git a/docs/configuration.md b/docs/configuration.md index 2e43e040..76134526 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -106,6 +106,7 @@ prometheus "$name" { uri = "https://..." failover = ["https://...", ...] timeout = "60s" + required = true|false paths = ["...", ...] } ``` @@ -120,6 +121,13 @@ prometheus "$name" { configuration, otherwise pint checks might return unreliable results and potential false positives. - `timeout` - timeout to be used for API requests. +- `required` - decides how pint will report errors if it's unable to get a valid response + from this Prometheus server. If `required` is `true` and all API calls to this Prometheus + fail pint will report those as `bug` level problem. If it's set to `false` pint will + report those with `warning` level. + Default value for `required` is `false`. Set it to `true` if you want to hard fail + in case of remote Prometheus issues. Note that setting it to `true` might block + PRs when running `pint ci` until pint is able to talk to Prometheus again. - `paths` - optional path filter, if specified only paths matching one of listed regex patterns will use this Prometheus server for checks. diff --git a/internal/checks/alerts_count.go b/internal/checks/alerts_count.go index c8e296e7..220e89fc 100644 --- a/internal/checks/alerts_count.go +++ b/internal/checks/alerts_count.go @@ -53,12 +53,13 @@ func (c AlertsCheck) Check(ctx context.Context, rule parser.Rule) (problems []Pr qr, err := c.prom.RangeQuery(ctx, rule.AlertingRule.Expr.Value.Value, start, end, c.step) if err != nil { + text, severity := textAndSeverityFromError(err, c.Reporter(), c.prom.Name(), Bug) problems = append(problems, Problem{ Fragment: rule.AlertingRule.Expr.Value.Value, Lines: rule.AlertingRule.Expr.Lines(), Reporter: c.Reporter(), - Text: fmt.Sprintf("query using %s failed with: %s", c.prom.Name(), err), - Severity: Bug, + Text: text, + Severity: severity, }) return } diff --git a/internal/checks/alerts_count_test.go b/internal/checks/alerts_count_test.go index 6e5c7299..a62fe616 100644 --- a/internal/checks/alerts_count_test.go +++ b/internal/checks/alerts_count_test.go @@ -85,17 +85,17 @@ func TestAlertsCheck(t *testing.T) { { description: "ignores recording rules", content: "- record: foo\n expr: up == 0\n", - checker: checks.NewAlertsCheck(simpleProm("prom", "http://localhost", time.Second*5), time.Hour*24, time.Minute, time.Minute*5), + checker: checks.NewAlertsCheck(simpleProm("prom", "http://localhost", time.Second*5, true), time.Hour*24, time.Minute, time.Minute*5), }, { description: "ignores rules with syntax errors", content: "- alert: Foo Is Down\n expr: sum(\n", - checker: checks.NewAlertsCheck(simpleProm("prom", "http://localhost", time.Second*5), time.Hour*24, time.Minute, time.Minute*5), + checker: checks.NewAlertsCheck(simpleProm("prom", "http://localhost", time.Second*5, true), time.Hour*24, time.Minute, time.Minute*5), }, { description: "bad request", content: content, - checker: checks.NewAlertsCheck(simpleProm("prom", srv.URL+"/400/", time.Second*5), time.Hour*24, time.Minute, time.Minute*5), + checker: checks.NewAlertsCheck(simpleProm("prom", srv.URL+"/400/", time.Second*5, true), time.Hour*24, time.Minute, time.Minute*5), problems: []checks.Problem{ { Fragment: `up{job="foo"} == 0`, @@ -106,10 +106,24 @@ func TestAlertsCheck(t *testing.T) { }, }, }, + { + description: "connection refused", + content: content, + checker: checks.NewAlertsCheck(simpleProm("prom", "http://127.0.0.1", time.Second*5, false), time.Hour*24, time.Minute, time.Minute*5), + problems: []checks.Problem{ + { + Fragment: `up{job="foo"} == 0`, + Lines: []int{2}, + Reporter: "alerts/count", + Text: `cound't run "alerts/count" checks due to "prom" prometheus connection error: Post "http://127.0.0.1/api/v1/query_range": dial tcp 127.0.0.1:80: connect: connection refused`, + Severity: checks.Warning, + }, + }, + }, { description: "empty response", content: content, - checker: checks.NewAlertsCheck(simpleProm("prom", srv.URL+"/empty/", time.Second*5), time.Hour*24, time.Minute, time.Minute*5), + checker: checks.NewAlertsCheck(simpleProm("prom", srv.URL+"/empty/", time.Second*5, true), time.Hour*24, time.Minute, time.Minute*5), problems: []checks.Problem{ { Fragment: `up{job="foo"} == 0`, @@ -123,7 +137,7 @@ func TestAlertsCheck(t *testing.T) { { description: "multiple alerts", content: content, - checker: checks.NewAlertsCheck(simpleProm("prom", srv.URL+"/alerts/", time.Second*5), time.Hour*24, time.Minute, time.Minute*5), + checker: checks.NewAlertsCheck(simpleProm("prom", srv.URL+"/alerts/", time.Second*5, true), time.Hour*24, time.Minute, time.Minute*5), problems: []checks.Problem{ { Fragment: `up{job="foo"} == 0`, @@ -137,7 +151,7 @@ func TestAlertsCheck(t *testing.T) { { description: "for: 10m", content: "- alert: Foo Is Down\n for: 10m\n expr: up{job=\"foo\"} == 0\n", - checker: checks.NewAlertsCheck(simpleProm("prom", srv.URL+"/alerts/", time.Second*5), time.Hour*24, time.Minute*6, time.Minute*10), + checker: checks.NewAlertsCheck(simpleProm("prom", srv.URL+"/alerts/", time.Second*5, true), time.Hour*24, time.Minute*6, time.Minute*10), problems: []checks.Problem{ { Fragment: `up{job="foo"} == 0`, @@ -154,7 +168,7 @@ func TestAlertsCheck(t *testing.T) { - alert: foo expr: '{__name__="up", job="foo"} == 0' `, - checker: checks.NewAlertsCheck(simpleProm("prom", srv.URL+"/alerts/", time.Second*5), time.Hour*24, time.Minute*6, time.Minute*10), + checker: checks.NewAlertsCheck(simpleProm("prom", srv.URL+"/alerts/", time.Second*5, true), time.Hour*24, time.Minute*6, time.Minute*10), problems: []checks.Problem{ { Fragment: `{__name__="up", job="foo"} == 0`, @@ -171,7 +185,7 @@ func TestAlertsCheck(t *testing.T) { - alert: foo expr: '{__name__=~"(up|foo)", job="foo"} == 0' `, - checker: checks.NewAlertsCheck(simpleProm("prom", srv.URL+"/alerts/", time.Second*5), time.Hour*24, time.Minute*6, time.Minute*10), + checker: checks.NewAlertsCheck(simpleProm("prom", srv.URL+"/alerts/", time.Second*5, true), time.Hour*24, time.Minute*6, time.Minute*10), problems: []checks.Problem{ { Fragment: `{__name__=~"(up|foo)", job="foo"} == 0`, diff --git a/internal/checks/base.go b/internal/checks/base.go index d1d570e4..14836c2a 100644 --- a/internal/checks/base.go +++ b/internal/checks/base.go @@ -2,9 +2,11 @@ package checks import ( "context" + "errors" "fmt" "github.com/cloudflare/pint/internal/parser" + "github.com/cloudflare/pint/internal/promapi" ) var ( @@ -39,15 +41,16 @@ type Severity int func (s Severity) String() string { switch s { - case Fatal: - return "Fatal" case Information: return "Information" case Warning: return "Warning" - default: + case Bug: return "Bug" + case Fatal: + return "Fatal" } + return "Unknown" } func ParseSeverity(s string) (Severity, error) { @@ -56,10 +59,10 @@ func ParseSeverity(s string) (Severity, error) { return Fatal, nil case "bug": return Bug, nil - case "info": - return Information, nil case "warning": return Warning, nil + case "info": + return Information, nil default: return Fatal, fmt.Errorf("unknown severity: %s", s) } @@ -102,3 +105,23 @@ type exprProblem struct { text string severity Severity } + +func textAndSeverityFromError(err error, reporter, prom string, s Severity) (text string, severity Severity) { + if promapi.IsUnavailableError(err) { + text = fmt.Sprintf("cound't run %q checks due to %q prometheus connection error: %s", reporter, prom, err) + var perr *promapi.Error + if errors.As(err, &perr) { + if perr.IsStrict() { + severity = Bug + } else { + severity = Warning + } + } else { + severity = Warning + } + } else { + text = fmt.Sprintf("query using %s failed with: %s", prom, err) + severity = s + } + return +} diff --git a/internal/checks/base_test.go b/internal/checks/base_test.go index 4b380359..91c49873 100644 --- a/internal/checks/base_test.go +++ b/internal/checks/base_test.go @@ -100,11 +100,12 @@ func TestParseSeverity(t *testing.T) { } } -func simpleProm(name, uri string, timeout time.Duration) *promapi.FailoverGroup { +func simpleProm(name, uri string, timeout time.Duration, required bool) *promapi.FailoverGroup { return promapi.NewFailoverGroup( name, []*promapi.Prometheus{ promapi.NewPrometheus(name, uri, timeout), }, + required, ) } diff --git a/internal/checks/promql_rate.go b/internal/checks/promql_rate.go index 6c50e044..f8ca0f7a 100644 --- a/internal/checks/promql_rate.go +++ b/internal/checks/promql_rate.go @@ -41,16 +41,15 @@ func (c RateCheck) Check(ctx context.Context, rule parser.Rule) (problems []Prob scrapeInterval, err := c.getScrapeInterval(ctx) if err != nil { - if err != nil { - problems = append(problems, Problem{ - Fragment: expr.Value.Value, - Lines: expr.Lines(), - Reporter: c.Reporter(), - Text: fmt.Sprintf("failed to query %s prometheus config: %s", c.prom.Name(), err), - Severity: Bug, - }) - return - } + text, severity := textAndSeverityFromError(err, c.Reporter(), c.prom.Name(), Bug) + problems = append(problems, Problem{ + Fragment: expr.Value.Value, + Lines: expr.Lines(), + Reporter: c.Reporter(), + Text: text, + Severity: severity, + }) + return } for _, problem := range c.checkNode(expr.Query, scrapeInterval) { diff --git a/internal/checks/promql_rate_test.go b/internal/checks/promql_rate_test.go index 370dd760..a96e03b0 100644 --- a/internal/checks/promql_rate_test.go +++ b/internal/checks/promql_rate_test.go @@ -48,12 +48,12 @@ func TestRateCheck(t *testing.T) { { description: "ignores rules with syntax errors", content: "- record: foo\n expr: sum(foo) without(\n", - checker: checks.NewRateCheck(simpleProm("prom", srv.URL, time.Second)), + checker: checks.NewRateCheck(simpleProm("prom", srv.URL, time.Second, true)), }, { description: "rate < 2x scrape_interval", content: "- record: foo\n expr: rate(foo[1m])\n", - checker: checks.NewRateCheck(simpleProm("prom", srv.URL+"/1m/", time.Second)), + checker: checks.NewRateCheck(simpleProm("prom", srv.URL+"/1m/", time.Second, true)), problems: []checks.Problem{ { Fragment: "rate(foo[1m])", @@ -67,7 +67,7 @@ func TestRateCheck(t *testing.T) { { description: "rate < 4x scrape_interval", content: "- record: foo\n expr: rate(foo[3m])\n", - checker: checks.NewRateCheck(simpleProm("prom", srv.URL+"/1m/", time.Second)), + checker: checks.NewRateCheck(simpleProm("prom", srv.URL+"/1m/", time.Second, true)), problems: []checks.Problem{ { Fragment: "rate(foo[3m])", @@ -81,12 +81,12 @@ func TestRateCheck(t *testing.T) { { description: "rate == 4x scrape interval", content: "- record: foo\n expr: rate(foo[2m])\n", - checker: checks.NewRateCheck(simpleProm("prom", srv.URL+"/30s/", time.Second)), + checker: checks.NewRateCheck(simpleProm("prom", srv.URL+"/30s/", time.Second, true)), }, { description: "irate < 2x scrape_interval", content: "- record: foo\n expr: irate(foo[1m])\n", - checker: checks.NewRateCheck(simpleProm("prom", srv.URL+"/1m/", time.Second)), + checker: checks.NewRateCheck(simpleProm("prom", srv.URL+"/1m/", time.Second, true)), problems: []checks.Problem{ { Fragment: "irate(foo[1m])", @@ -100,7 +100,7 @@ func TestRateCheck(t *testing.T) { { description: "irate < 3x scrape_interval", content: "- record: foo\n expr: irate(foo[2m])\n", - checker: checks.NewRateCheck(simpleProm("prom", srv.URL+"/1m/", time.Second)), + checker: checks.NewRateCheck(simpleProm("prom", srv.URL+"/1m/", time.Second, true)), problems: []checks.Problem{ { Fragment: "irate(foo[2m])", @@ -117,7 +117,7 @@ func TestRateCheck(t *testing.T) { - record: foo expr: irate({__name__="foo"}[5m]) `, - checker: checks.NewRateCheck(simpleProm("prom", srv.URL+"/1m/", time.Second)), + checker: checks.NewRateCheck(simpleProm("prom", srv.URL+"/1m/", time.Second, true)), }, { description: "irate{__name__=~} > 3x scrape_interval", @@ -125,7 +125,7 @@ func TestRateCheck(t *testing.T) { - record: foo expr: irate({__name__=~"(foo|bar)_total"}[5m]) `, - checker: checks.NewRateCheck(simpleProm("prom", srv.URL+"/1m/", time.Second)), + checker: checks.NewRateCheck(simpleProm("prom", srv.URL+"/1m/", time.Second, true)), }, { description: "irate{__name__} < 3x scrape_interval", @@ -133,7 +133,7 @@ func TestRateCheck(t *testing.T) { - record: foo expr: irate({__name__="foo"}[2m]) `, - checker: checks.NewRateCheck(simpleProm("prom", srv.URL+"/1m/", time.Second)), + checker: checks.NewRateCheck(simpleProm("prom", srv.URL+"/1m/", time.Second, true)), problems: []checks.Problem{ { Fragment: `irate({__name__="foo"}[2m])`, @@ -150,7 +150,7 @@ func TestRateCheck(t *testing.T) { - record: foo expr: irate({__name__=~"(foo|bar)_total"}[2m]) `, - checker: checks.NewRateCheck(simpleProm("prom", srv.URL+"/1m/", time.Second)), + checker: checks.NewRateCheck(simpleProm("prom", srv.URL+"/1m/", time.Second, true)), problems: []checks.Problem{ { Fragment: `irate({__name__=~"(foo|bar)_total"}[2m])`, @@ -164,17 +164,17 @@ func TestRateCheck(t *testing.T) { { description: "irate == 3x scrape interval", content: "- record: foo\n expr: irate(foo[3m])\n", - checker: checks.NewRateCheck(simpleProm("prom", srv.URL+"/1m/", time.Second)), + checker: checks.NewRateCheck(simpleProm("prom", srv.URL+"/1m/", time.Second, true)), }, { description: "valid range selector", content: "- record: foo\n expr: foo[1m]\n", - checker: checks.NewRateCheck(simpleProm("prom", srv.URL+"/1m/", time.Second)), + checker: checks.NewRateCheck(simpleProm("prom", srv.URL+"/1m/", time.Second, true)), }, { description: "nested invalid rate", content: "- record: foo\n expr: sum(rate(foo[3m])) / sum(rate(bar[1m]))\n", - checker: checks.NewRateCheck(simpleProm("prom", srv.URL+"/1m/", time.Second)), + checker: checks.NewRateCheck(simpleProm("prom", srv.URL+"/1m/", time.Second, true)), problems: []checks.Problem{ { Fragment: "rate(foo[3m])", @@ -195,13 +195,13 @@ func TestRateCheck(t *testing.T) { { description: "500 error from Prometheus API", content: "- record: foo\n expr: rate(foo[5m])\n", - checker: checks.NewRateCheck(simpleProm("prom", srv.URL+"/error/", time.Second)), + checker: checks.NewRateCheck(simpleProm("prom", srv.URL+"/error/", time.Second, true)), problems: []checks.Problem{ { Fragment: "rate(foo[5m])", Lines: []int{2}, Reporter: "promql/rate", - Text: "failed to query prom prometheus config: failed to query Prometheus config: server_error: server error: 500", + Text: `cound't run "promql/rate" checks due to "prom" prometheus connection error: failed to query Prometheus config: server_error: server error: 500`, Severity: checks.Bug, }, }, @@ -209,13 +209,13 @@ func TestRateCheck(t *testing.T) { { description: "invalid status", content: "- record: foo\n expr: rate(foo[5m])\n", - checker: checks.NewRateCheck(simpleProm("prom", srv.URL, time.Second)), + checker: checks.NewRateCheck(simpleProm("prom", srv.URL, time.Second, true)), problems: []checks.Problem{ { Fragment: "rate(foo[5m])", Lines: []int{2}, Reporter: "promql/rate", - Text: "failed to query prom prometheus config: failed to query Prometheus config: bad_data: unhandled path", + Text: `query using prom failed with: failed to query Prometheus config: bad_data: unhandled path`, Severity: checks.Bug, }, }, @@ -223,26 +223,40 @@ func TestRateCheck(t *testing.T) { { description: "invalid YAML", content: "- record: foo\n expr: rate(foo[5m])\n", - checker: checks.NewRateCheck(simpleProm("prom", srv.URL+"/badYaml/", time.Second)), + checker: checks.NewRateCheck(simpleProm("prom", srv.URL+"/badYaml/", time.Second, true)), problems: []checks.Problem{ { Fragment: "rate(foo[5m])", Lines: []int{2}, Reporter: "promql/rate", - Text: fmt.Sprintf("failed to query prom prometheus config: failed to decode config data in %s/badYaml/ response: yaml: unmarshal errors:\n line 1: cannot unmarshal !!str `invalid...` into promapi.PrometheusConfig", srv.URL), + Text: fmt.Sprintf("cound't run \"promql/rate\" checks due to \"prom\" prometheus connection error: failed to decode config data in %s/badYaml/ response: yaml: unmarshal errors:\n line 1: cannot unmarshal !!str `invalid...` into promapi.PrometheusConfig", srv.URL), Severity: checks.Bug, }, }, }, + { + description: "connection refused", + content: "- record: foo\n expr: rate(foo[5m])\n", + checker: checks.NewRateCheck(simpleProm("prom", "http://", time.Second, false)), + problems: []checks.Problem{ + { + Fragment: "rate(foo[5m])", + Lines: []int{2}, + Reporter: "promql/rate", + Text: `cound't run "promql/rate" checks due to "prom" prometheus connection error: failed to query Prometheus config: Get "http:///api/v1/status/config": http: no Host in request URL`, + Severity: checks.Warning, + }, + }, + }, { description: "irate == 3 x default 1m", content: "- record: foo\n expr: irate(foo[3m])\n", - checker: checks.NewRateCheck(simpleProm("prom", srv.URL+"/default/", time.Second)), + checker: checks.NewRateCheck(simpleProm("prom", srv.URL+"/default/", time.Second, true)), }, { description: "irate < 3 x default 1m", content: "- record: foo\n expr: irate(foo[2m])\n", - checker: checks.NewRateCheck(simpleProm("prom", srv.URL+"/default/", time.Second)), + checker: checks.NewRateCheck(simpleProm("prom", srv.URL+"/default/", time.Second, true)), problems: []checks.Problem{ { Fragment: "irate(foo[2m])", diff --git a/internal/checks/promql_series.go b/internal/checks/promql_series.go index 5ff64d60..682b3d05 100644 --- a/internal/checks/promql_series.go +++ b/internal/checks/promql_series.go @@ -62,12 +62,13 @@ func (c SeriesCheck) countSeries(ctx context.Context, expr parser.PromQLExpr, se q := fmt.Sprintf("count(%s)", selector.String()) qr, err := c.prom.Query(ctx, q) if err != nil { + text, severity := textAndSeverityFromError(err, c.Reporter(), c.prom.Name(), Bug) problems = append(problems, Problem{ Fragment: selector.String(), Lines: expr.Lines(), Reporter: c.Reporter(), - Text: fmt.Sprintf("query using %s failed with: %s", c.prom.Name(), err), - Severity: Bug, + Text: text, + Severity: severity, }) return } diff --git a/internal/checks/promql_series_test.go b/internal/checks/promql_series_test.go index 92be0115..da65f1fa 100644 --- a/internal/checks/promql_series_test.go +++ b/internal/checks/promql_series_test.go @@ -108,12 +108,12 @@ func TestSeriesCheck(t *testing.T) { { description: "ignores rules with syntax errors", content: "- record: foo\n expr: sum(foo) without(\n", - checker: checks.NewSeriesCheck(simpleProm("prom", srv.URL, time.Second*5)), + checker: checks.NewSeriesCheck(simpleProm("prom", srv.URL, time.Second*5, true)), }, { description: "bad response", content: "- record: foo\n expr: sum(foo)\n", - checker: checks.NewSeriesCheck(simpleProm("prom", srv.URL, time.Second*5)), + checker: checks.NewSeriesCheck(simpleProm("prom", srv.URL, time.Second*5, true)), problems: []checks.Problem{ { Fragment: "foo", @@ -124,10 +124,24 @@ func TestSeriesCheck(t *testing.T) { }, }, }, + { + description: "bad uri", + content: "- record: foo\n expr: sum(foo)\n", + checker: checks.NewSeriesCheck(simpleProm("prom", "http://", time.Second*5, false)), + problems: []checks.Problem{ + { + Fragment: "foo", + Lines: []int{2}, + Reporter: "promql/series", + Text: `cound't run "promql/series" checks due to "prom" prometheus connection error: Post "http:///api/v1/query": http: no Host in request URL`, + Severity: checks.Warning, + }, + }, + }, { description: "simple query", content: "- record: foo\n expr: sum(notfound)\n", - checker: checks.NewSeriesCheck(simpleProm("prom", srv.URL, time.Second*5)), + checker: checks.NewSeriesCheck(simpleProm("prom", srv.URL, time.Second*5, true)), problems: []checks.Problem{ { Fragment: "notfound", @@ -141,7 +155,7 @@ func TestSeriesCheck(t *testing.T) { { description: "complex query", content: "- record: foo\n expr: sum(found_7 * on (job) sum(sum(notfound))) / found_7\n", - checker: checks.NewSeriesCheck(simpleProm("prom", srv.URL, time.Second*5)), + checker: checks.NewSeriesCheck(simpleProm("prom", srv.URL, time.Second*5, true)), problems: []checks.Problem{ { Fragment: "notfound", @@ -155,7 +169,7 @@ func TestSeriesCheck(t *testing.T) { { description: "complex query / bug", content: "- record: foo\n expr: sum(found_7 * on (job) sum(sum(notfound))) / found_7\n", - checker: checks.NewSeriesCheck(simpleProm("prom", srv.URL, time.Second*5)), + checker: checks.NewSeriesCheck(simpleProm("prom", srv.URL, time.Second*5, true)), problems: []checks.Problem{ { Fragment: "notfound", @@ -189,22 +203,22 @@ func TestSeriesCheck(t *testing.T) { ) for: 5m `, - checker: checks.NewSeriesCheck(simpleProm("prom", srv.URL, time.Second*5)), + checker: checks.NewSeriesCheck(simpleProm("prom", srv.URL, time.Second*5, true)), }, { description: "offset", content: "- record: foo\n expr: node_filesystem_readonly{mountpoint!=\"\"} offset 5m\n", - checker: checks.NewSeriesCheck(simpleProm("prom", srv.URL, time.Second*5)), + checker: checks.NewSeriesCheck(simpleProm("prom", srv.URL, time.Second*5, true)), }, { description: "negative offset", content: "- record: foo\n expr: node_filesystem_readonly{mountpoint!=\"\"} offset -15m\n", - checker: checks.NewSeriesCheck(simpleProm("prom", srv.URL, time.Second*5)), + checker: checks.NewSeriesCheck(simpleProm("prom", srv.URL, time.Second*5, true)), }, { description: "series found, label missing", content: "- record: foo\n expr: found{job=\"notfound\"}\n", - checker: checks.NewSeriesCheck(simpleProm("prom", srv.URL, time.Second*5)), + checker: checks.NewSeriesCheck(simpleProm("prom", srv.URL, time.Second*5, true)), problems: []checks.Problem{ { Fragment: `found{job="notfound"}`, @@ -218,7 +232,7 @@ func TestSeriesCheck(t *testing.T) { { description: "series missing, label missing", content: "- record: foo\n expr: notfound{job=\"notfound\"}\n", - checker: checks.NewSeriesCheck(simpleProm("prom", srv.URL, time.Second*5)), + checker: checks.NewSeriesCheck(simpleProm("prom", srv.URL, time.Second*5, true)), problems: []checks.Problem{ { Fragment: "notfound", @@ -235,7 +249,7 @@ func TestSeriesCheck(t *testing.T) { - record: foo expr: '{__name__="notfound", job="bar"}' `, - checker: checks.NewSeriesCheck(simpleProm("prom", srv.URL, time.Second*5)), + checker: checks.NewSeriesCheck(simpleProm("prom", srv.URL, time.Second*5, true)), problems: []checks.Problem{ { Fragment: `{__name__="notfound",job="bar"}`, @@ -253,7 +267,7 @@ func TestSeriesCheck(t *testing.T) { - record: foo expr: count(notfound) == 0 `, - checker: checks.NewSeriesCheck(simpleProm("prom", srv.URL, time.Second*5)), + checker: checks.NewSeriesCheck(simpleProm("prom", srv.URL, time.Second*5, true)), }, { description: "series missing but check disabled, labels", @@ -262,7 +276,7 @@ func TestSeriesCheck(t *testing.T) { - record: foo expr: count(notfound{job="foo"}) == 0 `, - checker: checks.NewSeriesCheck(simpleProm("prom", srv.URL, time.Second*5)), + checker: checks.NewSeriesCheck(simpleProm("prom", srv.URL, time.Second*5, true)), }, { description: "series missing but check disabled, negative labels", @@ -271,7 +285,7 @@ func TestSeriesCheck(t *testing.T) { - record: foo expr: count(notfound{job!="foo"}) == 0 `, - checker: checks.NewSeriesCheck(simpleProm("prom", srv.URL, time.Second*5)), + checker: checks.NewSeriesCheck(simpleProm("prom", srv.URL, time.Second*5, true)), }, { description: "series missing, disabled comment for labels", @@ -280,7 +294,7 @@ func TestSeriesCheck(t *testing.T) { - record: foo expr: count(notfound) == 0 `, - checker: checks.NewSeriesCheck(simpleProm("prom", srv.URL, time.Second*5)), + checker: checks.NewSeriesCheck(simpleProm("prom", srv.URL, time.Second*5, true)), problems: []checks.Problem{ { Fragment: `notfound`, diff --git a/internal/checks/promql_vector_matching.go b/internal/checks/promql_vector_matching.go index 7c8bd9dd..381444f0 100644 --- a/internal/checks/promql_vector_matching.go +++ b/internal/checks/promql_vector_matching.go @@ -47,7 +47,7 @@ func (c VectorMatchingCheck) Check(ctx context.Context, rule parser.Rule) (probl Lines: expr.Lines(), Reporter: c.Reporter(), Text: problem.text, - Severity: Bug, + Severity: problem.severity, }) } @@ -62,7 +62,16 @@ func (c VectorMatchingCheck) checkNode(ctx context.Context, node *parser.PromQLN q := fmt.Sprintf("count(%s)", n.String()) qr, err := c.prom.Query(ctx, q) - if err == nil && len(qr.Series) > 0 { + if err != nil { + text, severity := textAndSeverityFromError(err, c.Reporter(), c.prom.Name(), Bug) + problems = append(problems, exprProblem{ + expr: node.Expr, + text: text, + severity: severity, + }) + return + } + if len(qr.Series) > 0 { return } @@ -74,12 +83,30 @@ func (c VectorMatchingCheck) checkNode(ctx context.Context, node *parser.PromQLN } leftLabels, err := c.seriesLabels(ctx, fmt.Sprintf("topk(1, %s)", n.LHS.String()), ignored...) - if leftLabels == nil || err != nil { + if err != nil { + text, severity := textAndSeverityFromError(err, c.Reporter(), c.prom.Name(), Bug) + problems = append(problems, exprProblem{ + expr: node.Expr, + text: text, + severity: severity, + }) + return + } + if leftLabels == nil { goto NEXT } rightLabels, err := c.seriesLabels(ctx, fmt.Sprintf("topk(1, %s)", n.RHS.String()), ignored...) - if rightLabels == nil || err != nil { + if err != nil { + text, severity := textAndSeverityFromError(err, c.Reporter(), c.prom.Name(), Bug) + problems = append(problems, exprProblem{ + expr: node.Expr, + text: text, + severity: severity, + }) + return + } + if rightLabels == nil { goto NEXT } @@ -87,20 +114,23 @@ func (c VectorMatchingCheck) checkNode(ctx context.Context, node *parser.PromQLN for _, name := range n.VectorMatching.MatchingLabels { if !stringInSlice(leftLabels, name) && stringInSlice(rightLabels, name) { problems = append(problems, exprProblem{ - expr: node.Expr, - text: fmt.Sprintf("using on(%q) won't produce any results because left hand side of the query doesn't have this label: %q", name, node.Node.(*promParser.BinaryExpr).LHS), + expr: node.Expr, + text: fmt.Sprintf("using on(%q) won't produce any results because left hand side of the query doesn't have this label: %q", name, node.Node.(*promParser.BinaryExpr).LHS), + severity: Bug, }) } if stringInSlice(leftLabels, name) && !stringInSlice(rightLabels, name) { problems = append(problems, exprProblem{ - expr: node.Expr, - text: fmt.Sprintf("using on(%q) won't produce any results because right hand side of the query doesn't have this label: %q", name, node.Node.(*promParser.BinaryExpr).RHS), + expr: node.Expr, + text: fmt.Sprintf("using on(%q) won't produce any results because right hand side of the query doesn't have this label: %q", name, node.Node.(*promParser.BinaryExpr).RHS), + severity: Bug, }) } if !stringInSlice(leftLabels, name) && !stringInSlice(rightLabels, name) { problems = append(problems, exprProblem{ - expr: node.Expr, - text: fmt.Sprintf("using on(%q) won't produce any results because both sides of the query don't have this label", name), + expr: node.Expr, + text: fmt.Sprintf("using on(%q) won't produce any results because both sides of the query don't have this label", name), + severity: Bug, }) } } @@ -108,13 +138,15 @@ func (c VectorMatchingCheck) checkNode(ctx context.Context, node *parser.PromQLN if !areStringSlicesEqual(leftLabels, rightLabels) { if len(n.VectorMatching.MatchingLabels) == 0 { problems = append(problems, exprProblem{ - expr: node.Expr, - text: fmt.Sprintf("both sides of the query have different labels: %s != %s", leftLabels, rightLabels), + expr: node.Expr, + text: fmt.Sprintf("both sides of the query have different labels: %s != %s", leftLabels, rightLabels), + severity: Bug, }) } else { problems = append(problems, exprProblem{ - expr: node.Expr, - text: fmt.Sprintf("using ignoring(%q) won't produce any results because both sides of the query have different labels: %s != %s", strings.Join(n.VectorMatching.MatchingLabels, ","), leftLabels, rightLabels), + expr: node.Expr, + text: fmt.Sprintf("using ignoring(%q) won't produce any results because both sides of the query have different labels: %s != %s", strings.Join(n.VectorMatching.MatchingLabels, ","), leftLabels, rightLabels), + severity: Bug, }) } } diff --git a/internal/checks/promql_vector_matching_test.go b/internal/checks/promql_vector_matching_test.go index 07a09627..dd97552d 100644 --- a/internal/checks/promql_vector_matching_test.go +++ b/internal/checks/promql_vector_matching_test.go @@ -166,12 +166,12 @@ func TestVectorMatchingCheck(t *testing.T) { { description: "ignores rules with syntax errors", content: "- record: foo\n expr: sum(foo) without(\n", - checker: checks.NewVectorMatchingCheck(simpleProm("prom", srv.URL, time.Second*1)), + checker: checks.NewVectorMatchingCheck(simpleProm("prom", srv.URL, time.Second, true)), }, { description: "one to one matching", content: "- record: foo\n expr: foo_with_notfound / bar\n", - checker: checks.NewVectorMatchingCheck(simpleProm("prom", srv.URL, time.Second*1)), + checker: checks.NewVectorMatchingCheck(simpleProm("prom", srv.URL, time.Second, true)), problems: []checks.Problem{ { Fragment: "foo_with_notfound / bar", @@ -185,37 +185,37 @@ func TestVectorMatchingCheck(t *testing.T) { { description: "ignore left query errors", content: "- record: foo\n expr: xxx / foo\n", - checker: checks.NewVectorMatchingCheck(simpleProm("prom", srv.URL, time.Second*1)), + checker: checks.NewVectorMatchingCheck(simpleProm("prom", srv.URL, time.Second, true)), }, { description: "ignore righ query errors", content: "- record: foo\n expr: foo / xxx\n", - checker: checks.NewVectorMatchingCheck(simpleProm("prom", srv.URL, time.Second*1)), + checker: checks.NewVectorMatchingCheck(simpleProm("prom", srv.URL, time.Second, true)), }, { description: "ignore missing left series", content: "- record: foo\n expr: missing / foo\n", - checker: checks.NewVectorMatchingCheck(simpleProm("prom", srv.URL, time.Second*1)), + checker: checks.NewVectorMatchingCheck(simpleProm("prom", srv.URL, time.Second, true)), }, { description: "ignore missing or vector", content: "- record: foo\n expr: sum(missing or vector(0))\n", - checker: checks.NewVectorMatchingCheck(simpleProm("prom", srv.URL, time.Second*1)), + checker: checks.NewVectorMatchingCheck(simpleProm("prom", srv.URL, time.Second, true)), }, { description: "ignore present or vector", content: "- record: foo\n expr: sum(foo or vector(0))\n", - checker: checks.NewVectorMatchingCheck(simpleProm("prom", srv.URL, time.Second*1)), + checker: checks.NewVectorMatchingCheck(simpleProm("prom", srv.URL, time.Second, true)), }, { description: "ignore missing right series", content: "- record: foo\n expr: foo / missing\n", - checker: checks.NewVectorMatchingCheck(simpleProm("prom", srv.URL, time.Second*1)), + checker: checks.NewVectorMatchingCheck(simpleProm("prom", srv.URL, time.Second, true)), }, { description: "ignore with mismatched series", content: "- record: foo\n expr: foo / ignoring(xxx) app_registry\n", - checker: checks.NewVectorMatchingCheck(simpleProm("prom", srv.URL, time.Second*1)), + checker: checks.NewVectorMatchingCheck(simpleProm("prom", srv.URL, time.Second, true)), problems: []checks.Problem{ { Fragment: "foo / ignoring(xxx) app_registry", @@ -229,7 +229,7 @@ func TestVectorMatchingCheck(t *testing.T) { { description: "one to one matching with on() - both missing", content: "- record: foo\n expr: foo / on(notfound) bar\n", - checker: checks.NewVectorMatchingCheck(simpleProm("prom", srv.URL, time.Second*1)), + checker: checks.NewVectorMatchingCheck(simpleProm("prom", srv.URL, time.Second, true)), problems: []checks.Problem{ { Fragment: "foo / on(notfound) bar", @@ -243,32 +243,32 @@ func TestVectorMatchingCheck(t *testing.T) { { description: "one to one matching with ignoring() - both missing", content: "- record: foo\n expr: foo / ignoring(notfound) foo\n", - checker: checks.NewVectorMatchingCheck(simpleProm("prom", srv.URL, time.Second*1)), + checker: checks.NewVectorMatchingCheck(simpleProm("prom", srv.URL, time.Second, true)), }, { description: "one to one matching with ignoring() - both present", content: "- record: foo\n expr: foo_with_notfound / ignoring(notfound) foo_with_notfound\n", - checker: checks.NewVectorMatchingCheck(simpleProm("prom", srv.URL, time.Second*1)), + checker: checks.NewVectorMatchingCheck(simpleProm("prom", srv.URL, time.Second, true)), }, { description: "one to one matching with ignoring() - left missing", content: "- record: foo\n expr: foo / ignoring(notfound) foo_with_notfound\n", - checker: checks.NewVectorMatchingCheck(simpleProm("prom", srv.URL, time.Second*1)), + checker: checks.NewVectorMatchingCheck(simpleProm("prom", srv.URL, time.Second, true)), }, { description: "one to one matching with ignoring() - right missing", content: "- record: foo\n expr: foo_with_notfound / ignoring(notfound) foo\n", - checker: checks.NewVectorMatchingCheck(simpleProm("prom", srv.URL, time.Second*1)), + checker: checks.NewVectorMatchingCheck(simpleProm("prom", srv.URL, time.Second, true)), }, { description: "one to one matching with ignoring() - mismatch", content: "- record: foo\n expr: foo_with_notfound / ignoring(notfound) bar\n", - checker: checks.NewVectorMatchingCheck(simpleProm("prom", srv.URL, time.Second*1)), + checker: checks.NewVectorMatchingCheck(simpleProm("prom", srv.URL, time.Second, true)), }, { description: "one to one matching with on() - left missing", content: "- record: foo\n expr: foo / on(notfound) bar_with_notfound\n", - checker: checks.NewVectorMatchingCheck(simpleProm("prom", srv.URL, time.Second*1)), + checker: checks.NewVectorMatchingCheck(simpleProm("prom", srv.URL, time.Second, true)), problems: []checks.Problem{ { Fragment: "foo / on(notfound) bar_with_notfound", @@ -282,7 +282,7 @@ func TestVectorMatchingCheck(t *testing.T) { { description: "one to one matching with on() - right missing", content: "- record: foo\n expr: foo_with_notfound / on(notfound) bar\n", - checker: checks.NewVectorMatchingCheck(simpleProm("prom", srv.URL, time.Second*1)), + checker: checks.NewVectorMatchingCheck(simpleProm("prom", srv.URL, time.Second, true)), problems: []checks.Problem{ { Fragment: "foo_with_notfound / on(notfound) bar", @@ -296,7 +296,7 @@ func TestVectorMatchingCheck(t *testing.T) { { description: "nested query", content: "- alert: foo\n expr: (memory_bytes / ignoring(job) (memory_limit > 0)) * on(app_name) group_left(a,b,c) app_registry\n", - checker: checks.NewVectorMatchingCheck(simpleProm("prom", srv.URL, time.Second*1)), + checker: checks.NewVectorMatchingCheck(simpleProm("prom", srv.URL, time.Second, true)), problems: []checks.Problem{ { Fragment: "(memory_bytes / ignoring(job) (memory_limit > 0)) * on(app_name) group_left(a,b,c) app_registry", @@ -313,32 +313,32 @@ func TestVectorMatchingCheck(t *testing.T) { - record: foo expr: '{__name__="foo_with_notfound"} / ignoring(notfound) foo_with_notfound' `, - checker: checks.NewVectorMatchingCheck(simpleProm("prom", srv.URL, time.Second*1)), + checker: checks.NewVectorMatchingCheck(simpleProm("prom", srv.URL, time.Second, true)), }, { description: "skips number comparison on LHS", content: "- record: foo\n expr: 2 < foo / bar\n", - checker: checks.NewVectorMatchingCheck(simpleProm("prom", srv.URL, time.Second*1)), + checker: checks.NewVectorMatchingCheck(simpleProm("prom", srv.URL, time.Second, true)), }, { description: "skips number comparison on RHS", content: "- record: foo\n expr: foo / bar > 0\n", - checker: checks.NewVectorMatchingCheck(simpleProm("prom", srv.URL, time.Second*1)), + checker: checks.NewVectorMatchingCheck(simpleProm("prom", srv.URL, time.Second, true)), }, { description: "skips number comparison on both sides", content: "- record: foo\n expr: 1 > bool 1\n", - checker: checks.NewVectorMatchingCheck(simpleProm("prom", srv.URL, time.Second*1)), + checker: checks.NewVectorMatchingCheck(simpleProm("prom", srv.URL, time.Second, true)), }, { description: "up == 0 AND foo > 0", content: "- alert: foo\n expr: up == 0 AND foo > 0\n", - checker: checks.NewVectorMatchingCheck(simpleProm("prom", srv.URL, time.Second*1)), + checker: checks.NewVectorMatchingCheck(simpleProm("prom", srv.URL, time.Second, true)), }, { description: "subquery is trimmed", content: "- alert: foo\n expr: min_over_time((foo_with_notfound > 0)[30m:1m]) / bar\n", - checker: checks.NewVectorMatchingCheck(simpleProm("prom", srv.URL, time.Second*1)), + checker: checks.NewVectorMatchingCheck(simpleProm("prom", srv.URL, time.Second, true)), problems: []checks.Problem{ { Fragment: "min_over_time((foo_with_notfound > 0)[30m:1m]) / bar", @@ -352,17 +352,17 @@ func TestVectorMatchingCheck(t *testing.T) { { description: "scalar", content: "- alert: foo\n expr: (100*(1024^2))\n", - checker: checks.NewVectorMatchingCheck(simpleProm("prom", srv.URL, time.Second*1)), + checker: checks.NewVectorMatchingCheck(simpleProm("prom", srv.URL, time.Second, true)), }, { description: "binary expression on both sides / passing", content: "- alert: foo\n expr: (foo / ignoring(notfound) foo_with_notfound) / (foo / ignoring(notfound) foo_with_notfound)\n", - checker: checks.NewVectorMatchingCheck(simpleProm("prom", srv.URL, time.Second*1)), + checker: checks.NewVectorMatchingCheck(simpleProm("prom", srv.URL, time.Second, true)), }, { description: "binary expression on both sides / mismatch", content: "- alert: foo\n expr: (foo / ignoring(notfound) foo_with_notfound) / (memory_bytes / ignoring(job) memory_limit)\n", - checker: checks.NewVectorMatchingCheck(simpleProm("prom", srv.URL, time.Second*1)), + checker: checks.NewVectorMatchingCheck(simpleProm("prom", srv.URL, time.Second, true)), problems: []checks.Problem{ { Fragment: "(foo / ignoring(notfound) foo_with_notfound) / (memory_bytes / ignoring(job) memory_limit)", @@ -373,6 +373,34 @@ func TestVectorMatchingCheck(t *testing.T) { }, }, }, + { + description: "connection refused", + content: "- record: foo\n expr: xxx/yyy\n", + checker: checks.NewVectorMatchingCheck(simpleProm("prom", "http://127.0.0.1", time.Second, true)), + problems: []checks.Problem{ + { + Fragment: "xxx/yyy", + Lines: []int{2}, + Reporter: "promql/vector_matching", + Text: `cound't run "promql/vector_matching" checks due to "prom" prometheus connection error: Post "http://127.0.0.1/api/v1/query": dial tcp 127.0.0.1:80: connect: connection refused`, + Severity: checks.Bug, + }, + }, + }, + { + description: "connection refused", + content: "- record: foo\n expr: xxx/yyy\n", + checker: checks.NewVectorMatchingCheck(simpleProm("prom", "http://127.0.0.1", time.Second, false)), + problems: []checks.Problem{ + { + Fragment: "xxx/yyy", + Lines: []int{2}, + Reporter: "promql/vector_matching", + Text: `cound't run "promql/vector_matching" checks due to "prom" prometheus connection error: Post "http://127.0.0.1/api/v1/query": dial tcp 127.0.0.1:80: connect: connection refused`, + Severity: checks.Warning, + }, + }, + }, } runTests(t, testCases) } diff --git a/internal/checks/query_cost.go b/internal/checks/query_cost.go index dbec7ebc..3644c4a6 100644 --- a/internal/checks/query_cost.go +++ b/internal/checks/query_cost.go @@ -50,12 +50,13 @@ func (c CostCheck) Check(ctx context.Context, rule parser.Rule) (problems []Prob query := fmt.Sprintf("count(%s)", expr.Value.Value) qr, err := c.prom.Query(ctx, query) if err != nil { + text, severity := textAndSeverityFromError(err, c.Reporter(), c.prom.Name(), Bug) problems = append(problems, Problem{ Fragment: expr.Value.Value, Lines: expr.Lines(), Reporter: c.Reporter(), - Text: fmt.Sprintf("query using %s failed with: %s", c.prom.Name(), err), - Severity: Bug, + Text: text, + Severity: severity, }) return } diff --git a/internal/checks/query_cost_test.go b/internal/checks/query_cost_test.go index 36503f3b..1a5b0045 100644 --- a/internal/checks/query_cost_test.go +++ b/internal/checks/query_cost_test.go @@ -79,12 +79,12 @@ func TestCostCheck(t *testing.T) { { description: "ignores rules with syntax errors", content: "- record: foo\n expr: sum(foo) without(\n", - checker: checks.NewCostCheck(simpleProm("prom", "http://localhost", time.Second*5), 4096, 0, checks.Bug), + checker: checks.NewCostCheck(simpleProm("prom", "http://localhost", time.Second*5, true), 4096, 0, checks.Bug), }, { description: "empty response", content: content, - checker: checks.NewCostCheck(simpleProm("prom", srv.URL+"/empty/", time.Second*5), 4096, 0, checks.Bug), + checker: checks.NewCostCheck(simpleProm("prom", srv.URL+"/empty/", time.Second*5, true), 4096, 0, checks.Bug), problems: []checks.Problem{ { Fragment: "sum(foo)", @@ -98,13 +98,13 @@ func TestCostCheck(t *testing.T) { { description: "response timeout", content: content, - checker: checks.NewCostCheck(simpleProm("prom", srv.URL+"/empty/", time.Millisecond*5), 4096, 0, checks.Bug), + checker: checks.NewCostCheck(simpleProm("prom", srv.URL+"/empty/", time.Millisecond*5, true), 4096, 0, checks.Bug), problems: []checks.Problem{ { Fragment: "sum(foo)", Lines: []int{2}, Reporter: "query/cost", - Text: `RE:query using prom failed with: Post "http://.+/empty/api/v1/query": context deadline exceeded`, + Text: `RE:cound't run "query/cost" checks due to "prom" prometheus connection error: Post "http://127.0.0.1:.+/empty/api/v1/query": context deadline exceeded`, Severity: checks.Bug, }, }, @@ -112,7 +112,7 @@ func TestCostCheck(t *testing.T) { { description: "bad request", content: content, - checker: checks.NewCostCheck(simpleProm("prom", srv.URL+"/400/", time.Second*5), 4096, 0, checks.Bug), + checker: checks.NewCostCheck(simpleProm("prom", srv.URL+"/400/", time.Second*5, true), 4096, 0, checks.Bug), problems: []checks.Problem{ { Fragment: "sum(foo)", @@ -123,10 +123,24 @@ func TestCostCheck(t *testing.T) { }, }, }, + { + description: "bad request", + content: content, + checker: checks.NewCostCheck(simpleProm("prom", "http://127.0.0.1", time.Second*5, false), 4096, 0, checks.Bug), + problems: []checks.Problem{ + { + Fragment: "sum(foo)", + Lines: []int{2}, + Reporter: "query/cost", + Text: `cound't run "query/cost" checks due to "prom" prometheus connection error: Post "http://127.0.0.1/api/v1/query": dial tcp 127.0.0.1:80: connect: connection refused`, + Severity: checks.Warning, + }, + }, + }, { description: "1 result", content: content, - checker: checks.NewCostCheck(simpleProm("prom", srv.URL+"/1/", time.Second*5), 4096, 0, checks.Bug), + checker: checks.NewCostCheck(simpleProm("prom", srv.URL+"/1/", time.Second*5, true), 4096, 0, checks.Bug), problems: []checks.Problem{ { Fragment: "sum(foo)", @@ -140,7 +154,7 @@ func TestCostCheck(t *testing.T) { { description: "7 results", content: content, - checker: checks.NewCostCheck(simpleProm("prom", srv.URL+"/7/", time.Second*5), 101, 0, checks.Bug), + checker: checks.NewCostCheck(simpleProm("prom", srv.URL+"/7/", time.Second*5, true), 101, 0, checks.Bug), problems: []checks.Problem{ { Fragment: "sum(foo)", @@ -154,7 +168,7 @@ func TestCostCheck(t *testing.T) { { description: "7 result with MB", content: content, - checker: checks.NewCostCheck(simpleProm("prom", srv.URL+"/7/", time.Second*5), 1024*1024, 0, checks.Bug), + checker: checks.NewCostCheck(simpleProm("prom", srv.URL+"/7/", time.Second*5, true), 1024*1024, 0, checks.Bug), problems: []checks.Problem{ { Fragment: "sum(foo)", @@ -168,7 +182,7 @@ func TestCostCheck(t *testing.T) { { description: "7 results with 1 series max (1KB bps)", content: content, - checker: checks.NewCostCheck(simpleProm("prom", srv.URL+"/7/", time.Second*5), 1024, 1, checks.Bug), + checker: checks.NewCostCheck(simpleProm("prom", srv.URL+"/7/", time.Second*5, true), 1024, 1, checks.Bug), problems: []checks.Problem{ { Fragment: "sum(foo)", @@ -182,7 +196,7 @@ func TestCostCheck(t *testing.T) { { description: "7 results with 5 series max", content: content, - checker: checks.NewCostCheck(simpleProm("prom", srv.URL+"/7/", time.Second*5), 0, 5, checks.Bug), + checker: checks.NewCostCheck(simpleProm("prom", srv.URL+"/7/", time.Second*5, true), 0, 5, checks.Bug), problems: []checks.Problem{ { Fragment: "sum(foo)", @@ -196,7 +210,7 @@ func TestCostCheck(t *testing.T) { { description: "7 results with 5 series max / infi", content: content, - checker: checks.NewCostCheck(simpleProm("prom", srv.URL+"/7/", time.Second*5), 0, 5, checks.Information), + checker: checks.NewCostCheck(simpleProm("prom", srv.URL+"/7/", time.Second*5, true), 0, 5, checks.Information), problems: []checks.Problem{ { Fragment: "sum(foo)", @@ -213,7 +227,7 @@ func TestCostCheck(t *testing.T) { - record: foo expr: 'sum({__name__="foo"})' `, - checker: checks.NewCostCheck(simpleProm("prom", srv.URL+"/7/", time.Second*5), 101, 0, checks.Bug), + checker: checks.NewCostCheck(simpleProm("prom", srv.URL+"/7/", time.Second*5, true), 101, 0, checks.Bug), problems: []checks.Problem{ { Fragment: `sum({__name__="foo"})`, diff --git a/internal/config/__snapshots__/config_test.snap b/internal/config/__snapshots__/config_test.snap index acb1dad6..9afaebbe 100755 --- a/internal/config/__snapshots__/config_test.snap +++ b/internal/config/__snapshots__/config_test.snap @@ -37,7 +37,8 @@ { "name": "prom", "uri": "http://localhost", - "timeout": "1s" + "timeout": "1s", + "required": false } ], "checks": { @@ -75,7 +76,8 @@ "timeout": "1s", "paths": [ "foo.yml" - ] + ], + "required": false } ], "checks": { @@ -113,7 +115,8 @@ "timeout": "1s", "paths": [ "rules.yml" - ] + ], + "required": false } ], "checks": { @@ -151,7 +154,8 @@ "timeout": "1s", "paths": [ "rules.yml" - ] + ], + "required": false }, { "name": "ignore", @@ -159,7 +163,8 @@ "timeout": "1s", "paths": [ "foo.+" - ] + ], + "required": false } ], "checks": { @@ -363,7 +368,8 @@ "timeout": "1s", "paths": [ "rules.yml" - ] + ], + "required": false }, { "name": "prom2", @@ -371,7 +377,8 @@ "timeout": "1s", "paths": [ "rules.yml" - ] + ], + "required": false } ], "checks": { @@ -489,7 +496,8 @@ "timeout": "1s", "paths": [ "rules.yml" - ] + ], + "required": false }, { "name": "prom2", @@ -497,7 +505,8 @@ "timeout": "1s", "paths": [ "rules.yml" - ] + ], + "required": false } ], "checks": { @@ -948,7 +957,8 @@ "timeout": "1s", "paths": [ "rules.yml" - ] + ], + "required": false } ], "checks": { @@ -999,7 +1009,8 @@ "timeout": "1s", "paths": [ "rules.yml" - ] + ], + "required": false } ], "checks": {}, @@ -1028,7 +1039,8 @@ "timeout": "1s", "paths": [ "rules.yml" - ] + ], + "required": false } ], "checks": { @@ -1062,7 +1074,8 @@ "timeout": "1s", "paths": [ "rules.yml" - ] + ], + "required": false } ], "checks": { @@ -1101,7 +1114,8 @@ "timeout": "1s", "paths": [ "rules.yml" - ] + ], + "required": false } ], "checks": { @@ -1137,12 +1151,14 @@ { "name": "prom1", "uri": "http://localhost/1", - "timeout": "1s" + "timeout": "1s", + "required": false }, { "name": "prom2", "uri": "http://localhost/2", - "timeout": "1s" + "timeout": "1s", + "required": false } ], "checks": { @@ -1184,7 +1200,8 @@ "http://localhost/1", "http://localhost/2" ], - "timeout": "1s" + "timeout": "1s", + "required": false } ], "checks": { @@ -1247,7 +1264,8 @@ { "name": "prom", "uri": "http://localhost", - "timeout": "1s" + "timeout": "1s", + "required": false } ], "checks": { @@ -1286,7 +1304,8 @@ "http://localhost/1", "http://localhost/2" ], - "timeout": "1s" + "timeout": "1s", + "required": false } ], "checks": { @@ -1321,12 +1340,14 @@ { "name": "prom1", "uri": "http://localhost/1", - "timeout": "1s" + "timeout": "1s", + "required": false }, { "name": "prom2", "uri": "http://localhost/2", - "timeout": "1s" + "timeout": "1s", + "required": false } ], "checks": { @@ -1367,7 +1388,8 @@ "timeout": "1s", "paths": [ "foo.yml" - ] + ], + "required": false } ], "checks": { @@ -1405,7 +1427,8 @@ "timeout": "1s", "paths": [ "rules.yml" - ] + ], + "required": false } ], "checks": { @@ -1443,7 +1466,8 @@ "timeout": "1s", "paths": [ "rules.yml" - ] + ], + "required": false }, { "name": "ignore", @@ -1451,7 +1475,8 @@ "timeout": "1s", "paths": [ "foo.+" - ] + ], + "required": false } ], "checks": { @@ -1655,7 +1680,8 @@ "timeout": "1s", "paths": [ "rules.yml" - ] + ], + "required": false }, { "name": "prom2", @@ -1663,7 +1689,8 @@ "timeout": "1s", "paths": [ "rules.yml" - ] + ], + "required": false } ], "checks": { @@ -1781,7 +1808,8 @@ "timeout": "1s", "paths": [ "rules.yml" - ] + ], + "required": false }, { "name": "prom2", @@ -1789,7 +1817,8 @@ "timeout": "1s", "paths": [ "rules.yml" - ] + ], + "required": false } ], "checks": { @@ -2240,7 +2269,8 @@ "timeout": "1s", "paths": [ "rules.yml" - ] + ], + "required": false } ], "checks": { @@ -2291,7 +2321,8 @@ "timeout": "1s", "paths": [ "rules.yml" - ] + ], + "required": false } ], "checks": {}, @@ -2320,7 +2351,8 @@ "timeout": "1s", "paths": [ "rules.yml" - ] + ], + "required": false } ], "checks": { @@ -2354,7 +2386,8 @@ "timeout": "1s", "paths": [ "rules.yml" - ] + ], + "required": false } ], "checks": { @@ -2393,7 +2426,8 @@ "timeout": "1s", "paths": [ "rules.yml" - ] + ], + "required": false } ], "checks": { @@ -2457,7 +2491,8 @@ { "name": "prom", "uri": "http://localhost", - "timeout": "1s" + "timeout": "1s", + "required": false } ], "checks": { @@ -2496,7 +2531,8 @@ "http://localhost/1", "http://localhost/2" ], - "timeout": "1s" + "timeout": "1s", + "required": false } ], "checks": { @@ -2531,12 +2567,14 @@ { "name": "prom1", "uri": "http://localhost/1", - "timeout": "1s" + "timeout": "1s", + "required": false }, { "name": "prom2", "uri": "http://localhost/2", - "timeout": "1s" + "timeout": "1s", + "required": false } ], "checks": { @@ -2577,7 +2615,8 @@ "timeout": "1s", "paths": [ "foo.yml" - ] + ], + "required": false } ], "checks": { @@ -2615,7 +2654,8 @@ "timeout": "1s", "paths": [ "rules.yml" - ] + ], + "required": false } ], "checks": { @@ -2653,7 +2693,8 @@ "timeout": "1s", "paths": [ "rules.yml" - ] + ], + "required": false }, { "name": "ignore", @@ -2661,7 +2702,8 @@ "timeout": "1s", "paths": [ "foo.+" - ] + ], + "required": false } ], "checks": { @@ -2865,7 +2907,8 @@ "timeout": "1s", "paths": [ "rules.yml" - ] + ], + "required": false }, { "name": "prom2", @@ -2873,7 +2916,8 @@ "timeout": "1s", "paths": [ "rules.yml" - ] + ], + "required": false } ], "checks": { @@ -2991,7 +3035,8 @@ "timeout": "1s", "paths": [ "rules.yml" - ] + ], + "required": false }, { "name": "prom2", @@ -2999,7 +3044,8 @@ "timeout": "1s", "paths": [ "rules.yml" - ] + ], + "required": false } ], "checks": { @@ -3450,7 +3496,8 @@ "timeout": "1s", "paths": [ "rules.yml" - ] + ], + "required": false } ], "checks": { @@ -3501,7 +3548,8 @@ "timeout": "1s", "paths": [ "rules.yml" - ] + ], + "required": false } ], "checks": {}, @@ -3530,7 +3578,8 @@ "timeout": "1s", "paths": [ "rules.yml" - ] + ], + "required": false } ], "checks": { @@ -3564,7 +3613,8 @@ "timeout": "1s", "paths": [ "rules.yml" - ] + ], + "required": false } ], "checks": { @@ -3603,7 +3653,8 @@ "timeout": "1s", "paths": [ "rules.yml" - ] + ], + "required": false } ], "checks": { @@ -3667,7 +3718,8 @@ { "name": "prom", "uri": "http://localhost", - "timeout": "1s" + "timeout": "1s", + "required": false } ], "checks": { @@ -3706,7 +3758,8 @@ "http://localhost/1", "http://localhost/2" ], - "timeout": "1s" + "timeout": "1s", + "required": false } ], "checks": { @@ -3741,12 +3794,14 @@ { "name": "prom1", "uri": "http://localhost/1", - "timeout": "1s" + "timeout": "1s", + "required": false }, { "name": "prom2", "uri": "http://localhost/2", - "timeout": "1s" + "timeout": "1s", + "required": false } ], "checks": { @@ -3787,7 +3842,8 @@ "timeout": "1s", "paths": [ "foo.yml" - ] + ], + "required": false } ], "checks": { @@ -3825,7 +3881,8 @@ "timeout": "1s", "paths": [ "rules.yml" - ] + ], + "required": false } ], "checks": { @@ -3863,7 +3920,8 @@ "timeout": "1s", "paths": [ "rules.yml" - ] + ], + "required": false }, { "name": "ignore", @@ -3871,7 +3929,8 @@ "timeout": "1s", "paths": [ "foo.+" - ] + ], + "required": false } ], "checks": { @@ -4075,7 +4134,8 @@ "timeout": "1s", "paths": [ "rules.yml" - ] + ], + "required": false }, { "name": "prom2", @@ -4083,7 +4143,8 @@ "timeout": "1s", "paths": [ "rules.yml" - ] + ], + "required": false } ], "checks": { @@ -4201,7 +4262,8 @@ "timeout": "1s", "paths": [ "rules.yml" - ] + ], + "required": false }, { "name": "prom2", @@ -4209,7 +4271,8 @@ "timeout": "1s", "paths": [ "rules.yml" - ] + ], + "required": false } ], "checks": { @@ -4660,7 +4723,8 @@ "timeout": "1s", "paths": [ "rules.yml" - ] + ], + "required": false } ], "checks": { @@ -4711,7 +4775,8 @@ "timeout": "1s", "paths": [ "rules.yml" - ] + ], + "required": false } ], "checks": {}, @@ -4740,7 +4805,8 @@ "timeout": "1s", "paths": [ "rules.yml" - ] + ], + "required": false } ], "checks": { @@ -4774,7 +4840,8 @@ "timeout": "1s", "paths": [ "rules.yml" - ] + ], + "required": false } ], "checks": { @@ -4813,7 +4880,8 @@ "timeout": "1s", "paths": [ "rules.yml" - ] + ], + "required": false } ], "checks": { @@ -4877,7 +4945,8 @@ { "name": "prom", "uri": "http://localhost", - "timeout": "1s" + "timeout": "1s", + "required": false } ], "checks": { @@ -4916,7 +4985,8 @@ "http://localhost/1", "http://localhost/2" ], - "timeout": "1s" + "timeout": "1s", + "required": false } ], "checks": { @@ -4951,12 +5021,14 @@ { "name": "prom1", "uri": "http://localhost/1", - "timeout": "1s" + "timeout": "1s", + "required": false }, { "name": "prom2", "uri": "http://localhost/2", - "timeout": "1s" + "timeout": "1s", + "required": false } ], "checks": { @@ -4997,7 +5069,8 @@ "timeout": "1s", "paths": [ "foo.yml" - ] + ], + "required": false } ], "checks": { @@ -5035,7 +5108,8 @@ "timeout": "1s", "paths": [ "rules.yml" - ] + ], + "required": false } ], "checks": { @@ -5073,7 +5147,8 @@ "timeout": "1s", "paths": [ "rules.yml" - ] + ], + "required": false }, { "name": "ignore", @@ -5081,7 +5156,8 @@ "timeout": "1s", "paths": [ "foo.+" - ] + ], + "required": false } ], "checks": { @@ -5285,7 +5361,8 @@ "timeout": "1s", "paths": [ "rules.yml" - ] + ], + "required": false }, { "name": "prom2", @@ -5293,7 +5370,8 @@ "timeout": "1s", "paths": [ "rules.yml" - ] + ], + "required": false } ], "checks": { @@ -5411,7 +5489,8 @@ "timeout": "1s", "paths": [ "rules.yml" - ] + ], + "required": false }, { "name": "prom2", @@ -5419,7 +5498,8 @@ "timeout": "1s", "paths": [ "rules.yml" - ] + ], + "required": false } ], "checks": { @@ -5870,7 +5950,8 @@ "timeout": "1s", "paths": [ "rules.yml" - ] + ], + "required": false } ], "checks": { @@ -5921,7 +6002,8 @@ "timeout": "1s", "paths": [ "rules.yml" - ] + ], + "required": false } ], "checks": {}, @@ -5950,7 +6032,8 @@ "timeout": "1s", "paths": [ "rules.yml" - ] + ], + "required": false } ], "checks": { @@ -5984,7 +6067,8 @@ "timeout": "1s", "paths": [ "rules.yml" - ] + ], + "required": false } ], "checks": { @@ -6023,7 +6107,8 @@ "timeout": "1s", "paths": [ "rules.yml" - ] + ], + "required": false } ], "checks": { diff --git a/internal/config/config.go b/internal/config/config.go index 6ef305fa..d8675af6 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -225,7 +225,7 @@ func Load(path string, failOnMissing bool) (cfg Config, err error) { for _, uri := range prom.Failover { upstreams = append(upstreams, promapi.NewPrometheus(prom.Name, uri, timeout)) } - cfg.prometheusServers = append(cfg.prometheusServers, promapi.NewFailoverGroup(prom.Name, upstreams)) + cfg.prometheusServers = append(cfg.prometheusServers, promapi.NewFailoverGroup(prom.Name, upstreams, prom.Required)) } for _, rule := range cfg.Rules { diff --git a/internal/config/prometheus.go b/internal/config/prometheus.go index e543b2ed..e64113ea 100644 --- a/internal/config/prometheus.go +++ b/internal/config/prometheus.go @@ -11,6 +11,7 @@ type PrometheusConfig struct { Failover []string `hcl:"failover,optional" json:"failover,omitempty"` Timeout string `hcl:"timeout" json:"timeout"` Paths []string `hcl:"paths,optional" json:"paths,omitempty"` + Required bool `hcl:"required,optional" json:"required"` } func (pc PrometheusConfig) validate() error { diff --git a/internal/promapi/errors.go b/internal/promapi/errors.go new file mode 100644 index 00000000..6b5b0b5f --- /dev/null +++ b/internal/promapi/errors.go @@ -0,0 +1,69 @@ +package promapi + +import ( + "errors" + "net" + "strings" + "syscall" + "time" + + v1 "github.com/prometheus/client_golang/api/prometheus/v1" +) + +type Error struct { + err error + isStrict bool +} + +func (e *Error) Unwrap() error { + return e.err +} + +func (e *Error) Error() string { + return e.err.Error() +} + +func (e *Error) IsStrict() bool { + return e.isStrict +} + +func IsUnavailableError(err error) bool { + var apiErr *v1.Error + if ok := errors.As(err, &apiErr); ok { + return apiErr.Type == v1.ErrServer + } + + return true +} + +func CanRetryError(err error, delta time.Duration) (time.Duration, bool) { + if errors.Is(err, syscall.ECONNREFUSED) { + return delta, false + } + + var neterr net.Error + if ok := errors.As(err, &neterr); ok && neterr.Timeout() { + return delta / 2, true + } + + var apiErr *v1.Error + if ok := errors.As(err, &apiErr); ok { + switch apiErr.Type { + case v1.ErrBadData: + case v1.ErrTimeout: + return delta / 2, true + case v1.ErrCanceled: + case v1.ErrExec: + if strings.Contains(apiErr.Msg, "query processing would load too many samples into memory in ") { + return (delta / 4) * 3, true + } + return delta / 2, true + case v1.ErrBadResponse: + case v1.ErrServer: + case v1.ErrClient: + return delta / 2, true + } + } + + return delta, false +} diff --git a/internal/promapi/failover.go b/internal/promapi/failover.go index 84f9d488..d8b8040e 100644 --- a/internal/promapi/failover.go +++ b/internal/promapi/failover.go @@ -6,14 +6,16 @@ import ( ) type FailoverGroup struct { - name string - servers []*Prometheus + name string + servers []*Prometheus + strictErrors bool } -func NewFailoverGroup(name string, servers []*Prometheus) *FailoverGroup { +func NewFailoverGroup(name string, servers []*Prometheus, strictErrors bool) *FailoverGroup { return &FailoverGroup{ - name: name, - servers: servers, + name: name, + servers: servers, + strictErrors: strictErrors, } } @@ -34,7 +36,7 @@ func (fg *FailoverGroup) Config(ctx context.Context) (cfg *PrometheusConfig, err return } } - return + return nil, &Error{err: err, isStrict: fg.strictErrors} } func (fg *FailoverGroup) Query(ctx context.Context, expr string) (qr *QueryResult, err error) { @@ -44,7 +46,7 @@ func (fg *FailoverGroup) Query(ctx context.Context, expr string) (qr *QueryResul return } } - return + return nil, &Error{err: err, isStrict: fg.strictErrors} } func (fg *FailoverGroup) RangeQuery(ctx context.Context, expr string, start, end time.Time, step time.Duration) (rqr *RangeQueryResult, err error) { @@ -54,5 +56,5 @@ func (fg *FailoverGroup) RangeQuery(ctx context.Context, expr string, start, end return } } - return + return nil, &Error{err: err, isStrict: fg.strictErrors} } diff --git a/internal/promapi/range.go b/internal/promapi/range.go index 03694808..2d730b44 100644 --- a/internal/promapi/range.go +++ b/internal/promapi/range.go @@ -4,7 +4,6 @@ import ( "context" "errors" "fmt" - "net" "strings" "time" @@ -63,7 +62,7 @@ func (p *Prometheus) RangeQuery(ctx context.Context, expr string, start, end tim if err != nil { log.Error().Err(err).Str("uri", p.uri).Str("query", expr).Msg("Range query failed") prometheusQueryErrorsTotal.WithLabelValues(p.name, "/api/v1/query_range", errReason(err)).Inc() - if delta, retryOK := canRetry(err, end.Sub(start)); retryOK { + if delta, retryOK := CanRetryError(err, end.Sub(start)); retryOK { if delta < step*2 { log.Error().Str("uri", p.uri).Str("query", expr).Msg("No more retries possible") return nil, errors.New("no more retries possible") @@ -96,17 +95,3 @@ func (p *Prometheus) RangeQuery(ctx context.Context, expr string, start, end tim return &qr, nil } - -func canRetry(err error, delta time.Duration) (time.Duration, bool) { - var neterr net.Error - if ok := errors.As(err, &neterr); ok && neterr.Timeout() { - return delta / 2, true - } - if strings.Contains(err.Error(), "query processing would load too many samples into memory in ") { - return (delta / 4) * 3, true - } - if strings.Contains(err.Error(), "found duplicate series for the match group") { - return delta / 2, true - } - return delta, false -} diff --git a/internal/reporter/reporter.go b/internal/reporter/reporter.go index e778c0b3..32f55b8d 100644 --- a/internal/reporter/reporter.go +++ b/internal/reporter/reporter.go @@ -13,29 +13,11 @@ type Report struct { Problem checks.Problem } -func (r Report) IsPassing() bool { - switch r.Problem.Severity { - case checks.Information, checks.Warning: - return true - default: - return false - } -} - type Summary struct { Reports []Report FileChanges discovery.FileFindResults } -func (s Summary) IsPassing() bool { - for _, r := range s.Reports { - if p := r.IsPassing(); !p { - return false - } - } - return true -} - func (s Summary) HasFatalProblems() bool { for _, r := range s.Reports { if r.Problem.Severity == checks.Fatal {