Skip to content

Commit

Permalink
Refactor Prometheus API error handling
Browse files Browse the repository at this point in the history
  • Loading branch information
prymitive committed Feb 21, 2022
1 parent 5cece62 commit e6a0531
Show file tree
Hide file tree
Showing 30 changed files with 746 additions and 269 deletions.
6 changes: 4 additions & 2 deletions cmd/pint/scan.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ import (

var yamlErrRe = regexp.MustCompile("^yaml: line (.+): (.+)$")

const yamlParseReporter = "yaml/parse"

func tryDecodingYamlError(e string) (int, string) {
parts := yamlErrRe.FindStringSubmatch(e)
if len(parts) > 2 {
Expand All @@ -38,7 +40,7 @@ func scanProblem(path string, err error) reporter.Report {
Path: path,
Problem: checks.Problem{
Lines: []int{line},
Reporter: "yaml/parse",
Reporter: yamlParseReporter,
Text: e,
Severity: checks.Fatal,
},
Expand Down Expand Up @@ -174,7 +176,7 @@ func scanWorker(ctx context.Context, jobs <-chan scanJob, results chan<- reporte
results <- reporter.Report{Path: job.path, Rule: job.rule, Problem: checks.Problem{
Fragment: job.rule.Error.Fragment,
Lines: []int{job.rule.Error.Line},
Reporter: "yaml/parse",
Reporter: yamlParseReporter,
Text: job.rule.Error.Err.Error(),
Severity: checks.Fatal,
}}
Expand Down
5 changes: 3 additions & 2 deletions cmd/pint/tests/0023_enabled_checks.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,9 @@ stderr 'level=debug msg="Configured checks for rule" enabled=\["promql/syntax","

-- .pint.hcl --
prometheus "prom" {
uri = "https://"
timeout = "2m"
uri = "https://"
timeout = "2m"
required = true
}

rule{}
Expand Down
4 changes: 3 additions & 1 deletion cmd/pint/tests/0025_config.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@ level=info msg="Loading configuration file" path=.pint.hcl
{
"name": "prom",
"uri": "https://",
"timeout": "2m"
"timeout": "2m",
"required": true
}
],
"checks": {
Expand Down Expand Up @@ -114,6 +115,7 @@ level=info msg="Loading configuration file" path=.pint.hcl
prometheus "prom" {
uri = "https://"
timeout = "2m"
required = true
}

rule{ }
Expand Down
2 changes: 1 addition & 1 deletion cmd/pint/tests/0039_prom_selected_path.txt
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ rules/0001.yml:6: job label is required and should be preserved when aggregating
prometheus "disabled" {
uri = "http://127.0.0.1:123"
timeout = "5s"

required = true
paths = ["invalid/.+"]
}
rule {
Expand Down
1 change: 1 addition & 0 deletions cmd/pint/tests/0043_watch_cancel.txt
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ cat prometheus.pid | xargs kill
prometheus "slow" {
uri = "http://127.0.0.1:7043"
timeout = "2m"
required = true
}

-- prometheus.go --
Expand Down
36 changes: 20 additions & 16 deletions cmd/pint/tests/0054_watch_metrics_prometheus.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,13 @@ cat prometheus.pid | xargs kill
prometheus "prom1" {
uri = "http://127.0.0.1:7054"
timeout = "5s"
required = true
}

prometheus "prom2" {
uri = "http://127.0.0.1:1054"
timeout = "5s"
required = true
}

-- metrics.txt --
Expand Down Expand Up @@ -62,14 +64,14 @@ pint_check_iterations_total
pint_last_run_time_seconds
# HELP pint_problem Prometheus rule problem reported by pint
# TYPE pint_problem gauge
pint_problem{kind="alerting",name="comparison",problem="failed to query prom1 prometheus config: failed to query Prometheus config: server_error: server error: 500",reporter="promql/rate",severity="bug"}
pint_problem{kind="alerting",name="comparison",problem="failed to query prom2 prometheus config: failed to query Prometheus config: Get \"http://127.0.0.1:1054/api/v1/status/config\": dial tcp 127.0.0.1:1054: connect: connection refused",reporter="promql/rate",severity="bug"}
pint_problem{kind="alerting",name="comparison",problem="query using prom1 failed with: bad_response: Unmarshal: there are bytes left after unmarshal, error found in #10 byte of ...|ry\"\n }Fatal error|..., bigger context ...|pe\":\"bad_data\",\n \"error\":\"bogus query\"\n }Fatal error|...",reporter="promql/series",severity="bug"}
pint_problem{kind="alerting",name="comparison",problem="query using prom2 failed with: Post \"http://127.0.0.1:1054/api/v1/query\": dial tcp 127.0.0.1:1054: connect: connection refused",reporter="promql/series",severity="bug"}
pint_problem{kind="recording",name="aggregate",problem="failed to query prom1 prometheus config: failed to query Prometheus config: server_error: server error: 500",reporter="promql/rate",severity="bug"}
pint_problem{kind="recording",name="aggregate",problem="failed to query prom2 prometheus config: failed to query Prometheus config: Get \"http://127.0.0.1:1054/api/v1/status/config\": dial tcp 127.0.0.1:1054: connect: connection refused",reporter="promql/rate",severity="bug"}
pint_problem{kind="recording",name="aggregate",problem="query using prom1 failed with: bad_response: Unmarshal: there are bytes left after unmarshal, error found in #10 byte of ...|ry\"\n }Fatal error|..., bigger context ...|pe\":\"bad_data\",\n \"error\":\"bogus query\"\n }Fatal error|...",reporter="promql/series",severity="bug"}
pint_problem{kind="recording",name="aggregate",problem="query using prom2 failed with: Post \"http://127.0.0.1:1054/api/v1/query\": dial tcp 127.0.0.1:1054: connect: connection refused",reporter="promql/series",severity="bug"}
pint_problem{kind="alerting",name="comparison",problem="cound't run \"promql/rate\" checks due to \"prom1\" prometheus connection error: failed to query Prometheus config: server_error: server error: 500",reporter="promql/rate",severity="bug"}
pint_problem{kind="alerting",name="comparison",problem="cound't run \"promql/rate\" checks due to \"prom2\" prometheus connection error: failed to query Prometheus config: Get \"http://127.0.0.1:1054/api/v1/status/config\": dial tcp 127.0.0.1:1054: connect: connection refused",reporter="promql/rate",severity="bug"}
pint_problem{kind="alerting",name="comparison",problem="cound't run \"promql/series\" checks due to \"prom2\" prometheus connection error: Post \"http://127.0.0.1:1054/api/v1/query\": dial tcp 127.0.0.1:1054: connect: connection refused",reporter="promql/series",severity="bug"}
pint_problem{kind="alerting",name="comparison",problem="query using prom1 failed with: bad_response: Unmarshal: there are bytes left after unmarshal, error found in #10 byte of ...|y\"\n }Fatal error|..., bigger context ...|:\"bad_data\",\n \"error\":\"bogus query\"\n }Fatal error|...",reporter="promql/series",severity="bug"}
pint_problem{kind="recording",name="aggregate",problem="cound't run \"promql/rate\" checks due to \"prom1\" prometheus connection error: failed to query Prometheus config: server_error: server error: 500",reporter="promql/rate",severity="bug"}
pint_problem{kind="recording",name="aggregate",problem="cound't run \"promql/rate\" checks due to \"prom2\" prometheus connection error: failed to query Prometheus config: Get \"http://127.0.0.1:1054/api/v1/status/config\": dial tcp 127.0.0.1:1054: connect: connection refused",reporter="promql/rate",severity="bug"}
pint_problem{kind="recording",name="aggregate",problem="cound't run \"promql/series\" checks due to \"prom2\" prometheus connection error: Post \"http://127.0.0.1:1054/api/v1/query\": dial tcp 127.0.0.1:1054: connect: connection refused",reporter="promql/series",severity="bug"}
pint_problem{kind="recording",name="aggregate",problem="query using prom1 failed with: bad_response: Unmarshal: there are bytes left after unmarshal, error found in #10 byte of ...|y\"\n }Fatal error|..., bigger context ...|:\"bad_data\",\n \"error\":\"bogus query\"\n }Fatal error|...",reporter="promql/series",severity="bug"}
pint_problem{kind="recording",name="broken",problem="syntax error: no arguments for aggregate expression provided",reporter="promql/syntax",severity="fatal"}
# HELP pint_problems Total number of problems reported by pint
# TYPE pint_problems gauge
Expand Down Expand Up @@ -112,18 +114,20 @@ import (

func main() {
http.HandleFunc("/api/v1/status/config", func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(500)
w.WriteHeader(500)
time.Sleep(time.Millisecond * 100)
io.WriteString(w, "Fatal error")
})

http.HandleFunc("/api/v1/query", func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(400)
w.Header().Set("Content-Type", "application/json")
_, _ = w.Write([]byte(`{
"status":"error",
"errorType":"bad_data",
"error":"bogus query"
}`))
w.WriteHeader(400)
time.Sleep(time.Millisecond * 200)
w.Header().Set("Content-Type", "application/json")
_, _ = w.Write([]byte(`{
"status":"error",
"errorType":"bad_data",
"error":"bogus query"
}`))
io.WriteString(w, "Fatal error")
})

Expand Down
1 change: 1 addition & 0 deletions cmd/pint/tests/0055_prometheus_failover.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ prometheus "prom" {
uri = "http://127.0.0.1:1055"
failover = ["http://127.0.0.1:7055"]
timeout = "5s"
required = true
}

-- prometheus.go --
Expand Down
25 changes: 25 additions & 0 deletions cmd/pint/tests/0056_prometheus_required.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
pint.ok -l debug --no-color lint rules
! stdout .
stderr 'level=error msg=\"Query failed" error=\"Post \\"https:///api/v1/query\\": http: no Host in request URL\" query=count\(up\) uri=https://'
stderr 'level=error msg=\"Failed to query Prometheus configuration\" error=\"Get \\"https:///api/v1/status/config\\": http: no Host in request URL\" uri=https://'
! stderr 'level=info msg="Problems found"'

-- rules/1.yaml --
- record: one
expr: up == 0
- record: two
expr: up == 0
-- rules/2.yaml --
- record: one
expr: up == 0
- record: two
expr: up == 0

-- .pint.hcl --
prometheus "prom" {
uri = "https://"
timeout = "2m"
required = false
}

rule{}
160 changes: 160 additions & 0 deletions cmd/pint/tests/0057_watch_metrics_prometheus_ignore.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
exec bash -x ./prometheus.sh &
exec bash -c 'I=0 ; while [ ! -f prometheus.pid ] && [ $I -lt 30 ]; do sleep 1; I=$((I+1)); done'

exec bash -x ./test.sh &

pint.ok watch --listen=:6057 --pidfile=pint.pid rules
cmp curl.txt metrics.txt

-- test.sh --
sleep 15
curl -s http://127.0.0.1:6057/metrics | grep 'pint_' | perl -pe "s/^([a-zA-Z].+)[ ]([0-9\.\-\+eE]+)$/\1/g" > curl.txt
cat pint.pid | xargs kill
cat prometheus.pid | xargs kill

-- rules/1.yml --
- record: broken
expr: foo / count())

- record: aggregate
expr: sum(foo) without(job)

- alert: comparison
expr: foo

-- .pint.hcl --
prometheus "prom1" {
uri = "http://127.0.0.1:7057"
timeout = "5s"
required = false
}

prometheus "prom2" {
uri = "http://127.0.0.1:1057"
timeout = "5s"
required = false
}

-- metrics.txt --
# HELP pint_check_duration_seconds How long did a check took to complete
# TYPE pint_check_duration_seconds summary
pint_check_duration_seconds_sum{check="alerts/comparison"}
pint_check_duration_seconds_count{check="alerts/comparison"}
pint_check_duration_seconds_sum{check="alerts/for"}
pint_check_duration_seconds_count{check="alerts/for"}
pint_check_duration_seconds_sum{check="alerts/template"}
pint_check_duration_seconds_count{check="alerts/template"}
pint_check_duration_seconds_sum{check="promql/fragile"}
pint_check_duration_seconds_count{check="promql/fragile"}
pint_check_duration_seconds_sum{check="promql/rate"}
pint_check_duration_seconds_count{check="promql/rate"}
pint_check_duration_seconds_sum{check="promql/regexp"}
pint_check_duration_seconds_count{check="promql/regexp"}
pint_check_duration_seconds_sum{check="promql/series"}
pint_check_duration_seconds_count{check="promql/series"}
pint_check_duration_seconds_sum{check="promql/syntax"}
pint_check_duration_seconds_count{check="promql/syntax"}
pint_check_duration_seconds_sum{check="promql/vector_matching"}
pint_check_duration_seconds_count{check="promql/vector_matching"}
# HELP pint_check_iterations_total Total number of completed check iterations since pint start
# TYPE pint_check_iterations_total counter
pint_check_iterations_total
# HELP pint_last_run_time_seconds Last checks run completion time since unix epoch in seconds
# TYPE pint_last_run_time_seconds gauge
pint_last_run_time_seconds
# HELP pint_problem Prometheus rule problem reported by pint
# TYPE pint_problem gauge
pint_problem{kind="alerting",name="comparison",problem="query using prom1 failed with: bad_response: Unmarshal: there are bytes left after unmarshal, error found in #10 byte of ...|y\"\n }Fatal error|..., bigger context ...|:\"bad_data\",\n \"error\":\"bogus query\"\n }Fatal error|...",reporter="promql/series",severity="bug"}
pint_problem{kind="recording",name="aggregate",problem="query using prom1 failed with: bad_response: Unmarshal: there are bytes left after unmarshal, error found in #10 byte of ...|y\"\n }Fatal error|..., bigger context ...|:\"bad_data\",\n \"error\":\"bogus query\"\n }Fatal error|...",reporter="promql/series",severity="bug"}
pint_problem{kind="recording",name="broken",problem="syntax error: no arguments for aggregate expression provided",reporter="promql/syntax",severity="fatal"}
# HELP pint_problems Total number of problems reported by pint
# TYPE pint_problems gauge
pint_problems
# HELP pint_prometheus_queries_total Total number of all prometheus queries
# TYPE pint_prometheus_queries_total counter
pint_prometheus_queries_total{endpoint="/api/v1/query",name="prom1"}
pint_prometheus_queries_total{endpoint="/api/v1/query",name="prom2"}
pint_prometheus_queries_total{endpoint="/api/v1/status/config",name="prom1"}
pint_prometheus_queries_total{endpoint="/api/v1/status/config",name="prom2"}
# HELP pint_prometheus_query_errors_total Total number of failed prometheus queries
# TYPE pint_prometheus_query_errors_total counter
pint_prometheus_query_errors_total{endpoint="/api/v1/query",name="prom1",reason="api/bad_response"}
pint_prometheus_query_errors_total{endpoint="/api/v1/query",name="prom2",reason="connection/error"}
pint_prometheus_query_errors_total{endpoint="/api/v1/status/config",name="prom1",reason="api/server_error"}
pint_prometheus_query_errors_total{endpoint="/api/v1/status/config",name="prom2",reason="connection/error"}
# HELP pint_rules_parsed_total Total number of rules parsed since startup
# TYPE pint_rules_parsed_total counter
pint_rules_parsed_total{kind="alerting"}
pint_rules_parsed_total{kind="invalid"}
pint_rules_parsed_total{kind="recording"}
# HELP pint_version Version information
# TYPE pint_version gauge
pint_version{version="unknown"}
-- prometheus.go --
package main

import (
"context"
"io"
"log"
"net"
"net/http"
"os"
"os/signal"
"strconv"
"syscall"
"time"
)

func main() {
http.HandleFunc("/api/v1/status/config", func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(500)
time.Sleep(time.Millisecond * 100)
io.WriteString(w, "Fatal error")
})

http.HandleFunc("/api/v1/query", func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(400)
time.Sleep(time.Millisecond * 200)
w.Header().Set("Content-Type", "application/json")
_, _ = w.Write([]byte(`{
"status":"error",
"errorType":"bad_data",
"error":"bogus query"
}`))
io.WriteString(w, "Fatal error")
})

listener, err := net.Listen("tcp", "127.0.0.1:7057")
if err != nil {
log.Fatal(err)
}

server := &http.Server{
Addr: "127.0.0.1:7057",
}

go func() {
_ = server.Serve(listener)
}()

pid := os.Getpid()
err = os.WriteFile("prometheus.pid", []byte(strconv.Itoa(pid)), 0644)
if err != nil {
log.Fatal(err)
}

stop := make(chan os.Signal, 1)
signal.Notify(stop, os.Interrupt, syscall.SIGINT, syscall.SIGTERM)
go func() {
time.Sleep(time.Minute*2)
stop <- syscall.SIGTERM
}()
<-stop
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
server.Shutdown(ctx)
}

-- prometheus.sh --
env GOCACHE=$TMPDIR go run prometheus.go
7 changes: 7 additions & 0 deletions docs/changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,13 @@
- `prometheus` config block now allows to specify failover URIs using `failover` field.
If failover URIs are set and main URI fails to respond pint will attempt to use them
in the order specified until one of them works.
- `prometheus` config block now allows to define how upstream errors are handled using
`required` field. If `required` is set to `true` any check that depends on remote
Prometheus server will be reported as `bug` if it's unable to talk to it.
If `required` is set to `false` pint will only emit `warning` level results.
Default value for `required` is `false`. Set it to `true` if you want to hard fail
in case of remote Prometheus issues. Note that setting it to `true` might block
PRs when running `pint ci` until pint is able to talk to Prometheus again.
- Renamed `pint/parse` to `yaml/parse` and added missing documentation for it.

## v0.12.0
Expand Down
8 changes: 8 additions & 0 deletions docs/configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ prometheus "$name" {
uri = "https://..."
failover = ["https://...", ...]
timeout = "60s"
required = true|false
paths = ["...", ...]
}
```
Expand All @@ -120,6 +121,13 @@ prometheus "$name" {
configuration, otherwise pint checks might return unreliable results and potential
false positives.
- `timeout` - timeout to be used for API requests.
- `required` - decides how pint will report errors if it's unable to get a valid response
from this Prometheus server. If `required` is `true` and all API calls to this Prometheus
fail pint will report those as `bug` level problem. If it's set to `false` pint will
report those with `warning` level.
Default value for `required` is `false`. Set it to `true` if you want to hard fail
in case of remote Prometheus issues. Note that setting it to `true` might block
PRs when running `pint ci` until pint is able to talk to Prometheus again.
- `paths` - optional path filter, if specified only paths matching one of listed regex
patterns will use this Prometheus server for checks.

Expand Down
5 changes: 3 additions & 2 deletions internal/checks/alerts_count.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,12 +53,13 @@ func (c AlertsCheck) Check(ctx context.Context, rule parser.Rule) (problems []Pr

qr, err := c.prom.RangeQuery(ctx, rule.AlertingRule.Expr.Value.Value, start, end, c.step)
if err != nil {
text, severity := textAndSeverityFromError(err, c.Reporter(), c.prom.Name(), Bug)
problems = append(problems, Problem{
Fragment: rule.AlertingRule.Expr.Value.Value,
Lines: rule.AlertingRule.Expr.Lines(),
Reporter: c.Reporter(),
Text: fmt.Sprintf("query using %s failed with: %s", c.prom.Name(), err),
Severity: Bug,
Text: text,
Severity: severity,
})
return
}
Expand Down
Loading

0 comments on commit e6a0531

Please sign in to comment.