Skip to content

Commit

Permalink
Merge pull request cloudflare#180 from cloudflare/errors
Browse files: browse the repository at this point in the history
Report prometheus URI when reporting errors
  • Loading branch information
prymitive authored Mar 2, 2022
2 parents b43f268 + 785af29 commit 5655d63
Show file tree
Hide file tree
Showing 21 changed files with 208 additions and 150 deletions.
16 changes: 8 additions & 8 deletions cmd/pint/tests/0054_watch_metrics_prometheus.txt
Original file line number Diff line number Diff line change
Expand Up @@ -64,14 +64,14 @@ pint_check_iterations_total
pint_last_run_time_seconds
# HELP pint_problem Prometheus rule problem reported by pint
# TYPE pint_problem gauge
pint_problem{filename="rules/1.yml",kind="alerting",name="comparison",problem="cound't run \"promql/rate\" checks due to \"prom1\" prometheus connection error: failed to query Prometheus config: server_error: server error: 500",reporter="promql/rate",severity="bug"}
pint_problem{filename="rules/1.yml",kind="alerting",name="comparison",problem="cound't run \"promql/rate\" checks due to \"prom2\" prometheus connection error: failed to query Prometheus config: Get \"http://127.0.0.1:1054/api/v1/status/config\": dial tcp 127.0.0.1:1054: connect: connection refused",reporter="promql/rate",severity="bug"}
pint_problem{filename="rules/1.yml",kind="alerting",name="comparison",problem="cound't run \"promql/series\" checks due to \"prom2\" prometheus connection error: Post \"http://127.0.0.1:1054/api/v1/query\": dial tcp 127.0.0.1:1054: connect: connection refused",reporter="promql/series",severity="bug"}
pint_problem{filename="rules/1.yml",kind="alerting",name="comparison",problem="query using prom1 failed with: bad_response: Unmarshal: there are bytes left after unmarshal, error found in #10 byte of ...|y\"\n }Fatal error|..., bigger context ...|:\"bad_data\",\n \"error\":\"bogus query\"\n }Fatal error|...",reporter="promql/series",severity="bug"}
pint_problem{filename="rules/1.yml",kind="recording",name="aggregate",problem="cound't run \"promql/rate\" checks due to \"prom1\" prometheus connection error: failed to query Prometheus config: server_error: server error: 500",reporter="promql/rate",severity="bug"}
pint_problem{filename="rules/1.yml",kind="recording",name="aggregate",problem="cound't run \"promql/rate\" checks due to \"prom2\" prometheus connection error: failed to query Prometheus config: Get \"http://127.0.0.1:1054/api/v1/status/config\": dial tcp 127.0.0.1:1054: connect: connection refused",reporter="promql/rate",severity="bug"}
pint_problem{filename="rules/1.yml",kind="recording",name="aggregate",problem="cound't run \"promql/series\" checks due to \"prom2\" prometheus connection error: Post \"http://127.0.0.1:1054/api/v1/query\": dial tcp 127.0.0.1:1054: connect: connection refused",reporter="promql/series",severity="bug"}
pint_problem{filename="rules/1.yml",kind="recording",name="aggregate",problem="query using prom1 failed with: bad_response: Unmarshal: there are bytes left after unmarshal, error found in #10 byte of ...|y\"\n }Fatal error|..., bigger context ...|:\"bad_data\",\n \"error\":\"bogus query\"\n }Fatal error|...",reporter="promql/series",severity="bug"}
pint_problem{filename="rules/1.yml",kind="alerting",name="comparison",problem="cound't run \"promql/rate\" checks due to \"prom1\" on http://127.0.0.1:7054 connection error: failed to query Prometheus config: server_error: server error: 500",reporter="promql/rate",severity="bug"}
pint_problem{filename="rules/1.yml",kind="alerting",name="comparison",problem="cound't run \"promql/rate\" checks due to \"prom2\" on http://127.0.0.1:1054 connection error: failed to query Prometheus config: Get \"http://127.0.0.1:1054/api/v1/status/config\": dial tcp 127.0.0.1:1054: connect: connection refused",reporter="promql/rate",severity="bug"}
pint_problem{filename="rules/1.yml",kind="alerting",name="comparison",problem="cound't run \"promql/series\" checks due to \"prom2\" on http://127.0.0.1:1054 connection error: Post \"http://127.0.0.1:1054/api/v1/query\": dial tcp 127.0.0.1:1054: connect: connection refused",reporter="promql/series",severity="bug"}
pint_problem{filename="rules/1.yml",kind="alerting",name="comparison",problem="query using \"prom1\" on http://127.0.0.1:7054 failed with: bad_response: Unmarshal: there are bytes left after unmarshal, error found in #10 byte of ...|y\"\n }Fatal error|..., bigger context ...|:\"bad_data\",\n \"error\":\"bogus query\"\n }Fatal error|...",reporter="promql/series",severity="bug"}
pint_problem{filename="rules/1.yml",kind="recording",name="aggregate",problem="cound't run \"promql/rate\" checks due to \"prom1\" on http://127.0.0.1:7054 connection error: failed to query Prometheus config: server_error: server error: 500",reporter="promql/rate",severity="bug"}
pint_problem{filename="rules/1.yml",kind="recording",name="aggregate",problem="cound't run \"promql/rate\" checks due to \"prom2\" on http://127.0.0.1:1054 connection error: failed to query Prometheus config: Get \"http://127.0.0.1:1054/api/v1/status/config\": dial tcp 127.0.0.1:1054: connect: connection refused",reporter="promql/rate",severity="bug"}
pint_problem{filename="rules/1.yml",kind="recording",name="aggregate",problem="cound't run \"promql/series\" checks due to \"prom2\" on http://127.0.0.1:1054 connection error: Post \"http://127.0.0.1:1054/api/v1/query\": dial tcp 127.0.0.1:1054: connect: connection refused",reporter="promql/series",severity="bug"}
pint_problem{filename="rules/1.yml",kind="recording",name="aggregate",problem="query using \"prom1\" on http://127.0.0.1:7054 failed with: bad_response: Unmarshal: there are bytes left after unmarshal, error found in #10 byte of ...|y\"\n }Fatal error|..., bigger context ...|:\"bad_data\",\n \"error\":\"bogus query\"\n }Fatal error|...",reporter="promql/series",severity="bug"}
pint_problem{filename="rules/1.yml",kind="recording",name="broken",problem="syntax error: no arguments for aggregate expression provided",reporter="promql/syntax",severity="fatal"}
# HELP pint_problems Total number of problems reported by pint
# TYPE pint_problems gauge
Expand Down
5 changes: 2 additions & 3 deletions cmd/pint/tests/0055_prometheus_failover.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,8 @@ pint.ok --no-color lint rules
! stdout .
stderr 'level=error msg="Query failed" error="Post \\"http://127.0.0.1:1055/api/v1/query\\": dial tcp 127.0.0.1:1055: connect: connection refused" query=count\(foo\) uri=http://127.0.0.1:1055'
stderr 'level=error msg="Failed to query Prometheus configuration" error="Get \\"http://127.0.0.1:1055/api/v1/status/config\\": dial tcp 127.0.0.1:1055: connect: connection refused" uri=http://127.0.0.1:1055'
stderr 'rules/1.yml:2: query using prom completed without any results for foo \(promql/series\)'

exec sh -c 'cat prometheus.pid | xargs kill'
stderr 'rules/1.yml:2: query using "prom" on http://127.0.0.1:7055 completed without any results for foo \(promql/series\)'
exec bash -c 'cat prometheus.pid | xargs kill'

-- rules/1.yml --
- record: aggregate
Expand Down
4 changes: 2 additions & 2 deletions cmd/pint/tests/0057_watch_metrics_prometheus_ignore.txt
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,8 @@ pint_check_iterations_total
pint_last_run_time_seconds
# HELP pint_problem Prometheus rule problem reported by pint
# TYPE pint_problem gauge
pint_problem{filename="rules/1.yml",kind="alerting",name="comparison",problem="query using prom1 failed with: bad_response: Unmarshal: there are bytes left after unmarshal, error found in #10 byte of ...|y\"\n }Fatal error|..., bigger context ...|:\"bad_data\",\n \"error\":\"bogus query\"\n }Fatal error|...",reporter="promql/series",severity="bug"}
pint_problem{filename="rules/1.yml",kind="recording",name="aggregate",problem="query using prom1 failed with: bad_response: Unmarshal: there are bytes left after unmarshal, error found in #10 byte of ...|y\"\n }Fatal error|..., bigger context ...|:\"bad_data\",\n \"error\":\"bogus query\"\n }Fatal error|...",reporter="promql/series",severity="bug"}
pint_problem{filename="rules/1.yml",kind="alerting",name="comparison",problem="query using \"prom1\" on http://127.0.0.1:7057 failed with: bad_response: Unmarshal: there are bytes left after unmarshal, error found in #10 byte of ...|y\"\n }Fatal error|..., bigger context ...|:\"bad_data\",\n \"error\":\"bogus query\"\n }Fatal error|...",reporter="promql/series",severity="bug"}
pint_problem{filename="rules/1.yml",kind="recording",name="aggregate",problem="query using \"prom1\" on http://127.0.0.1:7057 failed with: bad_response: Unmarshal: there are bytes left after unmarshal, error found in #10 byte of ...|y\"\n }Fatal error|..., bigger context ...|:\"bad_data\",\n \"error\":\"bogus query\"\n }Fatal error|...",reporter="promql/series",severity="bug"}
pint_problem{filename="rules/1.yml",kind="recording",name="broken",problem="syntax error: no arguments for aggregate expression provided",reporter="promql/syntax",severity="fatal"}
# HELP pint_problems Total number of problems reported by pint
# TYPE pint_problems gauge
Expand Down
1 change: 1 addition & 0 deletions docs/changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
### Changed

- Added `filename` label to `pint_problem` metric - #170.
- Include Prometheus server URI in reported problems.

### Fixed

Expand Down
2 changes: 1 addition & 1 deletion internal/checks/alerts_count.go
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ func (c AlertsCheck) Check(ctx context.Context, rule parser.Rule) (problems []Pr
Fragment: rule.AlertingRule.Expr.Value.Value,
Lines: lines,
Reporter: c.Reporter(),
Text: fmt.Sprintf("query using %s would trigger %d alert(s) in the last %s", c.prom.Name(), alerts, output.HumanizeDuration(delta)),
Text: fmt.Sprintf("query using %q on %s would trigger %d alert(s) in the last %s", c.prom.Name(), qr.URI, alerts, output.HumanizeDuration(delta)),
Severity: Information,
})
return
Expand Down
14 changes: 7 additions & 7 deletions internal/checks/alerts_count_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ func TestAlertsCheck(t *testing.T) {
Fragment: `up{job="foo"} == 0`,
Lines: []int{2},
Reporter: "alerts/count",
Text: "query using prom failed with: bad_data: unhandled path",
Text: fmt.Sprintf(`query using "prom" on %s/400/ failed with: bad_data: unhandled path`, srv.URL),
Severity: checks.Bug,
},
},
Expand All @@ -115,7 +115,7 @@ func TestAlertsCheck(t *testing.T) {
Fragment: `up{job="foo"} == 0`,
Lines: []int{2},
Reporter: "alerts/count",
Text: `cound't run "alerts/count" checks due to "prom" prometheus connection error: Post "http://127.0.0.1/api/v1/query_range": dial tcp 127.0.0.1:80: connect: connection refused`,
Text: `cound't run "alerts/count" checks due to "prom" on http://127.0.0.1 connection error: Post "http://127.0.0.1/api/v1/query_range": dial tcp 127.0.0.1:80: connect: connection refused`,
Severity: checks.Warning,
},
},
Expand All @@ -129,7 +129,7 @@ func TestAlertsCheck(t *testing.T) {
Fragment: `up{job="foo"} == 0`,
Lines: []int{2},
Reporter: "alerts/count",
Text: "query using prom would trigger 0 alert(s) in the last 1d",
Text: fmt.Sprintf(`query using "prom" on %s/empty/ would trigger 0 alert(s) in the last 1d`, srv.URL),
Severity: checks.Information,
},
},
Expand All @@ -143,7 +143,7 @@ func TestAlertsCheck(t *testing.T) {
Fragment: `up{job="foo"} == 0`,
Lines: []int{2},
Reporter: "alerts/count",
Text: "query using prom would trigger 7 alert(s) in the last 1d",
Text: fmt.Sprintf(`query using "prom" on %s/alerts/ would trigger 7 alert(s) in the last 1d`, srv.URL),
Severity: checks.Information,
},
},
Expand All @@ -157,7 +157,7 @@ func TestAlertsCheck(t *testing.T) {
Fragment: `up{job="foo"} == 0`,
Lines: []int{2, 3},
Reporter: "alerts/count",
Text: "query using prom would trigger 1 alert(s) in the last 1d",
Text: fmt.Sprintf(`query using "prom" on %s/alerts/ would trigger 1 alert(s) in the last 1d`, srv.URL),
Severity: checks.Information,
},
},
Expand All @@ -174,7 +174,7 @@ func TestAlertsCheck(t *testing.T) {
Fragment: `{__name__="up", job="foo"} == 0`,
Lines: []int{3},
Reporter: "alerts/count",
Text: "query using prom would trigger 3 alert(s) in the last 1d",
Text: fmt.Sprintf(`query using "prom" on %s/alerts/ would trigger 3 alert(s) in the last 1d`, srv.URL),
Severity: checks.Information,
},
},
Expand All @@ -191,7 +191,7 @@ func TestAlertsCheck(t *testing.T) {
Fragment: `{__name__=~"(up|foo)", job="foo"} == 0`,
Lines: []int{3},
Reporter: "alerts/count",
Text: "query using prom would trigger 3 alert(s) in the last 1d",
Text: fmt.Sprintf(`query using "prom" on %s/alerts/ would trigger 3 alert(s) in the last 1d`, srv.URL),
Severity: checks.Information,
},
},
Expand Down
13 changes: 10 additions & 3 deletions internal/checks/base.go
Original file line number Diff line number Diff line change
Expand Up @@ -108,9 +108,12 @@ type exprProblem struct {

func textAndSeverityFromError(err error, reporter, prom string, s Severity) (text string, severity Severity) {
if promapi.IsUnavailableError(err) {
text = fmt.Sprintf("cound't run %q checks due to %q prometheus connection error: %s", reporter, prom, err)
var perr *promapi.Error
text = fmt.Sprintf("cound't run %q checks due to %q connection error: %s", reporter, prom, err)
var perr *promapi.FailoverGroupError
if errors.As(err, &perr) {
if uri := perr.URI(); uri != "" {
text = fmt.Sprintf("cound't run %q checks due to %q on %s connection error: %s", reporter, prom, uri, err)
}
if perr.IsStrict() {
severity = Bug
} else {
Expand All @@ -120,7 +123,11 @@ func textAndSeverityFromError(err error, reporter, prom string, s Severity) (tex
severity = Warning
}
} else {
text = fmt.Sprintf("query using %s failed with: %s", prom, err)
text = fmt.Sprintf("query using %q failed with: %s", prom, err)
var perr *promapi.FailoverGroupError
if errors.As(err, &perr) {
text = fmt.Sprintf("query using %q on %s failed with: %s", prom, perr.URI(), err)
}
severity = s
}
return
Expand Down
31 changes: 12 additions & 19 deletions internal/checks/promql_rate.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ func (c RateCheck) Check(ctx context.Context, rule parser.Rule) (problems []Prob
return
}

scrapeInterval, err := c.getScrapeInterval(ctx)
cfg, err := c.prom.Config(ctx)
if err != nil {
text, severity := textAndSeverityFromError(err, c.Reporter(), c.prom.Name(), Bug)
problems = append(problems, Problem{
Expand All @@ -52,7 +52,7 @@ func (c RateCheck) Check(ctx context.Context, rule parser.Rule) (problems []Prob
return
}

for _, problem := range c.checkNode(expr.Query, scrapeInterval) {
for _, problem := range c.checkNode(expr.Query, cfg) {
problems = append(problems, Problem{
Fragment: problem.expr,
Lines: expr.Lines(),
Expand All @@ -65,16 +65,7 @@ func (c RateCheck) Check(ctx context.Context, rule parser.Rule) (problems []Prob
return
}

func (c RateCheck) getScrapeInterval(ctx context.Context) (interval time.Duration, err error) {
var cfg *promapi.PrometheusConfig
cfg, err = c.prom.Config(ctx)
if err != nil {
return
}
return cfg.Global.ScrapeInterval, nil
}

func (c RateCheck) checkNode(node *parser.PromQLNode, scrapeInterval time.Duration) (problems []exprProblem) {
func (c RateCheck) checkNode(node *parser.PromQLNode, cfg *promapi.ConfigResult) (problems []exprProblem) {
if n, ok := node.Node.(*promParser.Call); ok && (n.Func.Name == "rate" || n.Func.Name == "irate") {
var minIntervals int
var recIntervals int
Expand All @@ -88,17 +79,19 @@ func (c RateCheck) checkNode(node *parser.PromQLNode, scrapeInterval time.Durati
}
for _, arg := range n.Args {
if m, ok := arg.(*promParser.MatrixSelector); ok {
if m.Range < scrapeInterval*time.Duration(minIntervals) {
if m.Range < cfg.Config.Global.ScrapeInterval*time.Duration(minIntervals) {
p := exprProblem{
expr: node.Expr,
text: fmt.Sprintf("duration for %s() must be at least %d x scrape_interval, %s is using %s scrape_interval", n.Func.Name, minIntervals, c.prom.Name(), output.HumanizeDuration(scrapeInterval)),
expr: node.Expr,
text: fmt.Sprintf("duration for %s() must be at least %d x scrape_interval, %q on %s is using %s scrape_interval",
n.Func.Name, minIntervals, c.prom.Name(), cfg.URI, output.HumanizeDuration(cfg.Config.Global.ScrapeInterval)),
severity: Bug,
}
problems = append(problems, p)
} else if m.Range < scrapeInterval*time.Duration(recIntervals) {
} else if m.Range < cfg.Config.Global.ScrapeInterval*time.Duration(recIntervals) {
p := exprProblem{
expr: node.Expr,
text: fmt.Sprintf("duration for %s() is recommended to be at least %d x scrape_interval, %s is using %s scrape_interval", n.Func.Name, recIntervals, c.prom.Name(), output.HumanizeDuration(scrapeInterval)),
expr: node.Expr,
text: fmt.Sprintf("duration for %s() is recommended to be at least %d x scrape_interval, %q on %s is using %s scrape_interval",
n.Func.Name, recIntervals, c.prom.Name(), cfg.URI, output.HumanizeDuration(cfg.Config.Global.ScrapeInterval)),
severity: Warning,
}
problems = append(problems, p)
Expand All @@ -108,7 +101,7 @@ func (c RateCheck) checkNode(node *parser.PromQLNode, scrapeInterval time.Durati
}

for _, child := range node.Children {
problems = append(problems, c.checkNode(child, scrapeInterval)...)
problems = append(problems, c.checkNode(child, cfg)...)
}

return
Expand Down
Loading

0 comments on commit 5655d63

Please sign in to comment.