diff --git a/docs/checks/promql/series.md b/docs/checks/promql/series.md index 9d796a6c..99752fee 100644 --- a/docs/checks/promql/series.md +++ b/docs/checks/promql/series.md @@ -6,7 +6,7 @@ grand_parent: Documentation # promql/series -This check will also query Prometheus servers, it is used to warn about queries +This check will query Prometheus servers and is used to warn about queries that are using metrics not currently present in Prometheus. It parses `expr` query from every rule, finds individual metric selectors and runs a series of checks for each of them. @@ -26,6 +26,50 @@ to determine why, by checking if: If you see this check complaining about some metric it's might due to a number of different issues. Here are some usual cases. +### Your query is using ALERTS or ALERTS_FOR_STATE metrics + +Prometheus itself exposes metrics about active alerts, [see docs here](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/#inspecting-alerts-during-runtime). +It's possible to use those metrics in recording or alerting rules. +If pint finds a query using either the `ALERTS{alertname="..."}` or +`ALERTS_FOR_STATE{alertname="..."}` selector it will check if there's +an alerting rule with a matching name defined. For queries that don't pass any +`alertname` label filters it will skip any further checks. + +### Your query is using recording rules + +If a metric isn't present in Prometheus but pint finds a recording rule +with a matching name then it will try to use that recording rule instead. + +This should help with CI checks where multiple rules are added at once +and one depends on the other. + +Example with an alerting rule that depends on two recording rules: + +```yaml +# count the number of targets per job +- record: job:up:count + expr: count(up) by(job) + +# total number of targets that are healthy per job +- record: job:up:sum + expr: sum(up) by(job) + +# alert if <50% of targets are up +- alert: Too Many Targets Are Down + expr: (job:up:sum / job:up:count) < 0.5 +``` + +If all three rules were added in a single PR and pint didn't try to match +metrics to recording rules then `pint ci` would block such a PR because the metrics +this alert is using are not present in Prometheus. +By trying to match metrics to recording rules pint can use those rules +as a substitute for missing metrics and better validate such a PR. + +**NOTE**: Checking recording rules instead of real metrics present in Prometheus +can be less accurate and might not spot some issues, like missing labels. +For the most accurate validation via `pint ci` it's best to first add recording +rules before adding alerting rules that depend on them. + ### Your query cannot return anything - You are trying to use a metric that is not present in Prometheus at all.
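The new ALERTS / ALERTS_FOR_STATE section above describes the behaviour in prose only; a short example in the same style as the recording rules section may help. A minimal, illustrative sketch (the recording rule name and label values below are made up, not taken from the patch):

```yaml
# alerting rule, reusing the recording rules from the example above
- alert: Too Many Targets Are Down
  expr: (job:up:sum / job:up:count) < 0.5

# recording rule counting how many instances of that alert are currently firing;
# it queries the ALERTS metric that Prometheus itself exposes, and pint will
# match the alertname filter against the alerting rule above
- record: alerts:too_many_targets_are_down:firing
  expr: count(ALERTS{alertname="Too Many Targets Are Down", alertstate="firing"})
```

With both rules in place pint reads the `alertname` filter from the `ALERTS{...}` selector, finds the matching alerting rule and reports an information level message; without a matching alerting rule it reports a bug level problem instead.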
diff --git a/internal/checks/base_test.go b/internal/checks/base_test.go index 2845e396..6add9181 100644 --- a/internal/checks/base_test.go +++ b/internal/checks/base_test.go @@ -74,6 +74,7 @@ type checkTest struct { description string content string checker newCheckFn + entries []discovery.Entry problems problemsFn mocks []*prometheusMock } @@ -101,7 +102,7 @@ func runTests(t *testing.T, testCases []checkTest) { entries, err := parseContent(tc.content) require.NoError(t, err, "cannot parse rule content") for _, entry := range entries { - problems := tc.checker(uri).Check(ctx, entry.Rule, entries) + problems := tc.checker(uri).Check(ctx, entry.Rule, tc.entries) require.Equal(t, tc.problems(uri), problems) } @@ -124,7 +125,7 @@ func runTests(t *testing.T, testCases []checkTest) { require.NoError(t, err, "cannot parse rule content") t.Run(tc.description+" (bogus rules)", func(t *testing.T) { for _, entry := range entries { - _ = tc.checker("").Check(ctx, entry.Rule, entries) + _ = tc.checker("").Check(ctx, entry.Rule, tc.entries) } }) } @@ -148,6 +149,14 @@ func parseContent(content string) (entries []discovery.Entry, err error) { return entries, nil } +func mustParseContent(content string) (entries []discovery.Entry) { + entries, err := parseContent(content) + if err != nil { + panic(err) + } + return entries +} + func noProblems(uri string) []checks.Problem { return nil } diff --git a/internal/checks/promql_series.go b/internal/checks/promql_series.go index 78bd3814..1242fd7d 100644 --- a/internal/checks/promql_series.go +++ b/internal/checks/promql_series.go @@ -3,6 +3,7 @@ package checks import ( "context" "fmt" + "strings" "time" "github.com/rs/zerolog/log" @@ -73,6 +74,48 @@ func (c SeriesCheck) Check(ctx context.Context, rule parser.Rule, entries []disc } } + // 0. 
Special case for alert metrics + if metricName == "ALERTS" || metricName == "ALERTS_FOR_STATE" { + var alertname string + for _, lm := range selector.LabelMatchers { + if lm.Name == "alertname" && lm.Type != labels.MatchRegexp && lm.Type != labels.MatchNotRegexp { + alertname = lm.Value + } + } + var arEntry *discovery.Entry + if alertname != "" { + for _, entry := range entries { + entry := entry + if entry.Rule.AlertingRule != nil && + entry.Rule.Error.Err == nil && + entry.Rule.AlertingRule.Alert.Value.Value == alertname { + arEntry = &entry + break + } + } + if arEntry != nil { + log.Debug().Stringer("selector", &selector).Str("path", arEntry.Path).Msg("Metric is provided by alerting rule") + problems = append(problems, Problem{ + Fragment: selector.String(), + Lines: expr.Lines(), + Reporter: c.Reporter(), + Text: fmt.Sprintf("%s metric is generated by alerts and pint found an alerting rule named %q", selector.String(), alertname), + Severity: Information, + }) + } else { + problems = append(problems, Problem{ + Fragment: selector.String(), + Lines: expr.Lines(), + Reporter: c.Reporter(), + Text: fmt.Sprintf("%s metric is generated by alerts but pint didn't find any alerting rule named %q", selector.String(), alertname), + Severity: Bug, + }) + } + } + // ALERTS queries are only checked against alerting rules, skip any other checks + continue + } + + labelNames := []string{} + for _, lm := range selector.LabelMatchers { + if lm.Name != labels.MetricName { @@ -100,6 +143,45 @@ func (c SeriesCheck) Check(ctx context.Context, rule parser.Rule, entries []disc continue } if len(trs.ranges) == 0 { + // Check if we have a recording rule that provides this metric before we give up + var rrEntry *discovery.Entry + for _, entry := range entries { + entry := entry + if entry.Rule.RecordingRule != nil && + entry.Rule.Error.Err == nil && + entry.Rule.RecordingRule.Record.Value.Value == bareSelector.String() { + rrEntry = &entry + break + } + } + if rrEntry != nil { + // Validate recording rule instead + log.Debug().Stringer("selector", &bareSelector).Str("path", rrEntry.Path).Msg("Metric is provided by recording rule") + problems = append(problems, Problem{ + Fragment: bareSelector.String(), + Lines: expr.Lines(), + Reporter: c.Reporter(), + Text: fmt.Sprintf("%s didn't have any series for %q metric in the last %s but found a recording rule that generates it, "+ + "pint will try to use the source recording rule query to validate selectors in this query but it might be less accurate", + promText(c.prom.Name(), trs.uri), bareSelector.String(), trs.sinceDesc(trs.from)), + Severity: Information, + }) + + q := wrapLabelReplaces(rrEntry.Rule.RecordingRule.Expr.Value.Value, selector.LabelMatchers) + count, _, err := c.instantSeriesCount(ctx, q) + if err != nil { + problems = append(problems, c.queryProblem(err, selector.String(), expr)) + continue + } + if count > 0 { + log.Debug().Str("check", c.Reporter()).Stringer("selector", &selector).Msg("Found fallback series from recording rule, skipping further checks") + continue + } + + // if it's still not there, recurse the checks onto the recording rule itself + problems = append(problems, c.Check(ctx, rrEntry.Rule, entries)...)
+ } + problems = append(problems, Problem{ Fragment: bareSelector.String(), Lines: expr.Lines(), @@ -435,3 +517,15 @@ func (tr timeRanges) sinceDesc(t time.Time) (s string) { } return output.HumanizeDuration(dur.Round(time.Minute)) } + +func wrapLabelReplaces(q string, lms []*labels.Matcher) string { + out := "vector(1)" + onLabels := []string{} + for _, lm := range lms { + if lm.Name != labels.MetricName && lm.Type == labels.MatchEqual { + out = fmt.Sprintf(`label_replace(%s, "%s", "%s", "", "")`, out, lm.Name, lm.Value) + onLabels = append(onLabels, lm.Name) + } + } + return fmt.Sprintf(`count(%s AND on(%s) %s)`, q, strings.Join(onLabels, ","), out) +} diff --git a/internal/checks/promql_series_test.go b/internal/checks/promql_series_test.go index 42e424d1..ff549700 100644 --- a/internal/checks/promql_series_test.go +++ b/internal/checks/promql_series_test.go @@ -18,6 +18,10 @@ func noMetricText(name, uri, metric, since string) string { return fmt.Sprintf(`prometheus %q at %s didn't have any series for %q metric in the last %s`, name, uri, metric, since) } +func noMetricRRText(name, uri, metric, since string) string { + return fmt.Sprintf(`prometheus %q at %s didn't have any series for %q metric in the last %s but found a recording rule that generates it, pint will try to use the source recording rule query to validate selectors in this query but it might be less accurate`, name, uri, metric, since) +} + func noFilterMatchText(name, uri, metric, label, filter, since string) string { return fmt.Sprintf(`prometheus %q at %s has %q metric with %q label but there are no series matching %s in the last %s`, name, uri, metric, label, filter, since) } @@ -46,6 +50,14 @@ func seriesSometimesText(name, uri, metric, since, avg string) string { return fmt.Sprintf(`metric %q is only sometimes present on prometheus %q at %s with average life span of %s in the last %s`, metric, name, uri, avg, since) } +func alertPresent(metric, alertname string) string { + return fmt.Sprintf("%s metric is generated by alerts and pint found an alerting rule named %q", metric, alertname) +} + +func alertMissing(metric, alertname string) string { + return fmt.Sprintf("%s metric is generated by alerts but pint didn't find any alerting rule named %q", metric, alertname) +} + func TestSeriesCheck(t *testing.T) { testCases := []checkTest{ { @@ -289,6 +301,195 @@ func TestSeriesCheck(t *testing.T) { }, }, }, + { + description: "#2 series never present but recording rule provides it correctly", + content: "- record: foo\n expr: sum(foo:bar{job=\"xxx\"})\n", + checker: newSeriesCheck, + entries: mustParseContent("- record: foo:bar\n expr: sum(foo:bar)\n"), + problems: func(uri string) []checks.Problem { + return []checks.Problem{ + { + Fragment: "foo:bar", + Lines: []int{2}, + Reporter: checks.SeriesCheckName, + Text: noMetricRRText("prom", uri, "foo:bar", "1w"), + Severity: checks.Information, + }, + } + }, + mocks: []*prometheusMock{ + { + conds: []requestCondition{ + requireQueryPath, + formCond{key: "query", value: `count(foo:bar{job="xxx"})`}, + }, + resp: respondWithEmptyVector(), + }, + { + conds: []requestCondition{ + requireRangeQueryPath, + formCond{key: "query", value: `count(foo:bar)`}, + }, + resp: respondWithEmptyMatrix(), + }, + { + conds: []requestCondition{ + requireQueryPath, + formCond{key: "query", value: `count(sum(foo:bar) AND on(job) label_replace(vector(1), "job", "xxx", "", ""))`}, + }, + resp: respondWithSingleInstantVector(), + }, + }, + }, + { + description: "#2 series never present but recording rule provides it without
results", + content: "- record: foo\n expr: sum(foo:bar{job=\"xxx\"})\n", + checker: newSeriesCheck, + entries: mustParseContent("- record: foo:bar\n expr: sum(foo)\n"), + problems: func(uri string) []checks.Problem { + return []checks.Problem{ + { + Fragment: "foo:bar", + Lines: []int{2}, + Reporter: checks.SeriesCheckName, + Text: noMetricRRText("prom", uri, "foo:bar", "1w"), + Severity: checks.Information, + }, + { + Fragment: "foo", + Lines: []int{2}, + Reporter: checks.SeriesCheckName, + Text: noMetricText("prom", uri, "foo", "1w"), + Severity: checks.Bug, + }, + { + Fragment: "foo:bar", + Lines: []int{2}, + Reporter: checks.SeriesCheckName, + Text: noMetricText("prom", uri, "foo:bar", "1w"), + Severity: checks.Bug, + }, + } + }, + mocks: []*prometheusMock{ + { + conds: []requestCondition{ + requireQueryPath, + formCond{key: "query", value: `count(foo:bar{job="xxx"})`}, + }, + resp: respondWithEmptyVector(), + }, + { + conds: []requestCondition{ + requireRangeQueryPath, + formCond{key: "query", value: `count(foo:bar)`}, + }, + resp: respondWithEmptyMatrix(), + }, + { + conds: []requestCondition{ + requireQueryPath, + formCond{key: "query", value: `count(sum(foo) AND on(job) label_replace(vector(1), "job", "xxx", "", ""))`}, + }, + resp: respondWithEmptyVector(), + }, + { + conds: []requestCondition{ + requireQueryPath, + formCond{key: "query", value: `count(foo)`}, + }, + resp: respondWithEmptyVector(), + }, + { + conds: []requestCondition{ + requireRangeQueryPath, + formCond{key: "query", value: `count(foo)`}, + }, + resp: respondWithEmptyMatrix(), + }, + }, + }, + { + description: "#2 {ALERTS=...} present", + content: "- record: foo\n expr: count(ALERTS{alertname=\"myalert\"})\n", + checker: newSeriesCheck, + entries: mustParseContent("- alert: myalert\n expr: sum(foo) == 0\n"), + problems: func(uri string) []checks.Problem { + return []checks.Problem{ + { + Fragment: `ALERTS{alertname="myalert"}`, + Lines: []int{2}, + Reporter: checks.SeriesCheckName, + Text: alertPresent(`ALERTS{alertname="myalert"}`, "myalert"), + Severity: checks.Information, + }, + } + }, + }, + { + description: "#2 {ALERTS=...} missing", + content: "- record: foo\n expr: count(ALERTS{alertname=\"myalert\"})\n", + checker: newSeriesCheck, + entries: mustParseContent("- alert: notmyalert\n expr: sum(foo) == 0\n"), + problems: func(uri string) []checks.Problem { + return []checks.Problem{ + { + Fragment: `ALERTS{alertname="myalert"}`, + Lines: []int{2}, + Reporter: checks.SeriesCheckName, + Text: alertMissing(`ALERTS{alertname="myalert"}`, "myalert"), + Severity: checks.Bug, + }, + } + }, + }, + { + description: "#2 series never present but recording rule provides it, query error", + content: "- record: foo\n expr: sum(foo:bar{job=\"xxx\"})\n", + checker: newSeriesCheck, + entries: mustParseContent("- record: foo:bar\n expr: sum(foo:bar)\n"), + problems: func(uri string) []checks.Problem { + return []checks.Problem{ + { + Fragment: "foo:bar", + Lines: []int{2}, + Reporter: checks.SeriesCheckName, + Text: noMetricRRText("prom", uri, "foo:bar", "1w"), + Severity: checks.Information, + }, + { + Fragment: `foo:bar{job="xxx"}`, + Lines: []int{2}, + Reporter: checks.SeriesCheckName, + Text: checkErrorBadData("prom", uri, "bad_data: bad input data"), + Severity: checks.Bug, + }, + } + }, + mocks: []*prometheusMock{ + { + conds: []requestCondition{ + requireQueryPath, + formCond{key: "query", value: `count(foo:bar{job="xxx"})`}, + }, + resp: respondWithEmptyVector(), + }, + { + conds: []requestCondition{ + 
requireRangeQueryPath, + formCond{key: "query", value: `count(foo:bar)`}, + }, + resp: respondWithEmptyMatrix(), + }, + { + conds: []requestCondition{ + requireQueryPath, + formCond{key: "query", value: `count(sum(foo:bar) AND on(job) label_replace(vector(1), "job", "xxx", "", ""))`}, + }, + resp: respondWithBadData(), + }, + }, + }, { description: "#2 query error", content: "- record: foo\n expr: found > 0\n", @@ -1161,45 +1362,6 @@ func TestSeriesCheck(t *testing.T) { }, }, }, - { - description: "ALERTS{notfound=...}", - content: "- alert: foo\n expr: count(ALERTS{notfound=\"foo\"}) >= 10\n", - checker: newSeriesCheck, - problems: func(uri string) []checks.Problem { - return []checks.Problem{ - { - Fragment: `ALERTS{notfound="foo"}`, - Lines: []int{2}, - Reporter: checks.SeriesCheckName, - Text: noLabelKeyText("prom", uri, "ALERTS", "notfound", "1w"), - Severity: checks.Bug, - }, - } - }, - mocks: []*prometheusMock{ - { - conds: []requestCondition{ - requireQueryPath, - formCond{key: "query", value: `count(ALERTS{notfound="foo"})`}, - }, - resp: respondWithEmptyVector(), - }, - { - conds: []requestCondition{ - requireRangeQueryPath, - formCond{key: "query", value: `count(ALERTS)`}, - }, - resp: respondWithSingleRangeVector1W(), - }, - { - conds: []requestCondition{ - requireRangeQueryPath, - formCond{key: "query", value: `count(ALERTS) by (notfound)`}, - }, - resp: respondWithSingleRangeVector1W(), - }, - }, - }, } runTests(t, testCases) }
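As an illustration of the new `entries` field on `checkTest` (not part of the patch; the rule names come from the test cases above): `content` holds the rule being checked, while `entries` supplies the extra pre-parsed rules that `Check` can fall back to. Written out as plain rule files, the first recording rule test case corresponds roughly to:

```yaml
# rule under test (checkTest.content); foo:bar has no series
# in the mocked Prometheus
- record: foo
  expr: sum(foo:bar{job="xxx"})

# extra entry passed via checkTest.entries; the recording rule that
# provides foo:bar, so the check validates its query instead
- record: foo:bar
  expr: sum(foo:bar)
```

The mocked `count(sum(foo:bar) AND on(job) label_replace(vector(1), "job", "xxx", "", ""))` query is what `wrapLabelReplaces` builds from the recording rule expression and the `job="xxx"` matcher, to confirm the fallback series would match the original selector.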