From e60dbd633bee896addd687b29b1ae36bce3cc3b7 Mon Sep 17 00:00:00 2001 From: Lukasz Mierzwa Date: Wed, 1 Mar 2023 13:18:02 +0000 Subject: [PATCH] Detect rate(sum(counter)) chains --- docs/changelog.md | 11 +++ docs/checks/promql/rate.md | 6 ++ internal/checks/promql_rate.go | 42 +++++++++- internal/checks/promql_rate_test.go | 126 ++++++++++++++++++++++++++++ internal/parser/utils/sum.go | 71 ++++++++++++++++ internal/parser/utils/sum_test.go | 97 +++++++++++++++++++++ 6 files changed, 350 insertions(+), 3 deletions(-) create mode 100644 internal/parser/utils/sum.go create mode 100644 internal/parser/utils/sum_test.go diff --git a/docs/changelog.md b/docs/changelog.md index 94563400..26f84195 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -5,6 +5,17 @@ ### Added - Added `--fail-on` flag to `pint ci` command - #525. +- [promql/rate](checks/promql/rate.md) will now look for queries that call + `rate()` on results of `sum(counter)` via recording rules. + Example: + + ```yaml + - record: my:sum + expr: sum(http_requests_total) + + - alert: my alert + expr: rate(my:sum[5m]) + ``` ### Changed diff --git a/docs/checks/promql/rate.md b/docs/checks/promql/rate.md index b60a9d14..2a09245e 100644 --- a/docs/checks/promql/rate.md +++ b/docs/checks/promql/rate.md @@ -23,6 +23,12 @@ to verify that: For gauge metrics use [`delta()`](https://prometheus.io/docs/prometheus/latest/querying/functions/#delta) or [`deriv()`](https://prometheus.io/docs/prometheus/latest/querying/functions/#deriv) functions instead. +- `rate()` is never called on result of `sum(counter)` since that will always return + invalid results. + Chaining `rate(sum(...))` is only possible when passing a metric produced via recording rules + to `rate()` and so pint will try to find such chains. + See [this blog post](https://www.robustperception.io/rate-then-sum-never-sum-then-rate/) + for details. ## Common problems diff --git a/internal/checks/promql_rate.go b/internal/checks/promql_rate.go index e0bf2872..27eb81eb 100644 --- a/internal/checks/promql_rate.go +++ b/internal/checks/promql_rate.go @@ -10,6 +10,7 @@ import ( "github.com/cloudflare/pint/internal/discovery" "github.com/cloudflare/pint/internal/output" "github.com/cloudflare/pint/internal/parser" + "github.com/cloudflare/pint/internal/parser/utils" "github.com/cloudflare/pint/internal/promapi" v1 "github.com/prometheus/client_golang/api/prometheus/v1" @@ -62,7 +63,7 @@ func (c RateCheck) Check(ctx context.Context, path string, rule parser.Rule, ent } done := &completedList{} - for _, problem := range c.checkNode(ctx, expr.Query, cfg, done) { + for _, problem := range c.checkNode(ctx, expr.Query, entries, cfg, done) { problems = append(problems, Problem{ Fragment: problem.expr, Lines: expr.Lines(), @@ -75,7 +76,7 @@ func (c RateCheck) Check(ctx context.Context, path string, rule parser.Rule, ent return problems } -func (c RateCheck) checkNode(ctx context.Context, node *parser.PromQLNode, cfg *promapi.ConfigResult, done *completedList) (problems []exprProblem) { +func (c RateCheck) checkNode(ctx context.Context, node *parser.PromQLNode, entries []discovery.Entry, cfg *promapi.ConfigResult, done *completedList) (problems []exprProblem) { if n, ok := node.Node.(*promParser.Call); ok && (n.Func.Name == "rate" || n.Func.Name == "irate" || n.Func.Name == "deriv") { for _, arg := range n.Args { m, ok := arg.(*promParser.MatrixSelector) @@ -119,12 +120,47 @@ func (c RateCheck) checkNode(ctx context.Context, node *parser.PromQLNode, cfg * }) } } + + for _, e := range entries { + if e.PathError != nil { + continue + } + if e.Rule.Error.Err != nil { + continue + } + if e.Rule.RecordingRule != nil && e.Rule.RecordingRule.Expr.SyntaxError == nil && e.Rule.RecordingRule.Record.Value.Value == s.Name { + for _, sm := range utils.HasOuterSum(e.Rule.RecordingRule.Expr.Query) { + if sv, ok := sm.Expr.(*promParser.VectorSelector); ok { + metadata, err := c.prom.Metadata(ctx, sv.Name) + if err != nil { + text, severity := textAndSeverityFromError(err, c.Reporter(), c.prom.Name(), Bug) + problems = append(problems, exprProblem{ + expr: sv.Name, + text: text, + severity: severity, + }) + continue + } + for _, m := range metadata.Metadata { + if m.Type == v1.MetricTypeCounter { + problems = append(problems, exprProblem{ + expr: node.Expr, + text: fmt.Sprintf("rate(sum(counter)) chain detected, %s is called here on results of %s, calling rate on sum() results will return bogus results, always sum(rate(counter)), never rate(sum(counter))", + node.Expr, sm), + severity: Bug, + }) + } + } + } + } + } + } } } } for _, child := range node.Children { - problems = append(problems, c.checkNode(ctx, child, cfg, done)...) + problems = append(problems, c.checkNode(ctx, child, entries, cfg, done)...) } return problems diff --git a/internal/checks/promql_rate_test.go b/internal/checks/promql_rate_test.go index 33330351..2e1c9856 100644 --- a/internal/checks/promql_rate_test.go +++ b/internal/checks/promql_rate_test.go @@ -23,6 +23,10 @@ func notCounterText(name, uri, fun, metric, kind string) string { return fmt.Sprintf(`%s() should only be used with counters but %q is a %s according to metrics metadata from prometheus %q at %s`, fun, metric, kind, name, uri) } +func rateSumText(rateName, sumExpr string) string { + return fmt.Sprintf("rate(sum(counter)) chain detected, rate(%s) is called here on results of %s, calling rate on sum() results will return bogus results, always sum(rate(counter)), never rate(sum(counter))", rateName, sumExpr) +} + func TestRateCheck(t *testing.T) { testCases := []checkTest{ { @@ -619,6 +623,128 @@ func TestRateCheck(t *testing.T) { }, }, }, + { + description: "rate_over_sum", + content: "- alert: my alert\n expr: rate(my:sum[5m])\n", + entries: mustParseContent("- record: my:sum\n expr: sum(foo)\n"), + checker: newRateCheck, + prometheus: newSimpleProm, + problems: func(uri string) []checks.Problem { + return []checks.Problem{ + { + Fragment: "rate(my:sum[5m])", + Lines: []int{2}, + Reporter: "promql/rate", + Text: rateSumText("my:sum[5m]", "sum(foo)"), + Severity: checks.Bug, + }, + } + }, + mocks: []*prometheusMock{ + { + conds: []requestCondition{requireConfigPath}, + resp: configResponse{yaml: "global:\n scrape_interval: 1m\n"}, + }, + { + conds: []requestCondition{ + requireMetadataPath, + formCond{"metric", "foo"}, + }, + resp: metadataResponse{metadata: map[string][]v1.Metadata{ + "foo": {{Type: "counter"}}, + }}, + }, + { + conds: []requestCondition{ + requireMetadataPath, + formCond{"metric", "my:sum"}, + }, + resp: metadataResponse{metadata: map[string][]v1.Metadata{}}, + }, + }, + }, + { + description: "rate_over_sum_error", + content: "- alert: my alert\n expr: rate(my:sum[5m])\n", + entries: mustParseContent("- record: my:sum\n expr: sum(foo)\n"), + checker: newRateCheck, + prometheus: newSimpleProm, + problems: func(uri string) []checks.Problem { + return []checks.Problem{ + { + Fragment: "foo", + Lines: []int{2}, + Reporter: "promql/rate", + Text: checkErrorUnableToRun(checks.RateCheckName, "prom", uri, "server_error: internal error"), + Severity: checks.Bug, + }, + } + }, + mocks: []*prometheusMock{ + { + conds: []requestCondition{requireConfigPath}, + resp: configResponse{yaml: "global:\n scrape_interval: 1m\n"}, + }, + { + conds: []requestCondition{ + requireMetadataPath, + formCond{"metric", "foo"}, + }, + resp: respondWithInternalError(), + }, + { + conds: []requestCondition{ + requireMetadataPath, + formCond{"metric", "my:sum"}, + }, + resp: metadataResponse{metadata: map[string][]v1.Metadata{}}, + }, + }, + }, + { + description: "rate_over_sum_on_gauge", + content: "- alert: my alert\n expr: rate(my:sum[5m])\n", + entries: mustParseContent("- record: my:sum\n expr: sum(foo)\n"), + checker: newRateCheck, + prometheus: newSimpleProm, + problems: noProblems, + mocks: []*prometheusMock{ + { + conds: []requestCondition{requireConfigPath}, + resp: configResponse{yaml: "global:\n scrape_interval: 1m\n"}, + }, + { + conds: []requestCondition{ + requireMetadataPath, + formCond{"metric", "foo"}, + }, + resp: metadataResponse{metadata: map[string][]v1.Metadata{ + "foo": {{Type: "gauge"}}, + }}, + }, + { + conds: []requestCondition{ + requireMetadataPath, + formCond{"metric", "my:sum"}, + }, + resp: metadataResponse{metadata: map[string][]v1.Metadata{}}, + }, + }, + }, + { + description: "sum_over_rate", + content: "- alert: my alert\n expr: sum(my:rate:5m)\n", + entries: mustParseContent("- record: my:rate:5m\n expr: rate(foo[5m])\n"), + checker: newRateCheck, + prometheus: newSimpleProm, + problems: noProblems, + mocks: []*prometheusMock{ + { + conds: []requestCondition{requireConfigPath}, + resp: configResponse{yaml: "global:\n scrape_interval: 1m\n"}, + }, + }, + }, } runTests(t, testCases) } diff --git a/internal/parser/utils/sum.go b/internal/parser/utils/sum.go new file mode 100644 index 00000000..91aa23b1 --- /dev/null +++ b/internal/parser/utils/sum.go @@ -0,0 +1,71 @@ +package utils + +import ( + promParser "github.com/prometheus/prometheus/promql/parser" + "github.com/rs/zerolog/log" + + "github.com/cloudflare/pint/internal/parser" +) + +func HasOuterSum(node *parser.PromQLNode) (calls []*promParser.AggregateExpr) { + if n, ok := node.Node.(*promParser.AggregateExpr); ok { + if n.Op == promParser.SUM { + calls = append(calls, n) + return calls + } + } + + if n, ok := node.Node.(*promParser.AggregateExpr); ok { + switch n.Op { + case promParser.COUNT: + return nil + case promParser.COUNT_VALUES: + return nil + } + } + + if n, ok := node.Node.(*promParser.BinaryExpr); ok { + if n.VectorMatching != nil { + switch n.VectorMatching.Card { + case promParser.CardOneToOne: + case promParser.CardOneToMany: + for i, child := range node.Children { + if i == len(node.Children)-1 { + return HasOuterSum(child) + } + } + case promParser.CardManyToOne: + return HasOuterSum(node.Children[0]) + case promParser.CardManyToMany: + default: + log.Warn().Str("matching", n.VectorMatching.Card.String()).Msg("Unsupported VectorMatching operation") + } + } + + if n.Op.IsComparisonOperator() { + for i, child := range node.Children { + if i == 0 { + return HasOuterSum(child) + } + } + } else { + switch n.Op { + case promParser.LOR: + for _, child := range node.Children { + calls = append(calls, HasOuterSum(child)...) + } + return calls + case promParser.DIV, promParser.LUNLESS, promParser.LAND: + for _, child := range node.Children { + return HasOuterSum(child) + } + } + } + } + + for _, child := range node.Children { + calls = append(calls, HasOuterSum(child)...) + } + + return calls +} diff --git a/internal/parser/utils/sum_test.go b/internal/parser/utils/sum_test.go new file mode 100644 index 00000000..5a817159 --- /dev/null +++ b/internal/parser/utils/sum_test.go @@ -0,0 +1,97 @@ +package utils_test + +import ( + "testing" + + "github.com/stretchr/testify/require" + + "github.com/cloudflare/pint/internal/parser" + "github.com/cloudflare/pint/internal/parser/utils" +) + +func TestHasOuterSum(t *testing.T) { + type testCaseT struct { + expr string + output []string + } + + testCases := []testCaseT{ + { + expr: "foo", + }, + { + expr: "sum(foo)", + output: []string{"sum(foo)"}, + }, + { + expr: "sum(foo) > 0", + output: []string{"sum(foo)"}, + }, + { + expr: "sum(foo) / sum(rate(bar[2m]))", + output: []string{"sum(foo)"}, + }, + { + expr: "sum(sum(foo)) > 0", + output: []string{"sum(sum(foo))"}, + }, + { + expr: "count(sum(foo)) > 0", + }, + { + expr: "count_values(\"foo\", sum(foo)) > 0", + }, + { + expr: "floor(sum(foo)) > 0", + output: []string{"sum(foo)"}, + }, + { + expr: "ceil(sum(foo)) > 0", + output: []string{"sum(foo)"}, + }, + { + expr: "sum(foo) or sum(bar)", + output: []string{"sum(foo)", "sum(bar)"}, + }, + { + expr: "sum(foo) * on() sum(bar)", + output: []string{"sum(foo)", "sum(bar)"}, + }, + { + expr: "sum(foo) without() * on() group_left(instance) sum(deriv(foo[2m]))", + output: []string{"sum without () (foo)"}, + }, + { + expr: "sum(foo) without(job) * on() group_right(instance) sum(deriv(foo[2m]))", + output: []string{"sum(deriv(foo[2m]))"}, + }, + { + expr: "2 > foo", + }, + { + expr: "2 > sum(foo)", + }, + } + + for _, tc := range testCases { + t.Run(tc.expr, func(t *testing.T) { + n, err := parser.DecodeExpr(tc.expr) + if err != nil { + t.Error(err) + t.FailNow() + } + calls := utils.HasOuterSum(n) + if len(calls) == 0 { + if len(tc.output) > 0 { + t.Errorf("HasOuterSum() returned nil, expected %s", tc.output) + } + } else { + output := []string{} + for _, a := range calls { + output = append(output, a.String()) + } + require.Equal(t, tc.output, output, "HasOuterSum() returned wrong output") + } + }) + } +}