Skip to content

Commit

Permalink
Detect rate(sum(counter)) chains
Browse files Browse the repository at this point in the history
  • Loading branch information
prymitive committed Mar 1, 2023
1 parent 34c202c commit e60dbd6
Show file tree
Hide file tree
Showing 6 changed files with 350 additions and 3 deletions.
11 changes: 11 additions & 0 deletions docs/changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,17 @@
### Added

- Added `--fail-on` flag to `pint ci` command - #525.
- [promql/rate](checks/promql/rate.md) will now look for queries that call
`rate()` on results of `sum(counter)` via recording rules.
Example:

```yaml
- record: my:sum
expr: sum(http_requests_total)

- alert: my alert
expr: rate(my:sum[5m])
```
### Changed
Expand Down
6 changes: 6 additions & 0 deletions docs/checks/promql/rate.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,12 @@ to verify that:
For gauge metrics use [`delta()`](https://prometheus.io/docs/prometheus/latest/querying/functions/#delta)
or [`deriv()`](https://prometheus.io/docs/prometheus/latest/querying/functions/#deriv)
functions instead.
- `rate()` is never called on result of `sum(counter)` since that will always return
invalid results.
Chaining `rate(sum(...))` is only possible when passing a metric produced via recording rules
to `rate()` and so pint will try to find such chains.
See [this blog post](https://www.robustperception.io/rate-then-sum-never-sum-then-rate/)
for details.

## Common problems

Expand Down
42 changes: 39 additions & 3 deletions internal/checks/promql_rate.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (
"github.com/cloudflare/pint/internal/discovery"
"github.com/cloudflare/pint/internal/output"
"github.com/cloudflare/pint/internal/parser"
"github.com/cloudflare/pint/internal/parser/utils"
"github.com/cloudflare/pint/internal/promapi"

v1 "github.com/prometheus/client_golang/api/prometheus/v1"
Expand Down Expand Up @@ -62,7 +63,7 @@ func (c RateCheck) Check(ctx context.Context, path string, rule parser.Rule, ent
}

done := &completedList{}
for _, problem := range c.checkNode(ctx, expr.Query, cfg, done) {
for _, problem := range c.checkNode(ctx, expr.Query, entries, cfg, done) {
problems = append(problems, Problem{
Fragment: problem.expr,
Lines: expr.Lines(),
Expand All @@ -75,7 +76,7 @@ func (c RateCheck) Check(ctx context.Context, path string, rule parser.Rule, ent
return problems
}

func (c RateCheck) checkNode(ctx context.Context, node *parser.PromQLNode, cfg *promapi.ConfigResult, done *completedList) (problems []exprProblem) {
func (c RateCheck) checkNode(ctx context.Context, node *parser.PromQLNode, entries []discovery.Entry, cfg *promapi.ConfigResult, done *completedList) (problems []exprProblem) {
if n, ok := node.Node.(*promParser.Call); ok && (n.Func.Name == "rate" || n.Func.Name == "irate" || n.Func.Name == "deriv") {
for _, arg := range n.Args {
m, ok := arg.(*promParser.MatrixSelector)
Expand Down Expand Up @@ -119,12 +120,47 @@ func (c RateCheck) checkNode(ctx context.Context, node *parser.PromQLNode, cfg *
})
}
}

for _, e := range entries {
if e.PathError != nil {
continue
}
if e.Rule.Error.Err != nil {
continue
}
if e.Rule.RecordingRule != nil && e.Rule.RecordingRule.Expr.SyntaxError == nil && e.Rule.RecordingRule.Record.Value.Value == s.Name {
for _, sm := range utils.HasOuterSum(e.Rule.RecordingRule.Expr.Query) {
if sv, ok := sm.Expr.(*promParser.VectorSelector); ok {
metadata, err := c.prom.Metadata(ctx, sv.Name)
if err != nil {
text, severity := textAndSeverityFromError(err, c.Reporter(), c.prom.Name(), Bug)
problems = append(problems, exprProblem{
expr: sv.Name,
text: text,
severity: severity,
})
continue
}
for _, m := range metadata.Metadata {
if m.Type == v1.MetricTypeCounter {
problems = append(problems, exprProblem{
expr: node.Expr,
text: fmt.Sprintf("rate(sum(counter)) chain detected, %s is called here on results of %s, calling rate on sum() results will return bogus results, always sum(rate(counter)), never rate(sum(counter))",
node.Expr, sm),
severity: Bug,
})
}
}
}
}
}
}
}
}
}

for _, child := range node.Children {
problems = append(problems, c.checkNode(ctx, child, cfg, done)...)
problems = append(problems, c.checkNode(ctx, child, entries, cfg, done)...)
}

return problems
Expand Down
126 changes: 126 additions & 0 deletions internal/checks/promql_rate_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,10 @@ func notCounterText(name, uri, fun, metric, kind string) string {
return fmt.Sprintf(`%s() should only be used with counters but %q is a %s according to metrics metadata from prometheus %q at %s`, fun, metric, kind, name, uri)
}

func rateSumText(rateName, sumExpr string) string {
return fmt.Sprintf("rate(sum(counter)) chain detected, rate(%s) is called here on results of %s, calling rate on sum() results will return bogus results, always sum(rate(counter)), never rate(sum(counter))", rateName, sumExpr)
}

func TestRateCheck(t *testing.T) {
testCases := []checkTest{
{
Expand Down Expand Up @@ -619,6 +623,128 @@ func TestRateCheck(t *testing.T) {
},
},
},
{
description: "rate_over_sum",
content: "- alert: my alert\n expr: rate(my:sum[5m])\n",
entries: mustParseContent("- record: my:sum\n expr: sum(foo)\n"),
checker: newRateCheck,
prometheus: newSimpleProm,
problems: func(uri string) []checks.Problem {
return []checks.Problem{
{
Fragment: "rate(my:sum[5m])",
Lines: []int{2},
Reporter: "promql/rate",
Text: rateSumText("my:sum[5m]", "sum(foo)"),
Severity: checks.Bug,
},
}
},
mocks: []*prometheusMock{
{
conds: []requestCondition{requireConfigPath},
resp: configResponse{yaml: "global:\n scrape_interval: 1m\n"},
},
{
conds: []requestCondition{
requireMetadataPath,
formCond{"metric", "foo"},
},
resp: metadataResponse{metadata: map[string][]v1.Metadata{
"foo": {{Type: "counter"}},
}},
},
{
conds: []requestCondition{
requireMetadataPath,
formCond{"metric", "my:sum"},
},
resp: metadataResponse{metadata: map[string][]v1.Metadata{}},
},
},
},
{
description: "rate_over_sum_error",
content: "- alert: my alert\n expr: rate(my:sum[5m])\n",
entries: mustParseContent("- record: my:sum\n expr: sum(foo)\n"),
checker: newRateCheck,
prometheus: newSimpleProm,
problems: func(uri string) []checks.Problem {
return []checks.Problem{
{
Fragment: "foo",
Lines: []int{2},
Reporter: "promql/rate",
Text: checkErrorUnableToRun(checks.RateCheckName, "prom", uri, "server_error: internal error"),
Severity: checks.Bug,
},
}
},
mocks: []*prometheusMock{
{
conds: []requestCondition{requireConfigPath},
resp: configResponse{yaml: "global:\n scrape_interval: 1m\n"},
},
{
conds: []requestCondition{
requireMetadataPath,
formCond{"metric", "foo"},
},
resp: respondWithInternalError(),
},
{
conds: []requestCondition{
requireMetadataPath,
formCond{"metric", "my:sum"},
},
resp: metadataResponse{metadata: map[string][]v1.Metadata{}},
},
},
},
{
description: "rate_over_sum_on_gauge",
content: "- alert: my alert\n expr: rate(my:sum[5m])\n",
entries: mustParseContent("- record: my:sum\n expr: sum(foo)\n"),
checker: newRateCheck,
prometheus: newSimpleProm,
problems: noProblems,
mocks: []*prometheusMock{
{
conds: []requestCondition{requireConfigPath},
resp: configResponse{yaml: "global:\n scrape_interval: 1m\n"},
},
{
conds: []requestCondition{
requireMetadataPath,
formCond{"metric", "foo"},
},
resp: metadataResponse{metadata: map[string][]v1.Metadata{
"foo": {{Type: "gauge"}},
}},
},
{
conds: []requestCondition{
requireMetadataPath,
formCond{"metric", "my:sum"},
},
resp: metadataResponse{metadata: map[string][]v1.Metadata{}},
},
},
},
{
description: "sum_over_rate",
content: "- alert: my alert\n expr: sum(my:rate:5m)\n",
entries: mustParseContent("- record: my:rate:5m\n expr: rate(foo[5m])\n"),
checker: newRateCheck,
prometheus: newSimpleProm,
problems: noProblems,
mocks: []*prometheusMock{
{
conds: []requestCondition{requireConfigPath},
resp: configResponse{yaml: "global:\n scrape_interval: 1m\n"},
},
},
},
}
runTests(t, testCases)
}
71 changes: 71 additions & 0 deletions internal/parser/utils/sum.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
package utils

import (
promParser "github.com/prometheus/prometheus/promql/parser"
"github.com/rs/zerolog/log"

"github.com/cloudflare/pint/internal/parser"
)

func HasOuterSum(node *parser.PromQLNode) (calls []*promParser.AggregateExpr) {
if n, ok := node.Node.(*promParser.AggregateExpr); ok {
if n.Op == promParser.SUM {
calls = append(calls, n)
return calls
}
}

if n, ok := node.Node.(*promParser.AggregateExpr); ok {
switch n.Op {
case promParser.COUNT:
return nil
case promParser.COUNT_VALUES:
return nil
}
}

if n, ok := node.Node.(*promParser.BinaryExpr); ok {
if n.VectorMatching != nil {
switch n.VectorMatching.Card {
case promParser.CardOneToOne:
case promParser.CardOneToMany:
for i, child := range node.Children {
if i == len(node.Children)-1 {
return HasOuterSum(child)
}
}
case promParser.CardManyToOne:
return HasOuterSum(node.Children[0])
case promParser.CardManyToMany:
default:
log.Warn().Str("matching", n.VectorMatching.Card.String()).Msg("Unsupported VectorMatching operation")
}
}

if n.Op.IsComparisonOperator() {
for i, child := range node.Children {
if i == 0 {
return HasOuterSum(child)
}
}
} else {
switch n.Op {
case promParser.LOR:
for _, child := range node.Children {
calls = append(calls, HasOuterSum(child)...)
}
return calls
case promParser.DIV, promParser.LUNLESS, promParser.LAND:
for _, child := range node.Children {
return HasOuterSum(child)
}
}
}
}

for _, child := range node.Children {
calls = append(calls, HasOuterSum(child)...)
}

return calls
}
Loading

0 comments on commit e60dbd6

Please sign in to comment.