From e60dbd633bee896addd687b29b1ae36bce3cc3b7 Mon Sep 17 00:00:00 2001
From: Lukasz Mierzwa <lukasz@cloudflare.com>
Date: Wed, 1 Mar 2023 13:18:02 +0000
Subject: [PATCH] Detect rate(sum(counter)) chains

---
 docs/changelog.md                   |  11 +++
 docs/checks/promql/rate.md          |   6 ++
 internal/checks/promql_rate.go      |  42 +++++++++-
 internal/checks/promql_rate_test.go | 126 ++++++++++++++++++++++++++++
 internal/parser/utils/sum.go        |  71 ++++++++++++++++
 internal/parser/utils/sum_test.go   |  97 +++++++++++++++++++++
 6 files changed, 350 insertions(+), 3 deletions(-)
 create mode 100644 internal/parser/utils/sum.go
 create mode 100644 internal/parser/utils/sum_test.go

diff --git a/docs/changelog.md b/docs/changelog.md
index 94563400..26f84195 100644
--- a/docs/changelog.md
+++ b/docs/changelog.md
@@ -5,6 +5,17 @@
 ### Added
 
 - Added `--fail-on` flag to `pint ci` command - #525.
+- [promql/rate](checks/promql/rate.md) will now look for queries that call
+  `rate()` on results of `sum(counter)` via recording rules.
+  Example:
+
+  ```yaml
+  - record: my:sum
+    expr: sum(http_requests_total)
+  
+  - alert: my alert
+    expr: rate(my:sum[5m])
+  ```
 
 ### Changed
 
diff --git a/docs/checks/promql/rate.md b/docs/checks/promql/rate.md
index b60a9d14..2a09245e 100644
--- a/docs/checks/promql/rate.md
+++ b/docs/checks/promql/rate.md
@@ -23,6 +23,12 @@ to verify that:
   For gauge metrics use [`delta()`](https://prometheus.io/docs/prometheus/latest/querying/functions/#delta)
   or [`deriv()`](https://prometheus.io/docs/prometheus/latest/querying/functions/#deriv) 
   functions instead.
+- `rate()` is never called on result of `sum(counter)` since that will always return
+  invalid results.
+  Chaining `rate(sum(...))` is only possible when passing a metric produced via recording rules
+  to `rate()` and so pint will try to find such chains.
+  See [this blog post](https://www.robustperception.io/rate-then-sum-never-sum-then-rate/)
+  for details.
 
 ## Common problems
 
diff --git a/internal/checks/promql_rate.go b/internal/checks/promql_rate.go
index e0bf2872..27eb81eb 100644
--- a/internal/checks/promql_rate.go
+++ b/internal/checks/promql_rate.go
@@ -10,6 +10,7 @@ import (
 	"github.com/cloudflare/pint/internal/discovery"
 	"github.com/cloudflare/pint/internal/output"
 	"github.com/cloudflare/pint/internal/parser"
+	"github.com/cloudflare/pint/internal/parser/utils"
 	"github.com/cloudflare/pint/internal/promapi"
 
 	v1 "github.com/prometheus/client_golang/api/prometheus/v1"
@@ -62,7 +63,7 @@ func (c RateCheck) Check(ctx context.Context, path string, rule parser.Rule, ent
 	}
 
 	done := &completedList{}
-	for _, problem := range c.checkNode(ctx, expr.Query, cfg, done) {
+	for _, problem := range c.checkNode(ctx, expr.Query, entries, cfg, done) {
 		problems = append(problems, Problem{
 			Fragment: problem.expr,
 			Lines:    expr.Lines(),
@@ -75,7 +76,7 @@ func (c RateCheck) Check(ctx context.Context, path string, rule parser.Rule, ent
 	return problems
 }
 
-func (c RateCheck) checkNode(ctx context.Context, node *parser.PromQLNode, cfg *promapi.ConfigResult, done *completedList) (problems []exprProblem) {
+func (c RateCheck) checkNode(ctx context.Context, node *parser.PromQLNode, entries []discovery.Entry, cfg *promapi.ConfigResult, done *completedList) (problems []exprProblem) {
 	if n, ok := node.Node.(*promParser.Call); ok && (n.Func.Name == "rate" || n.Func.Name == "irate" || n.Func.Name == "deriv") {
 		for _, arg := range n.Args {
 			m, ok := arg.(*promParser.MatrixSelector)
@@ -119,12 +120,47 @@ func (c RateCheck) checkNode(ctx context.Context, node *parser.PromQLNode, cfg *
 						})
 					}
 				}
+
+				for _, e := range entries {
+					if e.PathError != nil {
+						continue
+					}
+					if e.Rule.Error.Err != nil {
+						continue
+					}
+					if e.Rule.RecordingRule != nil && e.Rule.RecordingRule.Expr.SyntaxError == nil && e.Rule.RecordingRule.Record.Value.Value == s.Name {
+						for _, sm := range utils.HasOuterSum(e.Rule.RecordingRule.Expr.Query) {
+							if sv, ok := sm.Expr.(*promParser.VectorSelector); ok {
+								metadata, err := c.prom.Metadata(ctx, sv.Name)
+								if err != nil {
+									text, severity := textAndSeverityFromError(err, c.Reporter(), c.prom.Name(), Bug)
+									problems = append(problems, exprProblem{
+										expr:     sv.Name,
+										text:     text,
+										severity: severity,
+									})
+									continue
+								}
+								for _, m := range metadata.Metadata {
+									if m.Type == v1.MetricTypeCounter {
+										problems = append(problems, exprProblem{
+											expr: node.Expr,
+											text: fmt.Sprintf("rate(sum(counter)) chain detected, %s is called here on results of %s, calling rate on sum() results will return bogus results, always sum(rate(counter)), never rate(sum(counter))",
+												node.Expr, sm),
+											severity: Bug,
+										})
+									}
+								}
+							}
+						}
+					}
+				}
 			}
 		}
 	}
 
 	for _, child := range node.Children {
-		problems = append(problems, c.checkNode(ctx, child, cfg, done)...)
+		problems = append(problems, c.checkNode(ctx, child, entries, cfg, done)...)
 	}
 
 	return problems
diff --git a/internal/checks/promql_rate_test.go b/internal/checks/promql_rate_test.go
index 33330351..2e1c9856 100644
--- a/internal/checks/promql_rate_test.go
+++ b/internal/checks/promql_rate_test.go
@@ -23,6 +23,10 @@ func notCounterText(name, uri, fun, metric, kind string) string {
 	return fmt.Sprintf(`%s() should only be used with counters but %q is a %s according to metrics metadata from prometheus %q at %s`, fun, metric, kind, name, uri)
 }
 
+func rateSumText(rateName, sumExpr string) string {
+	return fmt.Sprintf("rate(sum(counter)) chain detected, rate(%s) is called here on results of %s, calling rate on sum() results will return bogus results, always sum(rate(counter)), never rate(sum(counter))", rateName, sumExpr)
+}
+
 func TestRateCheck(t *testing.T) {
 	testCases := []checkTest{
 		{
@@ -619,6 +623,128 @@ func TestRateCheck(t *testing.T) {
 				},
 			},
 		},
+		{
+			description: "rate_over_sum",
+			content:     "- alert: my alert\n  expr: rate(my:sum[5m])\n",
+			entries:     mustParseContent("- record: my:sum\n  expr: sum(foo)\n"),
+			checker:     newRateCheck,
+			prometheus:  newSimpleProm,
+			problems: func(uri string) []checks.Problem {
+				return []checks.Problem{
+					{
+						Fragment: "rate(my:sum[5m])",
+						Lines:    []int{2},
+						Reporter: "promql/rate",
+						Text:     rateSumText("my:sum[5m]", "sum(foo)"),
+						Severity: checks.Bug,
+					},
+				}
+			},
+			mocks: []*prometheusMock{
+				{
+					conds: []requestCondition{requireConfigPath},
+					resp:  configResponse{yaml: "global:\n  scrape_interval: 1m\n"},
+				},
+				{
+					conds: []requestCondition{
+						requireMetadataPath,
+						formCond{"metric", "foo"},
+					},
+					resp: metadataResponse{metadata: map[string][]v1.Metadata{
+						"foo": {{Type: "counter"}},
+					}},
+				},
+				{
+					conds: []requestCondition{
+						requireMetadataPath,
+						formCond{"metric", "my:sum"},
+					},
+					resp: metadataResponse{metadata: map[string][]v1.Metadata{}},
+				},
+			},
+		},
+		{
+			description: "rate_over_sum_error",
+			content:     "- alert: my alert\n  expr: rate(my:sum[5m])\n",
+			entries:     mustParseContent("- record: my:sum\n  expr: sum(foo)\n"),
+			checker:     newRateCheck,
+			prometheus:  newSimpleProm,
+			problems: func(uri string) []checks.Problem {
+				return []checks.Problem{
+					{
+						Fragment: "foo",
+						Lines:    []int{2},
+						Reporter: "promql/rate",
+						Text:     checkErrorUnableToRun(checks.RateCheckName, "prom", uri, "server_error: internal error"),
+						Severity: checks.Bug,
+					},
+				}
+			},
+			mocks: []*prometheusMock{
+				{
+					conds: []requestCondition{requireConfigPath},
+					resp:  configResponse{yaml: "global:\n  scrape_interval: 1m\n"},
+				},
+				{
+					conds: []requestCondition{
+						requireMetadataPath,
+						formCond{"metric", "foo"},
+					},
+					resp: respondWithInternalError(),
+				},
+				{
+					conds: []requestCondition{
+						requireMetadataPath,
+						formCond{"metric", "my:sum"},
+					},
+					resp: metadataResponse{metadata: map[string][]v1.Metadata{}},
+				},
+			},
+		},
+		{
+			description: "rate_over_sum_on_gauge",
+			content:     "- alert: my alert\n  expr: rate(my:sum[5m])\n",
+			entries:     mustParseContent("- record: my:sum\n  expr: sum(foo)\n"),
+			checker:     newRateCheck,
+			prometheus:  newSimpleProm,
+			problems:    noProblems,
+			mocks: []*prometheusMock{
+				{
+					conds: []requestCondition{requireConfigPath},
+					resp:  configResponse{yaml: "global:\n  scrape_interval: 1m\n"},
+				},
+				{
+					conds: []requestCondition{
+						requireMetadataPath,
+						formCond{"metric", "foo"},
+					},
+					resp: metadataResponse{metadata: map[string][]v1.Metadata{
+						"foo": {{Type: "gauge"}},
+					}},
+				},
+				{
+					conds: []requestCondition{
+						requireMetadataPath,
+						formCond{"metric", "my:sum"},
+					},
+					resp: metadataResponse{metadata: map[string][]v1.Metadata{}},
+				},
+			},
+		},
+		{
+			description: "sum_over_rate",
+			content:     "- alert: my alert\n  expr: sum(my:rate:5m)\n",
+			entries:     mustParseContent("- record: my:rate:5m\n  expr: rate(foo[5m])\n"),
+			checker:     newRateCheck,
+			prometheus:  newSimpleProm,
+			problems:    noProblems,
+			mocks: []*prometheusMock{
+				{
+					conds: []requestCondition{requireConfigPath},
+					resp:  configResponse{yaml: "global:\n  scrape_interval: 1m\n"},
+				},
+			},
+		},
 	}
 	runTests(t, testCases)
 }
diff --git a/internal/parser/utils/sum.go b/internal/parser/utils/sum.go
new file mode 100644
index 00000000..91aa23b1
--- /dev/null
+++ b/internal/parser/utils/sum.go
@@ -0,0 +1,71 @@
+package utils
+
+import (
+	promParser "github.com/prometheus/prometheus/promql/parser"
+	"github.com/rs/zerolog/log"
+
+	"github.com/cloudflare/pint/internal/parser"
+)
+
+func HasOuterSum(node *parser.PromQLNode) (calls []*promParser.AggregateExpr) {
+	if n, ok := node.Node.(*promParser.AggregateExpr); ok {
+		if n.Op == promParser.SUM {
+			calls = append(calls, n)
+			return calls
+		}
+	}
+
+	if n, ok := node.Node.(*promParser.AggregateExpr); ok {
+		switch n.Op {
+		case promParser.COUNT:
+			return nil
+		case promParser.COUNT_VALUES:
+			return nil
+		}
+	}
+
+	if n, ok := node.Node.(*promParser.BinaryExpr); ok {
+		if n.VectorMatching != nil {
+			switch n.VectorMatching.Card {
+			case promParser.CardOneToOne:
+			case promParser.CardOneToMany:
+				for i, child := range node.Children {
+					if i == len(node.Children)-1 {
+						return HasOuterSum(child)
+					}
+				}
+			case promParser.CardManyToOne:
+				return HasOuterSum(node.Children[0])
+			case promParser.CardManyToMany:
+			default:
+				log.Warn().Str("matching", n.VectorMatching.Card.String()).Msg("Unsupported VectorMatching operation")
+			}
+		}
+
+		if n.Op.IsComparisonOperator() {
+			for i, child := range node.Children {
+				if i == 0 {
+					return HasOuterSum(child)
+				}
+			}
+		} else {
+			switch n.Op {
+			case promParser.LOR:
+				for _, child := range node.Children {
+					calls = append(calls, HasOuterSum(child)...)
+				}
+				return calls
+			case promParser.DIV, promParser.LUNLESS, promParser.LAND:
+				for _, child := range node.Children {
+					return HasOuterSum(child)
+				}
+			}
+		}
+	}
+
+	for _, child := range node.Children {
+		calls = append(calls, HasOuterSum(child)...)
+	}
+
+	return calls
+}
diff --git a/internal/parser/utils/sum_test.go b/internal/parser/utils/sum_test.go
new file mode 100644
index 00000000..5a817159
--- /dev/null
+++ b/internal/parser/utils/sum_test.go
@@ -0,0 +1,97 @@
+package utils_test
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/require"
+
+	"github.com/cloudflare/pint/internal/parser"
+	"github.com/cloudflare/pint/internal/parser/utils"
+)
+
+func TestHasOuterSum(t *testing.T) {
+	type testCaseT struct {
+		expr   string
+		output []string
+	}
+
+	testCases := []testCaseT{
+		{
+			expr: "foo",
+		},
+		{
+			expr:   "sum(foo)",
+			output: []string{"sum(foo)"},
+		},
+		{
+			expr:   "sum(foo) > 0",
+			output: []string{"sum(foo)"},
+		},
+		{
+			expr:   "sum(foo) / sum(rate(bar[2m]))",
+			output: []string{"sum(foo)"},
+		},
+		{
+			expr:   "sum(sum(foo)) > 0",
+			output: []string{"sum(sum(foo))"},
+		},
+		{
+			expr: "count(sum(foo)) > 0",
+		},
+		{
+			expr: "count_values(\"foo\", sum(foo)) > 0",
+		},
+		{
+			expr:   "floor(sum(foo)) > 0",
+			output: []string{"sum(foo)"},
+		},
+		{
+			expr:   "ceil(sum(foo)) > 0",
+			output: []string{"sum(foo)"},
+		},
+		{
+			expr:   "sum(foo) or sum(bar)",
+			output: []string{"sum(foo)", "sum(bar)"},
+		},
+		{
+			expr:   "sum(foo) * on() sum(bar)",
+			output: []string{"sum(foo)", "sum(bar)"},
+		},
+		{
+			expr:   "sum(foo) without() * on() group_left(instance) sum(deriv(foo[2m]))",
+			output: []string{"sum without () (foo)"},
+		},
+		{
+			expr:   "sum(foo) without(job) * on() group_right(instance) sum(deriv(foo[2m]))",
+			output: []string{"sum(deriv(foo[2m]))"},
+		},
+		{
+			expr: "2 > foo",
+		},
+		{
+			expr: "2 > sum(foo)",
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.expr, func(t *testing.T) {
+			n, err := parser.DecodeExpr(tc.expr)
+			if err != nil {
+				t.Error(err)
+				t.FailNow()
+			}
+			calls := utils.HasOuterSum(n)
+			if len(calls) == 0 {
+				if len(tc.output) > 0 {
+					t.Errorf("HasOuterSum() returned nil, expected %s", tc.output)
+				}
+			} else {
+				output := []string{}
+				for _, a := range calls {
+					output = append(output, a.String())
+				}
+				require.Equal(t, tc.output, output, "HasOuterSum() returned wrong output")
+			}
+		})
+	}
+}