Add minCount & severity to alerts/count
prymitive committed Jun 8, 2023
1 parent 87088f0 commit 6c1e41f
Showing 8 changed files with 312 additions and 11 deletions.
4 changes: 4 additions & 0 deletions docs/changelog.md
@@ -5,6 +5,10 @@
### Added

- Added `exclude` option to `ci` config block - #609.
- Added `minCount` & `severity` to `alerts/count` check - #612.
This allows reporting the estimated number of alerts only when it is high
enough (`>= minCount`).
Setting `severity` as well allows blocking rules that would create too many alerts.

### Fixed

8 changes: 4 additions & 4 deletions docs/checks/alerts/annotation.md
@@ -22,11 +22,11 @@ annotation "$pattern" {

- `$pattern` - regexp pattern to match annotation name on, this can be templated
to reference checked rule fields, see [Configuration](../../configuration.md)
for details
- `severity` - set custom severity for reported issues, defaults to a warning
- `value` - optional value pattern to enforce, if not set only the
for details.
- `severity` - set custom severity for reported issues, defaults to a warning.
- `value` - optional value pattern to enforce, if not set only the.
- `required` - if `true` pint will require every alert to have this annotation set,
if `false` it will only check values where annotation is set
if `false` it will only check values where annotation is set.

## How to enable it

26 changes: 26 additions & 0 deletions docs/checks/alerts/count.md
@@ -20,6 +20,8 @@ alerts {
range = "1h"
step = "1m"
resolve = "5m"
minCount = 0
severity = "bug|warning|info"
}
```

@@ -30,6 +32,11 @@ alerts {
to `scrape_interval`, try to reduce it if that would load too many samples.
Defaults to `1m`.
- `resolve` - duration after which stale alerts are resolved. Defaults to `5m`.
- `minCount` - minimal number of alerts for this check to report them. Defaults to `0`.
Set this to a non-zero value if you want this check to report only when the estimated
number of alerts is high enough.
- `severity` - set custom severity for reported issues, defaults to `info`.
This can only be set when `minCount` is set to a non-zero value.

## How to enable it

@@ -55,6 +62,25 @@ rule {
}
```

Report an error if there would be too many (>=50) alerts firing:

```js
prometheus "prod" {
uri = "https://prometheus-prod.example.com"
timeout = "60s"
}

rule {
alerts {
range = "1d"
step = "1m"
resolve = "5m"
minCount = 50
severity = "bug"
}
}
```

## How to disable it

You can disable this check globally by adding this config block:
12 changes: 10 additions & 2 deletions internal/checks/alerts_count.go
@@ -19,12 +19,14 @@ const (
AlertsCheckName = "alerts/count"
)

func NewAlertsCheck(prom *promapi.FailoverGroup, lookBack, step, resolve time.Duration) AlertsCheck {
func NewAlertsCheck(prom *promapi.FailoverGroup, lookBack, step, resolve time.Duration, minCount int, severity Severity) AlertsCheck {
return AlertsCheck{
prom: prom,
lookBack: lookBack,
step: step,
resolve: resolve,
minCount: minCount,
severity: severity,
}
}

@@ -33,6 +35,8 @@ type AlertsCheck struct {
lookBack time.Duration
step time.Duration
resolve time.Duration
minCount int
severity Severity
}

func (c AlertsCheck) Meta() CheckMeta {
@@ -93,6 +97,10 @@ func (c AlertsCheck) Check(ctx context.Context, _ string, rule parser.Rule, _ []
}
}

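// Don't report anything if the estimated number of alerts is below the configured minimum.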
if alerts < c.minCount {
return problems
}

lines := []int{}
lines = append(lines, rule.AlertingRule.Expr.Lines()...)
if rule.AlertingRule.For != nil {
@@ -106,7 +114,7 @@ func (c AlertsCheck) Check(ctx context.Context, _ string, rule parser.Rule, _ []
Lines: lines,
Reporter: c.Reporter(),
Text: fmt.Sprintf("%s would trigger %d alert(s) in the last %s", promText(c.prom.Name(), qr.URI), alerts, output.HumanizeDuration(delta)),
Severity: Information,
Severity: c.severity,
})
return problems
}
214 changes: 213 additions & 1 deletion internal/checks/alerts_count_test.go
@@ -12,7 +12,7 @@ import (
)

func newAlertsCheck(prom *promapi.FailoverGroup) checks.RuleChecker {
return checks.NewAlertsCheck(prom, time.Hour*24, time.Minute, time.Minute*5)
return checks.NewAlertsCheck(prom, time.Hour*24, time.Minute, time.Minute*5, 0, checks.Information)
}

func alertsText(name, uri string, count int, since string) string {
@@ -265,6 +265,218 @@ func TestAlertsCountCheck(t *testing.T) {
},
},
},
{
description: "minCount=2",
content: "- alert: Foo Is Down\n for: 10m\n expr: up{job=\"foo\"} == 0\n",
checker: func(prom *promapi.FailoverGroup) checks.RuleChecker {
return checks.NewAlertsCheck(prom, time.Hour*24, time.Minute, time.Minute*5, 2, checks.Information)
},
prometheus: newSimpleProm,
problems: func(uri string) []checks.Problem {
return []checks.Problem{
{
Fragment: `up{job="foo"} == 0`,
Lines: []int{2, 3},
Reporter: "alerts/count",
Text: alertsText("prom", uri, 2, "1d"),
Severity: checks.Information,
},
}
},
mocks: []*prometheusMock{
{
conds: []requestCondition{
requireRangeQueryPath,
formCond{key: "query", value: `up{job="foo"} == 0`},
},
resp: matrixResponse{
samples: []*model.SampleStream{
generateSampleStream(
map[string]string{"job": "foo"},
time.Now().Add(time.Hour*-24),
time.Now().Add(time.Hour*-24).Add(time.Minute*6),
time.Minute,
),
generateSampleStream(
map[string]string{"job": "foo"},
time.Now().Add(time.Hour*-23),
time.Now().Add(time.Hour*-23).Add(time.Minute*6),
time.Minute,
),
generateSampleStream(
map[string]string{"job": "foo"},
time.Now().Add(time.Hour*-22),
time.Now().Add(time.Hour*-22).Add(time.Minute),
time.Minute,
),
generateSampleStream(
map[string]string{"job": "foo"},
time.Now().Add(time.Hour*-21),
time.Now().Add(time.Hour*-21).Add(time.Minute*16),
time.Minute,
),
generateSampleStream(
map[string]string{"job": "foo"},
time.Now().Add(time.Hour*-20),
time.Now().Add(time.Hour*-20).Add(time.Minute*9).Add(time.Second*59),
time.Minute,
),
generateSampleStream(
map[string]string{"job": "foo"},
time.Now().Add(time.Hour*-18),
time.Now().Add(time.Hour*-18).Add(time.Hour*2),
time.Minute,
),
},
},
},
{
conds: []requestCondition{
requireRangeQueryPath,
formCond{key: "query", value: `count(up)`},
},
resp: respondWithSingleRangeVector1D(),
},
},
},
{
description: "minCount=2 severity=bug",
content: "- alert: Foo Is Down\n for: 10m\n expr: up{job=\"foo\"} == 0\n",
checker: func(prom *promapi.FailoverGroup) checks.RuleChecker {
return checks.NewAlertsCheck(prom, time.Hour*24, time.Minute, time.Minute*5, 2, checks.Bug)
},
prometheus: newSimpleProm,
problems: func(uri string) []checks.Problem {
return []checks.Problem{
{
Fragment: `up{job="foo"} == 0`,
Lines: []int{2, 3},
Reporter: "alerts/count",
Text: alertsText("prom", uri, 2, "1d"),
Severity: checks.Bug,
},
}
},
mocks: []*prometheusMock{
{
conds: []requestCondition{
requireRangeQueryPath,
formCond{key: "query", value: `up{job="foo"} == 0`},
},
resp: matrixResponse{
samples: []*model.SampleStream{
generateSampleStream(
map[string]string{"job": "foo"},
time.Now().Add(time.Hour*-24),
time.Now().Add(time.Hour*-24).Add(time.Minute*6),
time.Minute,
),
generateSampleStream(
map[string]string{"job": "foo"},
time.Now().Add(time.Hour*-23),
time.Now().Add(time.Hour*-23).Add(time.Minute*6),
time.Minute,
),
generateSampleStream(
map[string]string{"job": "foo"},
time.Now().Add(time.Hour*-22),
time.Now().Add(time.Hour*-22).Add(time.Minute),
time.Minute,
),
generateSampleStream(
map[string]string{"job": "foo"},
time.Now().Add(time.Hour*-21),
time.Now().Add(time.Hour*-21).Add(time.Minute*16),
time.Minute,
),
generateSampleStream(
map[string]string{"job": "foo"},
time.Now().Add(time.Hour*-20),
time.Now().Add(time.Hour*-20).Add(time.Minute*9).Add(time.Second*59),
time.Minute,
),
generateSampleStream(
map[string]string{"job": "foo"},
time.Now().Add(time.Hour*-18),
time.Now().Add(time.Hour*-18).Add(time.Hour*2),
time.Minute,
),
},
},
},
{
conds: []requestCondition{
requireRangeQueryPath,
formCond{key: "query", value: `count(up)`},
},
resp: respondWithSingleRangeVector1D(),
},
},
},
{
description: "minCount=10",
content: "- alert: Foo Is Down\n for: 10m\n expr: up{job=\"foo\"} == 0\n",
checker: func(prom *promapi.FailoverGroup) checks.RuleChecker {
return checks.NewAlertsCheck(prom, time.Hour*24, time.Minute, time.Minute*5, 10, checks.Information)
},
prometheus: newSimpleProm,
problems: noProblems,
mocks: []*prometheusMock{
{
conds: []requestCondition{
requireRangeQueryPath,
formCond{key: "query", value: `up{job="foo"} == 0`},
},
resp: matrixResponse{
samples: []*model.SampleStream{
generateSampleStream(
map[string]string{"job": "foo"},
time.Now().Add(time.Hour*-24),
time.Now().Add(time.Hour*-24).Add(time.Minute*6),
time.Minute,
),
generateSampleStream(
map[string]string{"job": "foo"},
time.Now().Add(time.Hour*-23),
time.Now().Add(time.Hour*-23).Add(time.Minute*6),
time.Minute,
),
generateSampleStream(
map[string]string{"job": "foo"},
time.Now().Add(time.Hour*-22),
time.Now().Add(time.Hour*-22).Add(time.Minute),
time.Minute,
),
generateSampleStream(
map[string]string{"job": "foo"},
time.Now().Add(time.Hour*-21),
time.Now().Add(time.Hour*-21).Add(time.Minute*16),
time.Minute,
),
generateSampleStream(
map[string]string{"job": "foo"},
time.Now().Add(time.Hour*-20),
time.Now().Add(time.Hour*-20).Add(time.Minute*9).Add(time.Second*59),
time.Minute,
),
generateSampleStream(
map[string]string{"job": "foo"},
time.Now().Add(time.Hour*-18),
time.Now().Add(time.Hour*-18).Add(time.Hour*2),
time.Minute,
),
},
},
},
{
conds: []requestCondition{
requireRangeQueryPath,
formCond{key: "query", value: `count(up)`},
},
resp: respondWithSingleRangeVector1D(),
},
},
},
{
description: "{__name__=}",
content: `
34 changes: 31 additions & 3 deletions internal/config/alerts.go
@@ -1,9 +1,17 @@
package config

import (
"fmt"

"github.com/cloudflare/pint/internal/checks"
)

type AlertsSettings struct {
Range string `hcl:"range" json:"range"`
Step string `hcl:"step" json:"step"`
Resolve string `hcl:"resolve" json:"resolve"`
Range string `hcl:"range" json:"range"`
Step string `hcl:"step" json:"step"`
Resolve string `hcl:"resolve" json:"resolve"`
MinCount int `hcl:"minCount,optional" json:"minCount,omitempty"`
Severity string `hcl:"severity,optional" json:"severity,omitempty"`
}

func (as AlertsSettings) validate() error {
@@ -22,5 +30,25 @@ func (as AlertsSettings) validate() error {
return err
}
}
if as.MinCount < 0 {
return fmt.Errorf("minCount cannot be < 0, got %d", as.MinCount)
}
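// A custom severity only makes sense together with a non-zero minCount; otherwise the check stays informational.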
if as.Severity != "" {
sev, err := checks.ParseSeverity(as.Severity)
if err != nil {
return err
}
if as.MinCount <= 0 && sev > checks.Information {
return fmt.Errorf("cannot set serverity to %q when minCount is 0", as.Severity)
}
}
return nil
}

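// getSeverity returns the configured severity if one was set, otherwise the supplied fallback.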
func (as AlertsSettings) getSeverity(fallback checks.Severity) checks.Severity {
if as.Severity != "" {
sev, _ := checks.ParseSeverity(as.Severity)
return sev
}
return fallback
}
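
For context, here is a minimal, self-contained sketch (not part of this commit) of how the new `minCount` gate and the `severity` fallback behave together; the `Severity` constants and the `severityOr` helper below are simplified stand-ins for the real types in `internal/checks`:

```go
package main

import "fmt"

// Severity is a simplified stand-in for checks.Severity.
type Severity int

const (
	Information Severity = iota
	Warning
	Bug
)

// severityOr mirrors the getSeverity logic above: use the configured value
// when present, otherwise fall back to the check's default.
func severityOr(configured string, fallback Severity) Severity {
	switch configured {
	case "info":
		return Information
	case "warning":
		return Warning
	case "bug":
		return Bug
	default:
		return fallback
	}
}

func main() {
	const minCount = 50 // the rule is only reported once this many alerts are estimated
	estimatedAlerts := 63

	// Below minCount the check stays silent, matching the early return in Check().
	if estimatedAlerts < minCount {
		fmt.Println("below minCount, nothing reported")
		return
	}
	fmt.Printf("reported %d alert(s) with severity %d\n", estimatedAlerts, severityOr("bug", Information))
}
```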