Avoid false positives when Prometheus is down
prymitive committed Jun 17, 2022
1 parent fc27478 commit b1a2f6f
Showing 10 changed files with 505 additions and 26 deletions.
2 changes: 2 additions & 0 deletions cmd/pint/tests/0039_prom_selected_path.txt
@@ -5,6 +5,7 @@ cmp stderr stderr.txt
-- stderr.txt --
level=info msg="Loading configuration file" path=.pint.hcl
level=info msg="File parsed" path=rules/0001.yml rules=3
level=debug msg="Starting query workers" name=disabled uri=http://127.0.0.1:123 workers=16
level=debug msg="Found alerting rule" alert=first lines=1-3 path=rules/0001.yml
level=debug msg="Configured checks for rule" enabled=["promql/syntax","alerts/for","alerts/comparison","alerts/template","promql/fragile","promql/regexp"] path=rules/0001.yml rule=first
level=debug msg="Found recording rule" lines=5-6 path=rules/0001.yml record=second
@@ -15,6 +16,7 @@ rules/0001.yml:6: job label is required and should be preserved when aggregating
expr: sum(bar)

level=info msg="Problems found" Warning=1
level=debug msg="Stopping query workers" name=disabled uri=http://127.0.0.1:123
-- rules/0001.yml --
- alert: first
expr: foo > 1
4 changes: 4 additions & 0 deletions docs/changelog.md
@@ -23,6 +23,10 @@
- Strict parsing mode didn't fully validate rule group files, this is now fixed
and pint runs the same set of checks as Prometheus.
- Fixed `promql/series` handling of rules with `{__name__=~"foo|bar"}` queries.
+- If Prometheus was stopped or restarted, `promql/series` would occasionally
+  report metrics as "sometimes present". This check will now try to find time
+  ranges with no metrics in Prometheus and ignore these when checking if
+  metrics are present (see the sketch after this diff).

## v0.21.1

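The changelog entry above captures the heart of this commit: a missing metric should only count as a real gap if Prometheus itself was up at the time. Below is a minimal, self-contained Go sketch of that idea; span, covers and realGaps are illustrative stand-ins rather than pint's API (the actual implementation is in internal/checks/promql_series.go below).

package main

import (
	"fmt"
	"time"
)

// span is a simplified stand-in for pint's timeRange: a window in which a
// range query returned samples.
type span struct{ start, end time.Time }

// covers reports whether ts falls inside any of the spans, mirroring the
// timeRanges.covers helper added in this commit.
func covers(spans []span, ts time.Time) bool {
	for _, s := range spans {
		if !s.start.After(ts) && !s.end.Before(ts) {
			return true
		}
	}
	return false
}

// realGaps walks the checked window step by step and keeps only timestamps
// where the metric was absent WHILE Prometheus was up (per count(up) data).
// Absences that coincide with Prometheus downtime are ignored, which is what
// suppresses the "sometimes present" false positives.
func realGaps(metric, uptime []span, from, until time.Time, step time.Duration) (gaps []time.Time) {
	for ts := from; !ts.After(until); ts = ts.Add(step) {
		if covers(metric, ts) || !covers(uptime, ts) {
			continue // metric present, or Prometheus itself was down
		}
		gaps = append(gaps, ts)
	}
	return gaps
}

func main() {
	t0 := time.Date(2022, 6, 17, 0, 0, 0, 0, time.UTC)
	step := time.Hour
	// Metric seen for hours 0-5 and 8-12; Prometheus was also down for hours 5-8.
	metric := []span{{t0, t0.Add(5 * step)}, {t0.Add(8 * step), t0.Add(12 * step)}}
	uptime := []span{{t0, t0.Add(5 * step)}, {t0.Add(8 * step), t0.Add(12 * step)}}
	// Every absence overlaps Prometheus downtime, so no gaps survive.
	fmt.Println(len(realGaps(metric, uptime, t0, t0.Add(12*step), step))) // 0
}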
63 changes: 56 additions & 7 deletions internal/checks/promql_series.go
@@ -136,9 +136,14 @@ func (c SeriesCheck) Check(ctx context.Context, rule parser.Rule, entries []disc
continue
}

+promUptime, err := c.seriesTimeRanges(ctx, "count(up)", rangeStart, rangeEnd, rangeStep, nil)
+if err != nil {
+	log.Warn().Err(err).Str("name", c.prom.Name()).Msg("Cannot detect Prometheus uptime gaps")
+}
+
// 2. If foo was NEVER there -> BUG
log.Debug().Str("check", c.Reporter()).Stringer("selector", &bareSelector).Msg("Checking if base metric has historical series")
-trs, err := c.serieTimeRanges(ctx, fmt.Sprintf("count(%s)", bareSelector.String()), rangeStart, rangeEnd, rangeStep)
+trs, err := c.seriesTimeRanges(ctx, fmt.Sprintf("count(%s)", bareSelector.String()), rangeStart, rangeEnd, rangeStep, promUptime)
if err != nil {
problems = append(problems, c.queryProblem(err, bareSelector.String(), expr))
continue
@@ -188,7 +193,7 @@ func (c SeriesCheck) Check(ctx context.Context, rule parser.Rule, entries []disc
l := stripLabels(selector)
l.LabelMatchers = append(l.LabelMatchers, labels.MustNewMatcher(labels.MatchRegexp, name, ".+"))
log.Debug().Str("check", c.Reporter()).Stringer("selector", &l).Str("label", name).Msg("Checking if base metric has historical series with required label")
-trsLabelCount, err := c.serieTimeRanges(ctx, fmt.Sprintf("count(%s) by (%s)", l.String(), name), rangeStart, rangeEnd, rangeStep)
+trsLabelCount, err := c.seriesTimeRanges(ctx, fmt.Sprintf("count(%s) by (%s)", l.String(), name), rangeStart, rangeEnd, rangeStep, promUptime)
if err != nil {
problems = append(problems, c.queryProblem(err, selector.String(), expr))
continue
@@ -208,7 +213,7 @@ func (c SeriesCheck) Check(ctx context.Context, rule parser.Rule, entries []disc
log.Debug().Str("check", c.Reporter()).Stringer("selector", &l).Str("label", name).Msg("No historical series with label used for the query")
}

-if len(trsLabelCount.labelValues(name)) == len(trsLabelCount.ranges) && trsLabelCount.avgLife() < (trsLabelCount.duration()/2) {
+if len(trsLabelCount.gaps) > 0 && len(trsLabelCount.labelValues(name)) == len(trsLabelCount.ranges) && trsLabelCount.avgLife() < (trsLabelCount.duration()/2) {
highChurnLabels = append(highChurnLabels, name)
}
}
@@ -266,7 +271,7 @@ func (c SeriesCheck) Check(ctx context.Context, rule parser.Rule, entries []disc
}
log.Debug().Str("check", c.Reporter()).Stringer("selector", &labelSelector).Stringer("matcher", lm).Msg("Checking if there are historical series matching filter")

-trsLabel, err := c.serieTimeRanges(ctx, fmt.Sprintf("count(%s)", labelSelector.String()), rangeStart, rangeEnd, rangeStep)
+trsLabel, err := c.seriesTimeRanges(ctx, fmt.Sprintf("count(%s)", labelSelector.String()), rangeStart, rangeEnd, rangeStep, promUptime)
if err != nil {
problems = append(problems, c.queryProblem(err, labelSelector.String(), expr))
continue
@@ -331,7 +336,7 @@ func (c SeriesCheck) Check(ctx context.Context, rule parser.Rule, entries []disc
}

// 7. if foo is ALWAYS/SOMETIMES there BUT {bar OR baz} value is SOMETIMES there -> WARN
-if len(trsLabel.ranges) > 1 {
+if len(trsLabel.ranges) > 1 && len(trsLabel.gaps) > 0 {
problems = append(problems, Problem{
Fragment: selector.String(),
Lines: expr.Lines(),
@@ -350,7 +355,7 @@ func (c SeriesCheck) Check(ctx context.Context, rule parser.Rule, entries []disc
}

// 8. If foo is SOMETIMES there -> WARN
-if len(trs.ranges) > 1 {
+if len(trs.ranges) > 1 && len(trs.gaps) > 0 {
problems = append(problems, Problem{
Fragment: bareSelector.String(),
Lines: expr.Lines(),
@@ -392,7 +397,7 @@ func (c SeriesCheck) instantSeriesCount(ctx context.Context, query string) (int,
return series, qr.URI, nil
}

-func (c SeriesCheck) serieTimeRanges(ctx context.Context, query string, start, end time.Time, step time.Duration) (tr *timeRanges, err error) {
+func (c SeriesCheck) seriesTimeRanges(ctx context.Context, query string, start, end time.Time, step time.Duration, promUptime *timeRanges) (tr *timeRanges, err error) {
qr, err := c.prom.RangeQuery(ctx, query, start, end, step)
if err != nil {
return nil, err
@@ -404,6 +409,7 @@ func (c SeriesCheck) serieTimeRanges(ctx context.Context, query string, start, e
until: qr.End.Round(time.Second),
step: step,
}
+
var ts time.Time
for _, s := range qr.Samples {
for _, v := range s.Values {
@@ -429,6 +435,34 @@
}
}

+if promUptime != nil {
+	ts = tr.from
+	for !ts.After(tr.until) {
+		if tr.covers(ts) || !promUptime.covers(ts) {
+			ts = ts.Add(tr.step)
+			continue
+		}
+
+		var found bool
+		for i := range tr.gaps {
+			if !ts.Before(tr.gaps[i].start) &&
+				!ts.After(tr.gaps[i].end.Add(tr.step)) {
+				tr.gaps[i].end = ts.Add(tr.step)
+				found = true
+				break
+			}
+		}
+		if !found {
+			tr.gaps = append(tr.gaps, rangeGap{
+				start: ts,
+				end:   ts.Add(tr.step),
+			})
+		}
+
+		ts = ts.Add(tr.step)
+	}
+}
+
return tr, nil
}

@@ -513,12 +547,18 @@ type timeRange struct {
end time.Time
}

+type rangeGap struct {
+	start time.Time
+	end   time.Time
+}
+
type timeRanges struct {
uri string
from time.Time
until time.Time
step time.Duration
ranges []timeRange
+gaps []rangeGap
}

func (tr timeRanges) withLabelName(name string) (r []timeRange) {
@@ -586,3 +626,12 @@ func (tr timeRanges) sinceDesc(t time.Time) (s string) {
}
return output.HumanizeDuration(dur.Round(time.Minute))
}
+
+func (tr timeRanges) covers(ts time.Time) bool {
+	for _, r := range tr.ranges {
+		if !r.start.After(ts) && !r.end.Before(ts) {
+			return true
+		}
+	}
+	return false
+}
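Taken together, these changes flip the "sometimes present" heuristic from "the metric's history splits into multiple ranges" to "it splits into multiple ranges AND at least one absence is confirmed by Prometheus uptime". A hedged one-line restatement of the updated conditions (hypothetical function name; the real checks live in SeriesCheck.Check above):

package main

import "fmt"

// sometimesPresent restates the updated warning conditions from
// promql_series.go: warn only when the series history is fragmented AND at
// least one absence happened while Prometheus was demonstrably up.
func sometimesPresent(numRanges, confirmedGaps int) bool {
	return numRanges > 1 && confirmedGaps > 0
}

func main() {
	fmt.Println(sometimesPresent(2, 0)) // false: all absences overlap Prometheus downtime
	fmt.Println(sometimesPresent(2, 1)) // true: the metric really did disappear
}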