Skip to content

Commit

Permalink
Avoid incorrect label errors when a series disappears
Browse files Browse the repository at this point in the history
  • Loading branch information
prymitive committed Feb 16, 2023
1 parent 2b4061f commit 7f47f1e
Show file tree
Hide file tree
Showing 4 changed files with 78 additions and 6 deletions.
2 changes: 2 additions & 0 deletions docs/changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
### Fixed

- Fixed a bug in `pint ci` that would cause a failure if a directory was renamed.
- Fixed false positive reports from `promql/series` check when a time series
disappears from Prometheus.

## v0.40.0

Expand Down
6 changes: 4 additions & 2 deletions internal/checks/alerts_count.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,9 @@ func (c AlertsCheck) Check(ctx context.Context, path string, rule parser.Rule, e
return
}

qr, err := c.prom.RangeQuery(ctx, rule.AlertingRule.Expr.Value.Value, promapi.NewRelativeRange(c.lookBack, c.step))
params := promapi.NewRelativeRange(c.lookBack, c.step)

qr, err := c.prom.RangeQuery(ctx, rule.AlertingRule.Expr.Value.Value, params)
if err != nil {
text, severity := textAndSeverityFromError(err, c.Reporter(), c.prom.Name(), Bug)
problems = append(problems, Problem{
Expand All @@ -69,7 +71,7 @@ func (c AlertsCheck) Check(ctx context.Context, path string, rule parser.Rule, e
}

if len(qr.Series.Ranges) > 0 {
promUptime, err := c.prom.RangeQuery(ctx, "count(up)", promapi.NewRelativeRange(c.lookBack, c.step))
promUptime, err := c.prom.RangeQuery(ctx, "count(up)", params)
if err != nil {
log.Warn().Err(err).Str("name", c.prom.Name()).Msg("Cannot detect Prometheus uptime gaps")
} else {
Expand Down
20 changes: 16 additions & 4 deletions internal/checks/promql_series.go
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,8 @@ func (c SeriesCheck) Check(ctx context.Context, path string, rule parser.Rule, e
return
}

params := promapi.NewRelativeRange(settings.lookbackRangeDuration, settings.lookbackStepDuration)

done := map[string]bool{}
for _, selector := range getSelectors(expr.Query) {
if _, ok := done[selector.String()]; ok {
Expand Down Expand Up @@ -183,7 +185,7 @@ func (c SeriesCheck) Check(ctx context.Context, path string, rule parser.Rule, e
continue
}

promUptime, err := c.prom.RangeQuery(ctx, fmt.Sprintf("count(%s)", c.prom.UptimeMetric()), promapi.NewRelativeRange(settings.lookbackRangeDuration, settings.lookbackStepDuration))
promUptime, err := c.prom.RangeQuery(ctx, fmt.Sprintf("count(%s)", c.prom.UptimeMetric()), params)
if err != nil {
log.Warn().Err(err).Str("name", c.prom.Name()).Msg("Cannot detect Prometheus uptime gaps")
}
Expand All @@ -198,7 +200,7 @@ func (c SeriesCheck) Check(ctx context.Context, path string, rule parser.Rule, e

// 2. If foo was NEVER there -> BUG
log.Debug().Str("check", c.Reporter()).Stringer("selector", &bareSelector).Msg("Checking if base metric has historical series")
trs, err := c.prom.RangeQuery(ctx, fmt.Sprintf("count(%s)", bareSelector.String()), promapi.NewRelativeRange(settings.lookbackRangeDuration, settings.lookbackStepDuration))
trs, err := c.prom.RangeQuery(ctx, fmt.Sprintf("count(%s)", bareSelector.String()), params)
if err != nil {
problems = append(problems, c.queryProblem(err, bareSelector.String(), expr))
continue
Expand Down Expand Up @@ -253,14 +255,24 @@ func (c SeriesCheck) Check(ctx context.Context, path string, rule parser.Rule, e
l := stripLabels(selector)
l.LabelMatchers = append(l.LabelMatchers, labels.MustNewMatcher(labels.MatchRegexp, name, ".+"))
log.Debug().Str("check", c.Reporter()).Stringer("selector", &l).Str("label", name).Msg("Checking if base metric has historical series with required label")
trsLabelCount, err := c.prom.RangeQuery(ctx, fmt.Sprintf("absent(%s)", l.String()), promapi.NewRelativeRange(settings.lookbackRangeDuration, settings.lookbackStepDuration))
trsLabelCount, err := c.prom.RangeQuery(ctx, fmt.Sprintf("absent(%s)", l.String()), params)
if err != nil {
problems = append(problems, c.queryProblem(err, selector.String(), expr))
continue
}
trsLabelCount.Series.FindGaps(promUptime.Series, trsLabelCount.Series.From, trsLabelCount.Series.Until)

if trsLabelCount.Series.Ranges.Len() == 1 && len(trsLabelCount.Series.Gaps) == 0 {
var isAbsentOutsideSeriesRange bool
for _, str := range trs.Series.Ranges {
if _, ok := promapi.Overlaps(str, trsLabelCount.Series.Ranges[0], trsLabelCount.Series.Step); !ok {
isAbsentOutsideSeriesRange = true
break
}
}
if isAbsentOutsideSeriesRange {
continue
}
problems = append(problems, Problem{
Fragment: selector.String(),
Lines: expr.Lines(),
Expand Down Expand Up @@ -333,7 +345,7 @@ func (c SeriesCheck) Check(ctx context.Context, path string, rule parser.Rule, e
}
log.Debug().Str("check", c.Reporter()).Stringer("selector", &labelSelector).Stringer("matcher", lm).Msg("Checking if there are historical series matching filter")

trsLabel, err := c.prom.RangeQuery(ctx, fmt.Sprintf("count(%s)", labelSelector.String()), promapi.NewRelativeRange(settings.lookbackRangeDuration, settings.lookbackStepDuration))
trsLabel, err := c.prom.RangeQuery(ctx, fmt.Sprintf("count(%s)", labelSelector.String()), params)
if err != nil {
problems = append(problems, c.queryProblem(err, labelSelector.String(), expr))
continue
Expand Down
56 changes: 56 additions & 0 deletions internal/checks/promql_series_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -700,6 +700,62 @@ func TestSeriesCheck(t *testing.T) {
},
},
},
{
description: "#3 metric disappeared, only one sample, absent race",
content: "- record: foo\n expr: sum(found{job=\"abc\"})\n",
checker: newSeriesCheck,
prometheus: newSimpleProm,
problems: func(uri string) []checks.Problem {
return []checks.Problem{
{
Fragment: `found`,
Lines: []int{2},
Reporter: checks.SeriesCheckName,
Text: seriesDisappearedText("prom", uri, `found`, "1w"),
Severity: checks.Bug,
},
}
},
mocks: []*prometheusMock{
{
conds: []requestCondition{
requireQueryPath,
formCond{key: "query", value: `count(found{job="abc"})`},
},
resp: respondWithEmptyVector(),
},
{
conds: []requestCondition{
requireRangeQueryPath,
formCond{key: "query", value: `count(found)`},
},
resp: matrixResponse{
samples: []*model.SampleStream{
generateSampleStream(
map[string]string{},
time.Now().Add(time.Hour*24*-7),
time.Now().Add(time.Hour*24*-7).Add(time.Minute*5),
time.Minute*5,
),
},
},
},
{
conds: []requestCondition{
requireRangeQueryPath,
formCond{key: "query", value: `absent(found{job=~".+"})`},
},
resp: respondWithSingleRangeVector1W(),
},
{
conds: []requestCondition{
requireRangeQueryPath,
formCond{key: "query", value: "count(up)"},
},
resp: respondWithSingleRangeVector1W(),
},
},
},
{
description: "#4 metric was present but disappeared 50m ago",
content: "- record: foo\n expr: sum(found{job=\"foo\", instance=\"bar\"})\n",
Expand Down

0 comments on commit 7f47f1e

Please sign in to comment.