From 7f47f1ef65d209dd2009d6475bab393b96be0164 Mon Sep 17 00:00:00 2001 From: Lukasz Mierzwa Date: Thu, 16 Feb 2023 15:02:01 +0000 Subject: [PATCH] Avoid incorrect label errors when a series disappears --- docs/changelog.md | 2 + internal/checks/alerts_count.go | 6 ++- internal/checks/promql_series.go | 20 ++++++++-- internal/checks/promql_series_test.go | 56 +++++++++++++++++++++++++++ 4 files changed, 78 insertions(+), 6 deletions(-) diff --git a/docs/changelog.md b/docs/changelog.md index 0c8079c0..aa2b1433 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -5,6 +5,8 @@ ### Fixed - Fixed a bug in `pint ci` that would cause a failure if a directory was renamed. +- Fixed false positive reports from `promql/series` check when a time series + disappears from Prometheus. ## v0.40.0 diff --git a/internal/checks/alerts_count.go b/internal/checks/alerts_count.go index d7dbac27..464620ed 100644 --- a/internal/checks/alerts_count.go +++ b/internal/checks/alerts_count.go @@ -55,7 +55,9 @@ func (c AlertsCheck) Check(ctx context.Context, path string, rule parser.Rule, e return } - qr, err := c.prom.RangeQuery(ctx, rule.AlertingRule.Expr.Value.Value, promapi.NewRelativeRange(c.lookBack, c.step)) + params := promapi.NewRelativeRange(c.lookBack, c.step) + + qr, err := c.prom.RangeQuery(ctx, rule.AlertingRule.Expr.Value.Value, params) if err != nil { text, severity := textAndSeverityFromError(err, c.Reporter(), c.prom.Name(), Bug) problems = append(problems, Problem{ @@ -69,7 +71,7 @@ func (c AlertsCheck) Check(ctx context.Context, path string, rule parser.Rule, e } if len(qr.Series.Ranges) > 0 { - promUptime, err := c.prom.RangeQuery(ctx, "count(up)", promapi.NewRelativeRange(c.lookBack, c.step)) + promUptime, err := c.prom.RangeQuery(ctx, "count(up)", params) if err != nil { log.Warn().Err(err).Str("name", c.prom.Name()).Msg("Cannot detect Prometheus uptime gaps") } else { diff --git a/internal/checks/promql_series.go b/internal/checks/promql_series.go index 7a8c9182..b9c31e93 100644 --- a/internal/checks/promql_series.go +++ b/internal/checks/promql_series.go @@ -99,6 +99,8 @@ func (c SeriesCheck) Check(ctx context.Context, path string, rule parser.Rule, e return } + params := promapi.NewRelativeRange(settings.lookbackRangeDuration, settings.lookbackStepDuration) + done := map[string]bool{} for _, selector := range getSelectors(expr.Query) { if _, ok := done[selector.String()]; ok { @@ -183,7 +185,7 @@ func (c SeriesCheck) Check(ctx context.Context, path string, rule parser.Rule, e continue } - promUptime, err := c.prom.RangeQuery(ctx, fmt.Sprintf("count(%s)", c.prom.UptimeMetric()), promapi.NewRelativeRange(settings.lookbackRangeDuration, settings.lookbackStepDuration)) + promUptime, err := c.prom.RangeQuery(ctx, fmt.Sprintf("count(%s)", c.prom.UptimeMetric()), params) if err != nil { log.Warn().Err(err).Str("name", c.prom.Name()).Msg("Cannot detect Prometheus uptime gaps") } @@ -198,7 +200,7 @@ func (c SeriesCheck) Check(ctx context.Context, path string, rule parser.Rule, e // 2. If foo was NEVER there -> BUG log.Debug().Str("check", c.Reporter()).Stringer("selector", &bareSelector).Msg("Checking if base metric has historical series") - trs, err := c.prom.RangeQuery(ctx, fmt.Sprintf("count(%s)", bareSelector.String()), promapi.NewRelativeRange(settings.lookbackRangeDuration, settings.lookbackStepDuration)) + trs, err := c.prom.RangeQuery(ctx, fmt.Sprintf("count(%s)", bareSelector.String()), params) if err != nil { problems = append(problems, c.queryProblem(err, bareSelector.String(), expr)) continue @@ -253,7 +255,7 @@ func (c SeriesCheck) Check(ctx context.Context, path string, rule parser.Rule, e l := stripLabels(selector) l.LabelMatchers = append(l.LabelMatchers, labels.MustNewMatcher(labels.MatchRegexp, name, ".+")) log.Debug().Str("check", c.Reporter()).Stringer("selector", &l).Str("label", name).Msg("Checking if base metric has historical series with required label") - trsLabelCount, err := c.prom.RangeQuery(ctx, fmt.Sprintf("absent(%s)", l.String()), promapi.NewRelativeRange(settings.lookbackRangeDuration, settings.lookbackStepDuration)) + trsLabelCount, err := c.prom.RangeQuery(ctx, fmt.Sprintf("absent(%s)", l.String()), params) if err != nil { problems = append(problems, c.queryProblem(err, selector.String(), expr)) continue @@ -261,6 +263,16 @@ func (c SeriesCheck) Check(ctx context.Context, path string, rule parser.Rule, e trsLabelCount.Series.FindGaps(promUptime.Series, trsLabelCount.Series.From, trsLabelCount.Series.Until) if trsLabelCount.Series.Ranges.Len() == 1 && len(trsLabelCount.Series.Gaps) == 0 { + var isAbsentOutsideSeriesRange bool + for _, str := range trs.Series.Ranges { + if _, ok := promapi.Overlaps(str, trsLabelCount.Series.Ranges[0], trsLabelCount.Series.Step); !ok { + isAbsentOutsideSeriesRange = true + break + } + } + if isAbsentOutsideSeriesRange { + continue + } problems = append(problems, Problem{ Fragment: selector.String(), Lines: expr.Lines(), @@ -333,7 +345,7 @@ func (c SeriesCheck) Check(ctx context.Context, path string, rule parser.Rule, e } log.Debug().Str("check", c.Reporter()).Stringer("selector", &labelSelector).Stringer("matcher", lm).Msg("Checking if there are historical series matching filter") - trsLabel, err := c.prom.RangeQuery(ctx, fmt.Sprintf("count(%s)", labelSelector.String()), promapi.NewRelativeRange(settings.lookbackRangeDuration, settings.lookbackStepDuration)) + trsLabel, err := c.prom.RangeQuery(ctx, fmt.Sprintf("count(%s)", labelSelector.String()), params) if err != nil { problems = append(problems, c.queryProblem(err, labelSelector.String(), expr)) continue diff --git a/internal/checks/promql_series_test.go b/internal/checks/promql_series_test.go index 6ac635fd..3c8ea8d2 100644 --- a/internal/checks/promql_series_test.go +++ b/internal/checks/promql_series_test.go @@ -700,6 +700,62 @@ func TestSeriesCheck(t *testing.T) { }, }, }, + { + description: "#3 metric disappeared, only one sample, absent race", + content: "- record: foo\n expr: sum(found{job=\"abc\"})\n", + checker: newSeriesCheck, + prometheus: newSimpleProm, + problems: func(uri string) []checks.Problem { + return []checks.Problem{ + { + Fragment: `found`, + Lines: []int{2}, + Reporter: checks.SeriesCheckName, + Text: seriesDisappearedText("prom", uri, `found`, "1w"), + Severity: checks.Bug, + }, + } + }, + mocks: []*prometheusMock{ + { + conds: []requestCondition{ + requireQueryPath, + formCond{key: "query", value: `count(found{job="abc"})`}, + }, + resp: respondWithEmptyVector(), + }, + { + conds: []requestCondition{ + requireRangeQueryPath, + formCond{key: "query", value: `count(found)`}, + }, + resp: matrixResponse{ + samples: []*model.SampleStream{ + generateSampleStream( + map[string]string{}, + time.Now().Add(time.Hour*24*-7), + time.Now().Add(time.Hour*24*-7).Add(time.Minute*5), + time.Minute*5, + ), + }, + }, + }, + { + conds: []requestCondition{ + requireRangeQueryPath, + formCond{key: "query", value: `absent(found{job=~".+"})`}, + }, + resp: respondWithSingleRangeVector1W(), + }, + { + conds: []requestCondition{ + requireRangeQueryPath, + formCond{key: "query", value: "count(up)"}, + }, + resp: respondWithSingleRangeVector1W(), + }, + }, + }, { description: "#4 metric was present but disappeared 50m ago", content: "- record: foo\n expr: sum(found{job=\"foo\", instance=\"bar\"})\n",