Skip to content

Commit

Permalink
Support setting min-age for promql/series
Browse files Browse the repository at this point in the history
  • Loading branch information
prymitive committed Apr 26, 2022
1 parent 13026ff commit 66f5bda
Show file tree
Hide file tree
Showing 8 changed files with 883 additions and 46 deletions.
14 changes: 14 additions & 0 deletions docs/changelog.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,19 @@
# Changelog

## v0.18.0

### Added

- Allow fine tuning `promql/series` check with extra control comments
`# pint rule/set promql/series min-age ...` and
`# pint rule/set promql/series ignore/label-value ...`
See [promql/series](/docs/checks/promql/series.md) for details.

### Changed

- `promql/series` will now report missing metrics only if they were last seen
over 2 hours ago by default. This can be customized per rule with comments.

## v0.17.7

### Fixed
Expand Down
72 changes: 71 additions & 1 deletion docs/checks/promql/series.md
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,77 @@ that.

## Configuration

This check doesn't have any configuration options.
This check doesn't have any configuration options but it supports a few control
comments that can be placed around each rule.

### min-age

But default this check will report a problem if a metric was present
in Prometheus but disappeared for at least two hours ago.
You can change this duration per Prometheus rule by adding a comment around it.
Syntax:

To set `min-age` for all metrics in a query:

`# pint rule/set promql/series min-age $duration`

To set `min-age` for specific metric:

`# pint rule/set promql/series($metric_name) min-age $duration`

Example:

```yaml
- record: ...
# Report problems if any metric in this query is missing for at least 3 days
# pint rule/set promql/series min-age 3d
expr: sum(foo) / sum(bar)
- record: ...
# Report problems if:
# - metric "foo" is missing for at least 1 hour (defaults)
# - metric "bar{instance=xxx}" is missing for at least 4 hours
# pint rule/set promql/series(bar{instance="xxx"}) min-age 4h
expr: sum(foo) / sum(bar{instance="xxx"})
```

# ignore/label-value

By default pint will report a problem if a rule uses query with a label filter
and the value of that filter query doesn't match anything.
For example `rate(http_errors_total{code="500"}[2m])` will report a problem
if there are no `http_errors_total` series with `code="500"`.
The goal here is to catch typos in label filters or labels with values that
got renamed, but in some cases this will report false positive problems,
especially if label values are exported dynamically, for example after
HTTP status code is observed.
In the `http_errors_total{code="500"}` example if `code` label is generated
based on HTTP responses then there won't be any series with `code="500"` until
there's at least one HTTP response that generated this code.
You can relax pint checks so it doesn't validate if label values for specific
labels are present on any time series.
Syntax

`# pint rule/set promql/series ignore/label-value $labelName`

Example:

```yaml
- alert: ...
# disable code label checks for all metrics used in this rule
# pint rule/set promql/series ignore/label-value code
expr: rate(http_errors_total{code="500"}[2m]) > 0.1
- alert: ...
# disable code label checks for http_errors_total metric
# pint rule/set promql/series(http_errors_total) ignore/label-value code
expr: rate(http_errors_total{code="500"}[2m]) > 0.1
- alert: ...
# disable code label checks only for http_errors_total{code="500"} queries
# pint rule/set promql/series(http_errors_total{code="500"}) ignore/label-value code
expr: rate(http_errors_total{code="500"}[2m]) > 0.1
```

## How to enable it

Expand Down
87 changes: 83 additions & 4 deletions internal/checks/promql_series.go
Original file line number Diff line number Diff line change
Expand Up @@ -214,10 +214,26 @@ func (c SeriesCheck) Check(ctx context.Context, rule parser.Rule, entries []disc
continue
}

// 4. If foo was ALWAYS there but it's NO LONGER there -> BUG
// 4. If foo was ALWAYS there but it's NO LONGER there (for more than min-age) -> BUG
if len(trs.ranges) == 1 &&
!trs.oldest().After(trs.from.Add(rangeStep)) &&
trs.newest().Before(trs.until.Add(rangeStep*-1)) {

minAge, p := c.getMinAge(rule, selector)
if len(p) > 0 {
problems = append(problems, p...)
}

if !trs.newest().Before(trs.until.Add(time.Duration(minAge) * -1)) {
log.Debug().
Str("check", c.Reporter()).
Stringer("selector", &selector).
Str("min-age", output.HumanizeDuration(minAge)).
Str("last-seen", trs.sinceDesc(trs.newest())).
Msg("Series disappeared from prometheus but for less then configured min-age")
continue
}

problems = append(problems, Problem{
Fragment: bareSelector.String(),
Lines: expr.Lines(),
Expand All @@ -227,7 +243,7 @@ func (c SeriesCheck) Check(ctx context.Context, rule parser.Rule, entries []disc
promText(c.prom.Name(), trs.uri), bareSelector.String(), trs.sinceDesc(trs.newest())),
Severity: Bug,
})
log.Debug().Str("check", c.Reporter()).Stringer("selector", &bareSelector).Msg("Series disappeared from prometheus ")
log.Debug().Str("check", c.Reporter()).Stringer("selector", &bareSelector).Msg("Series disappeared from prometheus")
continue
}

Expand All @@ -238,6 +254,10 @@ func (c SeriesCheck) Check(ctx context.Context, rule parser.Rule, entries []disc
if lm.Type != labels.MatchEqual && lm.Type != labels.MatchRegexp {
continue
}
if c.isLabelValueIgnored(rule, selector, lm.Name) {
log.Debug().Stringer("selector", &selector).Str("label", lm.Name).Msg("Label check disabled by comment")
continue
}
labelSelector := promParser.VectorSelector{
Name: metricName,
LabelMatchers: []*labels.Matcher{lm},
Expand Down Expand Up @@ -277,8 +297,24 @@ func (c SeriesCheck) Check(ctx context.Context, rule parser.Rule, entries []disc

// 6. If foo is ALWAYS/SOMETIMES there AND {bar OR baz} used to be there ALWAYS BUT it's NO LONGER there -> BUG
if len(trsLabel.ranges) == 1 &&
!trsLabel.oldest().After(trs.until.Add(rangeLookback-1).Add(rangeStep)) &&
trsLabel.newest().Before(trs.until.Add(rangeStep*-1)) {
!trsLabel.oldest().After(trsLabel.until.Add(rangeLookback-1).Add(rangeStep)) &&
trsLabel.newest().Before(trsLabel.until.Add(rangeStep*-1)) {

minAge, p := c.getMinAge(rule, selector)
if len(p) > 0 {
problems = append(problems, p...)
}

if !trsLabel.newest().Before(trsLabel.until.Add(time.Duration(minAge) * -1)) {
log.Debug().
Str("check", c.Reporter()).
Stringer("selector", &selector).
Str("min-age", output.HumanizeDuration(minAge)).
Str("last-seen", trs.sinceDesc(trsLabel.newest())).
Msg("Series disappeared from prometheus but for less then configured min-age")
continue
}

problems = append(problems, Problem{
Fragment: labelSelector.String(),
Lines: expr.Lines(),
Expand Down Expand Up @@ -394,6 +430,48 @@ func (c SeriesCheck) serieTimeRanges(ctx context.Context, query string, lookback
return tr, nil
}

func (c SeriesCheck) getMinAge(rule parser.Rule, selector promParser.VectorSelector) (minAge time.Duration, problems []Problem) {
minAge = time.Hour * 2

bareSelector := stripLabels(selector)
for _, s := range [][]string{
{"rule/set", c.Reporter(), "min-age"},
{"rule/set", fmt.Sprintf("%s(%s)", c.Reporter(), bareSelector.String()), "min-age"},
{"rule/set", fmt.Sprintf("%s(%s)", c.Reporter(), selector.String()), "min-age"},
} {
if cmt, ok := rule.GetComment(s...); ok {
dur, err := model.ParseDuration(cmt.Value)
if err != nil {
problems = append(problems, Problem{
Fragment: cmt.String(),
Lines: rule.LineRange(),
Reporter: c.Reporter(),
Text: fmt.Sprintf("failed to parse pint comment as duration: %s", err),
Severity: Warning,
})
} else {
minAge = time.Duration(dur)
}
}
}

return minAge, problems
}

func (c SeriesCheck) isLabelValueIgnored(rule parser.Rule, selector promParser.VectorSelector, labelName string) bool {
bareSelector := stripLabels(selector)
for _, s := range []string{
fmt.Sprintf("rule/set %s ignore/label-value %s", c.Reporter(), labelName),
fmt.Sprintf("rule/set %s(%s) ignore/label-value %s", c.Reporter(), bareSelector.String(), labelName),
fmt.Sprintf("rule/set %s(%s) ignore/label-value %s", c.Reporter(), selector.String(), labelName),
} {
if rule.HasComment(s) {
return true
}
}
return false
}

func getSelectors(n *parser.PromQLNode) (selectors []promParser.VectorSelector) {
if node, ok := n.Node.(*promParser.VectorSelector); ok {
// copy node without offset
Expand All @@ -419,6 +497,7 @@ func stripLabels(selector promParser.VectorSelector) promParser.VectorSelector {
for _, lm := range selector.LabelMatchers {
if lm.Name == labels.MetricName {
s.LabelMatchers = append(s.LabelMatchers, lm)
s.Name = lm.Value
}
}
return s
Expand Down
Loading

0 comments on commit 66f5bda

Please sign in to comment.