Skip to content

Commit

Permalink
Refactor promql/series check
Browse files Browse the repository at this point in the history
  • Loading branch information
prymitive committed Mar 9, 2022
1 parent 6bc1eff commit 6de93c2
Show file tree
Hide file tree
Showing 19 changed files with 891 additions and 161 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ test:

.PHONY: debug-testscript
debug-testscript:
for I in ./cmd/pint/tests/*.txt ; do T=`basename "$${I}" | cut -d. -f1`; echo ">>> $${T}" ; go test -count=1 -timeout=30s -v -run=TestScript/$${T} ./cmd/pint ; done
for I in ./cmd/pint/tests/*.txt ; do T=`basename "$${I}" | cut -d. -f1`; echo ">>> $${T}" ; go test -count=1 -timeout=30s -v -run=TestScript/$${T} ./cmd/pint || exit 1 ; done

.PHONY: update-snapshots
update-snapshots:
Expand Down
16 changes: 8 additions & 8 deletions cmd/pint/tests/0054_watch_metrics_prometheus.txt
Original file line number Diff line number Diff line change
Expand Up @@ -64,14 +64,14 @@ pint_check_iterations_total
pint_last_run_time_seconds
# HELP pint_problem Prometheus rule problem reported by pint
# TYPE pint_problem gauge
pint_problem{filename="rules/1.yml",kind="alerting",name="comparison",problem="cound't run \"promql/rate\" checks due to \"prom1\" on http://127.0.0.1:7054 connection error: failed to query Prometheus config: server_error: server error: 500",reporter="promql/rate",severity="bug"}
pint_problem{filename="rules/1.yml",kind="alerting",name="comparison",problem="cound't run \"promql/rate\" checks due to \"prom2\" on http://127.0.0.1:1054 connection error: failed to query Prometheus config: Get \"http://127.0.0.1:1054/api/v1/status/config\": dial tcp 127.0.0.1:1054: connect: connection refused",reporter="promql/rate",severity="bug"}
pint_problem{filename="rules/1.yml",kind="alerting",name="comparison",problem="cound't run \"promql/series\" checks due to \"prom2\" on http://127.0.0.1:1054 connection error: Post \"http://127.0.0.1:1054/api/v1/query\": dial tcp 127.0.0.1:1054: connect: connection refused",reporter="promql/series",severity="bug"}
pint_problem{filename="rules/1.yml",kind="alerting",name="comparison",problem="query using \"prom1\" on http://127.0.0.1:7054 failed with: bad_response: Unmarshal: there are bytes left after unmarshal, error found in #10 byte of ...|y\"\n }Fatal error|..., bigger context ...|:\"bad_data\",\n \"error\":\"bogus query\"\n }Fatal error|...",reporter="promql/series",severity="bug"}
pint_problem{filename="rules/1.yml",kind="recording",name="aggregate",problem="cound't run \"promql/rate\" checks due to \"prom1\" on http://127.0.0.1:7054 connection error: failed to query Prometheus config: server_error: server error: 500",reporter="promql/rate",severity="bug"}
pint_problem{filename="rules/1.yml",kind="recording",name="aggregate",problem="cound't run \"promql/rate\" checks due to \"prom2\" on http://127.0.0.1:1054 connection error: failed to query Prometheus config: Get \"http://127.0.0.1:1054/api/v1/status/config\": dial tcp 127.0.0.1:1054: connect: connection refused",reporter="promql/rate",severity="bug"}
pint_problem{filename="rules/1.yml",kind="recording",name="aggregate",problem="cound't run \"promql/series\" checks due to \"prom2\" on http://127.0.0.1:1054 connection error: Post \"http://127.0.0.1:1054/api/v1/query\": dial tcp 127.0.0.1:1054: connect: connection refused",reporter="promql/series",severity="bug"}
pint_problem{filename="rules/1.yml",kind="recording",name="aggregate",problem="query using \"prom1\" on http://127.0.0.1:7054 failed with: bad_response: Unmarshal: there are bytes left after unmarshal, error found in #10 byte of ...|y\"\n }Fatal error|..., bigger context ...|:\"bad_data\",\n \"error\":\"bogus query\"\n }Fatal error|...",reporter="promql/series",severity="bug"}
pint_problem{filename="rules/1.yml",kind="alerting",name="comparison",problem="cound't run \"promql/rate\" checks due to prometheus \"prom1\" at http://127.0.0.1:7054 connection error: failed to query Prometheus config: server_error: server error: 500",reporter="promql/rate",severity="bug"}
pint_problem{filename="rules/1.yml",kind="alerting",name="comparison",problem="cound't run \"promql/rate\" checks due to prometheus \"prom2\" at http://127.0.0.1:1054 connection error: failed to query Prometheus config: Get \"http://127.0.0.1:1054/api/v1/status/config\": dial tcp 127.0.0.1:1054: connect: connection refused",reporter="promql/rate",severity="bug"}
pint_problem{filename="rules/1.yml",kind="alerting",name="comparison",problem="cound't run \"promql/series\" checks due to prometheus \"prom2\" at http://127.0.0.1:1054 connection error: Post \"http://127.0.0.1:1054/api/v1/query\": dial tcp 127.0.0.1:1054: connect: connection refused",reporter="promql/series",severity="bug"}
pint_problem{filename="rules/1.yml",kind="alerting",name="comparison",problem="prometheus \"prom1\" at http://127.0.0.1:7054 failed with: bad_response: Unmarshal: there are bytes left after unmarshal, error found in #10 byte of ...|y\"\n }Fatal error|..., bigger context ...|:\"bad_data\",\n \"error\":\"bogus query\"\n }Fatal error|...",reporter="promql/series",severity="bug"}
pint_problem{filename="rules/1.yml",kind="recording",name="aggregate",problem="cound't run \"promql/rate\" checks due to prometheus \"prom1\" at http://127.0.0.1:7054 connection error: failed to query Prometheus config: server_error: server error: 500",reporter="promql/rate",severity="bug"}
pint_problem{filename="rules/1.yml",kind="recording",name="aggregate",problem="cound't run \"promql/rate\" checks due to prometheus \"prom2\" at http://127.0.0.1:1054 connection error: failed to query Prometheus config: Get \"http://127.0.0.1:1054/api/v1/status/config\": dial tcp 127.0.0.1:1054: connect: connection refused",reporter="promql/rate",severity="bug"}
pint_problem{filename="rules/1.yml",kind="recording",name="aggregate",problem="cound't run \"promql/series\" checks due to prometheus \"prom2\" at http://127.0.0.1:1054 connection error: Post \"http://127.0.0.1:1054/api/v1/query\": dial tcp 127.0.0.1:1054: connect: connection refused",reporter="promql/series",severity="bug"}
pint_problem{filename="rules/1.yml",kind="recording",name="aggregate",problem="prometheus \"prom1\" at http://127.0.0.1:7054 failed with: bad_response: Unmarshal: there are bytes left after unmarshal, error found in #10 byte of ...|y\"\n }Fatal error|..., bigger context ...|:\"bad_data\",\n \"error\":\"bogus query\"\n }Fatal error|...",reporter="promql/series",severity="bug"}
pint_problem{filename="rules/1.yml",kind="recording",name="broken",problem="syntax error: no arguments for aggregate expression provided",reporter="promql/syntax",severity="fatal"}
# HELP pint_problems Total number of problems reported by pint
# TYPE pint_problems gauge
Expand Down
14 changes: 13 additions & 1 deletion cmd/pint/tests/0055_prometheus_failover.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ pint.error --no-color lint rules
! stdout .
stderr 'level=error msg="Query failed" error="Post \\"http://127.0.0.1:1055/api/v1/query\\": dial tcp 127.0.0.1:1055: connect: connection refused" query=count\(foo\) uri=http://127.0.0.1:1055'
stderr 'level=error msg="Failed to query Prometheus configuration" error="Get \\"http://127.0.0.1:1055/api/v1/status/config\\": dial tcp 127.0.0.1:1055: connect: connection refused" uri=http://127.0.0.1:1055'
stderr 'rules/1.yml:2: query using "prom" on http://127.0.0.1:7055 completed without any results for foo \(promql/series\)'
stderr 'rules/1.yml:2: prometheus "prom" at http://127.0.0.1:7055 didn''t have any series for "foo" metric in the last 1w \(promql/series\)'
exec bash -c 'cat prometheus.pid | xargs kill'

-- rules/1.yml --
Expand Down Expand Up @@ -42,6 +42,18 @@ func main() {
_, _ = w.Write([]byte(`{"status":"success","data":{"yaml":"global:\n scrape_interval: 30s\n"}}`))
})

http.HandleFunc("/api/v1/query_range", func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(200)
w.Header().Set("Content-Type", "application/json")
_, _ = w.Write([]byte(`{
"status":"success",
"data":{
"resultType":"matrix",
"result":[]
}
}`))
})

http.HandleFunc("/api/v1/query", func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(200)
w.Header().Set("Content-Type", "application/json")
Expand Down
4 changes: 2 additions & 2 deletions cmd/pint/tests/0057_watch_metrics_prometheus_ignore.txt
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,8 @@ pint_check_iterations_total
pint_last_run_time_seconds
# HELP pint_problem Prometheus rule problem reported by pint
# TYPE pint_problem gauge
pint_problem{filename="rules/1.yml",kind="alerting",name="comparison",problem="query using \"prom1\" on http://127.0.0.1:7057 failed with: bad_response: Unmarshal: there are bytes left after unmarshal, error found in #10 byte of ...|y\"\n }Fatal error|..., bigger context ...|:\"bad_data\",\n \"error\":\"bogus query\"\n }Fatal error|...",reporter="promql/series",severity="bug"}
pint_problem{filename="rules/1.yml",kind="recording",name="aggregate",problem="query using \"prom1\" on http://127.0.0.1:7057 failed with: bad_response: Unmarshal: there are bytes left after unmarshal, error found in #10 byte of ...|y\"\n }Fatal error|..., bigger context ...|:\"bad_data\",\n \"error\":\"bogus query\"\n }Fatal error|...",reporter="promql/series",severity="bug"}
pint_problem{filename="rules/1.yml",kind="alerting",name="comparison",problem="prometheus \"prom1\" at http://127.0.0.1:7057 failed with: bad_response: Unmarshal: there are bytes left after unmarshal, error found in #10 byte of ...|y\"\n }Fatal error|..., bigger context ...|:\"bad_data\",\n \"error\":\"bogus query\"\n }Fatal error|...",reporter="promql/series",severity="bug"}
pint_problem{filename="rules/1.yml",kind="recording",name="aggregate",problem="prometheus \"prom1\" at http://127.0.0.1:7057 failed with: bad_response: Unmarshal: there are bytes left after unmarshal, error found in #10 byte of ...|y\"\n }Fatal error|..., bigger context ...|:\"bad_data\",\n \"error\":\"bogus query\"\n }Fatal error|...",reporter="promql/series",severity="bug"}
pint_problem{filename="rules/1.yml",kind="recording",name="broken",problem="syntax error: no arguments for aggregate expression provided",reporter="promql/syntax",severity="fatal"}
# HELP pint_problems Total number of problems reported by pint
# TYPE pint_problems gauge
Expand Down
3 changes: 2 additions & 1 deletion docs/changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@
### Changed

- Always print the number of detected problems when running `pint lint`.
- Raise `promql/series` severity from `Warning` to `Bug`.
- `promql/series` check was refactored and will now detect a range of
problems. See [promql/series](checks/promql/series.md) for details.

## v0.14.0

Expand Down
70 changes: 67 additions & 3 deletions docs/checks/promql/series.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,75 @@ grand_parent: Documentation
This check will also query Prometheus servers, it is used to warn about queries
that are using metrics not currently present in Prometheus.
It parses `expr` query from every rule, finds individual metric selectors and
checks if they return any values.
runs a series of checks for each of them.

Let's say we have a rule with this query: `sum(my_metric{foo="bar"}) > 10`.
This checks would query all configured server for the existence of
`my_metric{foo="bar"}` series and report a warning if it's missing.
This check would first try to determine if `my_metric{foo="bar"}`
returns anything via an instant query and, if it doesn't, it will try
to determine why by checking if:

- `my_metric` metric was ever present in Prometheus
- `my_metric` was present but disappeared
- `my_metric` has any series with `foo` label
- `my_metric` has any series matching `foo="bar"`

## Common problems

If you see this check complaining about some metric it might be due to a number
of different issues. Here are some common cases.

### Your query cannot return anything

- You are trying to use a metric that is not present in Prometheus at all.
- Service exporting your metric is not working or no longer being scraped.
- You are querying the wrong Prometheus server.
- You are trying to filter a metric that exists using a label key that is
never present on that metric.
- You are using a label value as a filter, but that value is never present.

If that's the case you need to fix your query. Make sure your metric is present
and it has all the labels you expect to see.

### Metrics you are using have an unstable labeling scheme

Some time series for the same metric will have label `foo` and some won't.
Although there's nothing technically wrong with this and Prometheus allows
you to do so, this makes querying metrics difficult as results containing
label `foo` will be mixed with other results not having that label.
All queries would effectively need a `{foo!=""}` or `{foo=""}` filter to
select only one variant of this metric.

The best solution here is to fix the labeling scheme.

### Metric labels are generated dynamically in response to some activity

Some label values will appear only temporarily, for example if metrics
are generated for serviced HTTP requests and they include some details of
those requests that cannot be known ahead of time, like request path or
method.

When possible this can be addressed by initializing metrics with all known
label values to zero on startup:

```go
func main() {
myMetric = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "http_requests_total",
Help: "Total number of HTTP requests",
},
[]string{"code"},
)
myMetric.WithLabelValues("2xx").Set(0)
myMetric.WithLabelValues("3xx").Set(0)
myMetric.WithLabelValues("4xx").Set(0)
myMetric.WithLabelValues("5xx").Set(0)
}
```

If that's not doable you can let pint know that it's not possible to validate
those queries by disabling this check. See below for instructions on how to do
that.

## Configuration

Expand Down
12 changes: 6 additions & 6 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -12,20 +12,20 @@ require (
github.com/prometheus/client_golang v1.12.1
github.com/prometheus/client_model v0.2.0
github.com/prometheus/common v0.32.1
github.com/prometheus/prometheus v1.8.2-0.20220211202545-56e14463bccf
github.com/prometheus/prometheus v1.8.2-0.20220308163432-03831554a519
github.com/rogpeppe/go-internal v1.8.1
github.com/rs/zerolog v1.26.1
github.com/stretchr/testify v1.7.0
github.com/urfave/cli/v2 v2.3.0
golang.org/x/oauth2 v0.0.0-20211104180415-d3ed0bb246c8
golang.org/x/oauth2 v0.0.0-20220223155221-ee480838109b
gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b
)

require (
github.com/agext/levenshtein v1.2.3 // indirect
github.com/alecthomas/units v0.0.0-20211218093645-b94a6e3cc137 // indirect
github.com/apparentlymart/go-textseg/v13 v13.0.0 // indirect
github.com/aws/aws-sdk-go v1.42.53 // indirect
github.com/aws/aws-sdk-go v1.43.14 // indirect
github.com/beorn7/perks v1.0.1 // indirect
github.com/cespare/xxhash/v2 v2.1.2 // indirect
github.com/cpuguy83/go-md2man/v2 v2.0.1 // indirect
Expand Down Expand Up @@ -64,10 +64,10 @@ require (
github.com/zclconf/go-cty v1.10.0 // indirect
go.uber.org/atomic v1.9.0 // indirect
go.uber.org/goleak v1.1.12 // indirect
golang.org/x/crypto v0.0.0-20220214200702-86341886e292 // indirect
golang.org/x/net v0.0.0-20220127200216-cd36cc0744dd // indirect
golang.org/x/crypto v0.0.0-20220307211146-efcb8507fb70 // indirect
golang.org/x/net v0.0.0-20220225172249-27dd8689420f // indirect
golang.org/x/sync v0.0.0-20210220032951-036812b2e83c // indirect
golang.org/x/sys v0.0.0-20220209214540-3681064d5158 // indirect
golang.org/x/sys v0.0.0-20220307203707-22a9840ba4d7 // indirect
golang.org/x/text v0.3.7 // indirect
google.golang.org/appengine v1.6.7 // indirect
google.golang.org/protobuf v1.27.1 // indirect
Expand Down
Loading

0 comments on commit 6de93c2

Please sign in to comment.