Skip to content

Commit

Permalink
Add promql/range_query check
Browse files Browse the repository at this point in the history
  • Loading branch information
prymitive committed Jul 19, 2022
1 parent a6d3d59 commit e98c946
Show file tree
Hide file tree
Showing 19 changed files with 1,222 additions and 454 deletions.
8 changes: 4 additions & 4 deletions cmd/pint/tests/0023_enabled_checks.txt
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
pint.error -l debug --no-color lint rules
! stdout .
stderr 'level=debug msg="Configured checks for rule" enabled=\["promql/syntax","alerts/for","alerts/comparison","alerts/template","promql/fragile","promql/regexp","promql/rate\(prom\)","promql/series\(prom\)","promql/vector_matching\(prom\)"\] path=rules/1.yaml rule=one'
stderr 'level=debug msg="Configured checks for rule" enabled=\["promql/syntax","alerts/for","alerts/comparison","alerts/template","promql/fragile","promql/regexp","promql/rate\(prom\)","promql/series\(prom\)","promql/vector_matching\(prom\)"\] path=rules/1.yaml rule=two'
stderr 'level=debug msg="Configured checks for rule" enabled=\["promql/syntax","alerts/for","alerts/comparison","alerts/template","promql/fragile","promql/regexp","promql/rate\(prom\)","promql/series\(prom\)","promql/vector_matching\(prom\)"\] path=rules/2.yaml rule=one'
stderr 'level=debug msg="Configured checks for rule" enabled=\["promql/syntax","alerts/for","alerts/comparison","alerts/template","promql/fragile","promql/regexp","promql/rate\(prom\)","promql/series\(prom\)","promql/vector_matching\(prom\)"\] path=rules/2.yaml rule=two'
stderr 'level=debug msg="Configured checks for rule" enabled=\["promql/syntax","alerts/for","alerts/comparison","alerts/template","promql/fragile","promql/regexp","promql/rate\(prom\)","promql/series\(prom\)","promql/vector_matching\(prom\)"\,"promql/range_query\(prom\)"] path=rules/1.yaml rule=one'
stderr 'level=debug msg="Configured checks for rule" enabled=\["promql/syntax","alerts/for","alerts/comparison","alerts/template","promql/fragile","promql/regexp","promql/rate\(prom\)","promql/series\(prom\)","promql/vector_matching\(prom\)"\,"promql/range_query\(prom\)"] path=rules/1.yaml rule=two'
stderr 'level=debug msg="Configured checks for rule" enabled=\["promql/syntax","alerts/for","alerts/comparison","alerts/template","promql/fragile","promql/regexp","promql/rate\(prom\)","promql/series\(prom\)","promql/vector_matching\(prom\)"\,"promql/range_query\(prom\)"] path=rules/2.yaml rule=one'
stderr 'level=debug msg="Configured checks for rule" enabled=\["promql/syntax","alerts/for","alerts/comparison","alerts/template","promql/fragile","promql/regexp","promql/rate\(prom\)","promql/series\(prom\)","promql/vector_matching\(prom\)"\,"promql/range_query\(prom\)"] path=rules/2.yaml rule=two'

-- rules/1.yaml --
- record: one
Expand Down
1 change: 1 addition & 0 deletions cmd/pint/tests/0025_config.txt
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ level=info msg="Loading configuration file" path=.pint.hcl
"promql/aggregate",
"alerts/comparison",
"promql/fragile",
"promql/range_query",
"promql/rate",
"promql/regexp",
"promql/syntax",
Expand Down
1 change: 1 addition & 0 deletions cmd/pint/tests/0043_watch_cancel.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ pint.ok --no-color watch --interval=1h --listen=127.0.0.1:6043 --pidfile=pint.pi
! stdout .
stderr 'level=info msg="Shutting down"'
stderr 'level=error msg="Query returned an error" error="failed to query Prometheus config: Get \\"http://127.0.0.1:7043/api/v1/status/config\\": context canceled" query=/api/v1/status/config uri=http://127.0.0.1:7043'
stderr 'level=error msg="Query returned an error" error="failed to query Prometheus flags: Get \\"http://127.0.0.1:7043/api/v1/status/flags\\": context canceled" query=/api/v1/status/flags uri=http://127.0.0.1:7043'
stderr 'level=error msg="Query returned an error" error="Post \\"http://127.0.0.1:7043/api/v1/query\\": context canceled" query=count\(foo\) uri=http://127.0.0.1:7043'
stderr 'level=info msg="Waiting for all background tasks to finish"'
stderr 'level=info msg="Background worker finished"'
Expand Down
10 changes: 10 additions & 0 deletions cmd/pint/tests/0054_watch_metrics_prometheus.txt
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ pint_check_duration_seconds_sum{check="alerts/template"}
pint_check_duration_seconds_count{check="alerts/template"}
pint_check_duration_seconds_sum{check="promql/fragile"}
pint_check_duration_seconds_count{check="promql/fragile"}
pint_check_duration_seconds_sum{check="promql/range_query"}
pint_check_duration_seconds_count{check="promql/range_query"}
pint_check_duration_seconds_sum{check="promql/rate"}
pint_check_duration_seconds_count{check="promql/rate"}
pint_check_duration_seconds_sum{check="promql/regexp"}
Expand Down Expand Up @@ -77,11 +79,13 @@ pint_last_run_duration_seconds
pint_last_run_time_seconds
# HELP pint_problem Prometheus rule problem reported by pint
# TYPE pint_problem gauge
pint_problem{filename="rules/1.yml",kind="recording",name="aggregate",owner="",problem="cound't run \"promql/range_query\" checks due to prometheus \"prom2\" at http://127.0.0.1:1054 connection error: connection refused",reporter="promql/range_query",severity="bug"}
pint_problem{filename="rules/1.yml",kind="recording",name="aggregate",owner="",problem="cound't run \"promql/rate\" checks due to prometheus \"prom1\" at http://127.0.0.1:7054 connection error: server_error: server error: 500",reporter="promql/rate",severity="bug"}
pint_problem{filename="rules/1.yml",kind="recording",name="aggregate",owner="",problem="cound't run \"promql/rate\" checks due to prometheus \"prom2\" at http://127.0.0.1:1054 connection error: connection refused",reporter="promql/rate",severity="bug"}
pint_problem{filename="rules/1.yml",kind="recording",name="aggregate",owner="",problem="cound't run \"promql/series\" checks due to prometheus \"prom2\" at http://127.0.0.1:1054 connection error: connection refused",reporter="promql/series",severity="bug"}
pint_problem{filename="rules/1.yml",kind="recording",name="aggregate",owner="",problem="prometheus \"prom1\" at http://127.0.0.1:7054 failed with: bad_response: Unmarshal: there are bytes left after unmarshal, error found in #10 byte of ...|y\"\n }Fatal error|..., bigger context ...|:\"bad_data\",\n \"error\":\"bogus query\"\n }Fatal error|...",reporter="promql/series",severity="bug"}
pint_problem{filename="rules/1.yml",kind="recording",name="broken",owner="",problem="syntax error: no arguments for aggregate expression provided",reporter="promql/syntax",severity="fatal"}
pint_problem{filename="rules/2.yml",kind="alerting",name="comparison",owner="bob and alice",problem="cound't run \"promql/range_query\" checks due to prometheus \"prom2\" at http://127.0.0.1:1054 connection error: connection refused",reporter="promql/range_query",severity="bug"}
pint_problem{filename="rules/2.yml",kind="alerting",name="comparison",owner="bob and alice",problem="cound't run \"promql/rate\" checks due to prometheus \"prom1\" at http://127.0.0.1:7054 connection error: server_error: server error: 500",reporter="promql/rate",severity="bug"}
pint_problem{filename="rules/2.yml",kind="alerting",name="comparison",owner="bob and alice",problem="cound't run \"promql/rate\" checks due to prometheus \"prom2\" at http://127.0.0.1:1054 connection error: connection refused",reporter="promql/rate",severity="bug"}
pint_problem{filename="rules/2.yml",kind="alerting",name="comparison",owner="bob and alice",problem="cound't run \"promql/series\" checks due to prometheus \"prom2\" at http://127.0.0.1:1054 connection error: connection refused",reporter="promql/series",severity="bug"}
Expand All @@ -95,18 +99,24 @@ pint_prometheus_queries_running{endpoint="/api/v1/query",name="prom1"}
pint_prometheus_queries_running{endpoint="/api/v1/query",name="prom2"}
pint_prometheus_queries_running{endpoint="/api/v1/status/config",name="prom1"}
pint_prometheus_queries_running{endpoint="/api/v1/status/config",name="prom2"}
pint_prometheus_queries_running{endpoint="/api/v1/status/flags",name="prom1"}
pint_prometheus_queries_running{endpoint="/api/v1/status/flags",name="prom2"}
# HELP pint_prometheus_queries_total Total number of all prometheus queries
# TYPE pint_prometheus_queries_total counter
pint_prometheus_queries_total{endpoint="/api/v1/query",name="prom1"}
pint_prometheus_queries_total{endpoint="/api/v1/query",name="prom2"}
pint_prometheus_queries_total{endpoint="/api/v1/status/config",name="prom1"}
pint_prometheus_queries_total{endpoint="/api/v1/status/config",name="prom2"}
pint_prometheus_queries_total{endpoint="/api/v1/status/flags",name="prom1"}
pint_prometheus_queries_total{endpoint="/api/v1/status/flags",name="prom2"}
# HELP pint_prometheus_query_errors_total Total number of failed prometheus queries
# TYPE pint_prometheus_query_errors_total counter
pint_prometheus_query_errors_total{endpoint="/api/v1/query",name="prom1",reason="api/bad_response"}
pint_prometheus_query_errors_total{endpoint="/api/v1/query",name="prom2",reason="connection/error"}
pint_prometheus_query_errors_total{endpoint="/api/v1/status/config",name="prom1",reason="api/server_error"}
pint_prometheus_query_errors_total{endpoint="/api/v1/status/config",name="prom2",reason="connection/error"}
pint_prometheus_query_errors_total{endpoint="/api/v1/status/flags",name="prom1",reason="api/client_error"}
pint_prometheus_query_errors_total{endpoint="/api/v1/status/flags",name="prom2",reason="connection/error"}
# HELP pint_rules_parsed_total Total number of rules parsed since startup
# TYPE pint_rules_parsed_total counter
pint_rules_parsed_total{kind="alerting"}
Expand Down
2 changes: 1 addition & 1 deletion cmd/pint/tests/0056_prometheus_required.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ pint.ok -l debug --no-color lint rules
! stdout .
stderr 'level=error msg="Query returned an error" error="Post \\"https:///api/v1/query\\": http: no Host in request URL" query=count\(up\) uri=https://'
stderr 'level=error msg="Query returned an error" error="failed to query Prometheus config: Get \\"https:///api/v1/status/config\\": http: no Host in request URL" query=/api/v1/status/config uri=https://'
stderr 'level=info msg="Problems found" Warning=8'
stderr 'level=info msg="Problems found" Warning=12'

-- rules/1.yaml --
- record: one
Expand Down
8 changes: 8 additions & 0 deletions cmd/pint/tests/0057_watch_metrics_prometheus_ignore.txt
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,8 @@ pint_check_duration_seconds_sum{check="alerts/template"}
pint_check_duration_seconds_count{check="alerts/template"}
pint_check_duration_seconds_sum{check="promql/fragile"}
pint_check_duration_seconds_count{check="promql/fragile"}
pint_check_duration_seconds_sum{check="promql/range_query"}
pint_check_duration_seconds_count{check="promql/range_query"}
pint_check_duration_seconds_sum{check="promql/rate"}
pint_check_duration_seconds_count{check="promql/rate"}
pint_check_duration_seconds_sum{check="promql/regexp"}
Expand Down Expand Up @@ -87,18 +89,24 @@ pint_prometheus_queries_running{endpoint="/api/v1/query",name="prom1"}
pint_prometheus_queries_running{endpoint="/api/v1/query",name="prom2"}
pint_prometheus_queries_running{endpoint="/api/v1/status/config",name="prom1"}
pint_prometheus_queries_running{endpoint="/api/v1/status/config",name="prom2"}
pint_prometheus_queries_running{endpoint="/api/v1/status/flags",name="prom1"}
pint_prometheus_queries_running{endpoint="/api/v1/status/flags",name="prom2"}
# HELP pint_prometheus_queries_total Total number of all prometheus queries
# TYPE pint_prometheus_queries_total counter
pint_prometheus_queries_total{endpoint="/api/v1/query",name="prom1"}
pint_prometheus_queries_total{endpoint="/api/v1/query",name="prom2"}
pint_prometheus_queries_total{endpoint="/api/v1/status/config",name="prom1"}
pint_prometheus_queries_total{endpoint="/api/v1/status/config",name="prom2"}
pint_prometheus_queries_total{endpoint="/api/v1/status/flags",name="prom1"}
pint_prometheus_queries_total{endpoint="/api/v1/status/flags",name="prom2"}
# HELP pint_prometheus_query_errors_total Total number of failed prometheus queries
# TYPE pint_prometheus_query_errors_total counter
pint_prometheus_query_errors_total{endpoint="/api/v1/query",name="prom1",reason="api/bad_response"}
pint_prometheus_query_errors_total{endpoint="/api/v1/query",name="prom2",reason="connection/error"}
pint_prometheus_query_errors_total{endpoint="/api/v1/status/config",name="prom1",reason="api/server_error"}
pint_prometheus_query_errors_total{endpoint="/api/v1/status/config",name="prom2",reason="connection/error"}
pint_prometheus_query_errors_total{endpoint="/api/v1/status/flags",name="prom1",reason="api/client_error"}
pint_prometheus_query_errors_total{endpoint="/api/v1/status/flags",name="prom2",reason="connection/error"}
# HELP pint_rules_parsed_total Total number of rules parsed since startup
# TYPE pint_rules_parsed_total counter
pint_rules_parsed_total{kind="alerting"}
Expand Down
15 changes: 12 additions & 3 deletions cmd/pint/tests/0080_lint_online.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,16 @@ exec bash -c 'cat prometheus.pid | xargs kill'
-- stderr.txt --
level=info msg="Loading configuration file" path=.pint.hcl
level=info msg="File parsed" path=rules/1.yml rules=1
rules/1.yml:2: http_errors_total[2d] selector is trying to query Prometheus for 2d worth of metrics, but prometheus "prom1" at http://127.0.0.1:7080 is configured to only keep 1d of metrics history (promql/range_query)
expr: rate(http_errors_total[2d]) > 0

rules/1.yml:2: prometheus "prom1" at http://127.0.0.1:7080 didn't have any series for "http_errors_total" metric in the last 1w. Metric name "http_errors_total" matches "promql/series" check ignore regexp "^.+_errors_.+$" (promql/series)
expr: rate(http_errors_total[2m]) > 0
expr: rate(http_errors_total[2d]) > 0

level=info msg="Problems found" Warning=1
level=info msg="Problems found" Warning=2
-- rules/1.yml --
- alert: http errors
expr: rate(http_errors_total[2m]) > 0
expr: rate(http_errors_total[2d]) > 0

-- .pint.hcl --
prometheus "prom1" {
Expand Down Expand Up @@ -63,6 +66,12 @@ func main() {
_, _ = w.Write([]byte(`{"status":"success","data":{"yaml":"global:\n scrape_interval: 30s\n"}}`))
})

http.HandleFunc("/api/v1/status/flags", func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(200)
w.Header().Set("Content-Type", "application/json")
_, _ = w.Write([]byte(`{"status":"success","data":{"--storage.tsdb.retention.time": "1d"}}`))
})

http.HandleFunc("/api/v1/query_range", func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(200)
w.Header().Set("Content-Type", "application/json")
Expand Down
6 changes: 6 additions & 0 deletions docs/changelog.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
# Changelog

## v0.26.0

### Added

- [promql/range_query](checks/promql/range_query.md) check.

## v0.25.0

### Changed
Expand Down
89 changes: 89 additions & 0 deletions docs/checks/promql/range_query.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
---
layout: default
parent: Checks
grand_parent: Documentation
---

# promql/range_query

This check inspects range query selectors on all queries.
It will warn if a query tries to request a time range that
is bigger than Prometheus retention limits.

By default Prometheus keeps [15 days of data](https://prometheus.io/docs/prometheus/latest/storage/#operational-aspects),
this can be customized by setting time or disk space limits.
There are two main ways of configuring retention limits in Prometheus:
* time based - Prometheus will keep last N days of metrics
* disk based - Prometheus will try to use up to N bytes of disk space.

Pint will ignore any disk space limits, since that doesn't tell us
what the effective time retention is.
But it will check the value of `--storage.tsdb.retention.time` flag passed
to Prometheus and it will warn if any selector tries to query more
data then Prometheus can store.

For example if Prometheus is running with `--storage.tsdb.retention.time=30d`
then it will store up to 30 days of historical metrics data.
If we would try to query `foo[40d]` then that query can only return up
to 30 days of data, it will never return more.

This usually isn't really a problem but can indicate a mismatch between
expectations of data retention and reality, and so you might think that by
getting results of a `avg_over_time(foo[40d])` you are getting the average
value of `foo` in the last 40 days, but in reality you're only getting
an average value in the last 30 days, and you cannot get any more than that.

## Configuration

This check doesn't have any configuration options.

## How to enable it

This check is enabled by default for all configured Prometheus servers.

Example:

```js
prometheus "prod" {
uri = "https://prometheus-prod.example.com"
timeout = "60s"
paths = [
"rules/prod/.*",
"rules/common/.*",
]
}

prometheus "dev" {
uri = "https://prometheus-dev.example.com"
timeout = "30s"
paths = [
"rules/dev/.*",
"rules/common/.*",
]
}
```

## How to disable it

You can disable this check globally by adding this config block:

```js
checks {
disabled = ["promql/range_query"]
}
```

Or you can disable it per rule by adding a comment to it:

`# pint disable promql/range_query`

If you want to disable only individual instances of this check
you can add a more specific comment.

`# pint disable promql/range_query($prometheus)`

Where `$prometheus` is the name of Prometheus server to disable.

Example:

`# pint disable promql/range_query(prod)`
2 changes: 2 additions & 0 deletions internal/checks/base.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ var (
AggregationCheckName,
ComparisonCheckName,
FragileCheckName,
RangeQueryCheckName,
RateCheckName,
RegexpCheckName,
SyntaxCheckName,
Expand All @@ -30,6 +31,7 @@ var (
}
OnlineChecks = []string{
AlertsCheckName,
RangeQueryCheckName,
RateCheckName,
VectorMatchingCheckName,
CostCheckName,
Expand Down
22 changes: 22 additions & 0 deletions internal/checks/base_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,7 @@ func (fc formCond) isMatch(r *http.Request) bool {

var (
requireConfigPath = requestPathCond{path: "/api/v1/status/config"}
requireFlagsPath = requestPathCond{path: "/api/v1/status/flags"}
requireQueryPath = requestPathCond{path: "/api/v1/query"}
requireRangeQueryPath = requestPathCond{path: "/api/v1/query_range"}
requireMetadataPath = requestPathCond{path: "/api/v1/metadata"}
Expand Down Expand Up @@ -379,6 +380,27 @@ func (cr configResponse) respond(w http.ResponseWriter, r *http.Request) {
_, _ = w.Write(d)
}

type flagsResponse struct {
flags map[string]string
}

func (fg flagsResponse) respond(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(200)
w.Header().Set("Content-Type", "application/json")
result := struct {
Status string `json:"status"`
Data v1.FlagsResult `json:"data"`
}{
Status: "success",
Data: fg.flags,
}
d, err := json.MarshalIndent(result, "", " ")
if err != nil {
panic(err)
}
_, _ = w.Write(d)
}

type metadataResponse struct {
metadata map[string][]v1.Metadata
}
Expand Down
Loading

0 comments on commit e98c946

Please sign in to comment.