Skip to content

Commit

Permalink
Support failover URIs for Prometheus servers
Browse files Browse the repository at this point in the history
  • Loading branch information
prymitive committed Feb 18, 2022
1 parent 9de4633 commit 5b03212
Show file tree
Hide file tree
Showing 22 changed files with 516 additions and 116 deletions.
89 changes: 89 additions & 0 deletions cmd/pint/tests/0055_prometheus_failover.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
exec bash -x ./prometheus.sh &
exec bash -c 'I=0 ; while [ ! -f prometheus.pid ] && [ $I -lt 30 ]; do sleep 1; I=$((I+1)); done'

pint.ok --no-color lint rules
! stdout .
stderr 'level=error msg="Query failed" error="Post \\"http://127.0.0.1:1055/api/v1/query\\": dial tcp 127.0.0.1:1055: connect: connection refused" query=count\(foo\) uri=http://127.0.0.1:1055'
stderr 'level=error msg="Failed to query Prometheus configuration" error="Get \\"http://127.0.0.1:1055/api/v1/status/config\\": dial tcp 127.0.0.1:1055: connect: connection refused" uri=http://127.0.0.1:1055'
stderr 'rules/1.yml:2: query using prom completed without any results for foo \(promql/series\)'

exec sh -c 'cat prometheus.pid | xargs kill'

-- rules/1.yml --
- record: aggregate
expr: sum(foo) without(job)

-- .pint.hcl --
prometheus "prom" {
uri = "http://127.0.0.1:1055"
failover = ["http://127.0.0.1:7055"]
timeout = "5s"
}

-- prometheus.go --
package main

import (
"context"
"log"
"net"
"net/http"
"os"
"os/signal"
"strconv"
"syscall"
"time"
)

// writeJSON sends body as an HTTP 200 response labelled as JSON.
//
// The Content-Type header is set before WriteHeader is called: net/http
// snapshots the header map when WriteHeader runs, so headers added afterwards
// are never sent to the client.
func writeJSON(w http.ResponseWriter, body string) {
	w.Header().Set("Content-Type", "application/json")
	w.WriteHeader(http.StatusOK)
	// Best effort: a failed write only means the client has gone away.
	_, _ = w.Write([]byte(body))
}

// handleConfig answers /api/v1/status/config with a canned successful
// response carrying a minimal Prometheus configuration.
func handleConfig(w http.ResponseWriter, _ *http.Request) {
	writeJSON(w, `{"status":"success","data":{"yaml":"global:\n scrape_interval: 30s\n"}}`)
}

// handleQuery answers /api/v1/query with a canned successful response whose
// vector result is empty.
func handleQuery(w http.ResponseWriter, _ *http.Request) {
	writeJSON(w, `{
"status":"success",
"data":{
"resultType":"vector",
"result":[]
}
}`)
}

// main runs a fake Prometheus API server on 127.0.0.1:7055.
//
// It registers the canned handlers, starts serving, writes its own process ID
// to prometheus.pid once the listener is open, and then blocks until it
// receives SIGINT/SIGTERM or two minutes pass, whichever comes first. It then
// shuts the server down with a five second grace period.
func main() {
	http.HandleFunc("/api/v1/status/config", handleConfig)
	http.HandleFunc("/api/v1/query", handleQuery)

	listener, err := net.Listen("tcp", "127.0.0.1:7055")
	if err != nil {
		log.Fatal(err)
	}

	server := &http.Server{
		Addr: "127.0.0.1:7055",
	}

	go func() {
		// Serve always returns a non-nil error; after Shutdown it is
		// http.ErrServerClosed, which is expected here.
		_ = server.Serve(listener)
	}()

	// The pid file is written only after the listener is open, so its
	// presence tells the calling script that the port is ready.
	pid := os.Getpid()
	err = os.WriteFile("prometheus.pid", []byte(strconv.Itoa(pid)), 0644)
	if err != nil {
		log.Fatal(err)
	}

	stop := make(chan os.Signal, 1)
	// os.Interrupt is SIGINT, so it does not need to be listed twice.
	signal.Notify(stop, os.Interrupt, syscall.SIGTERM)

	// Exit on a signal, or after two minutes as a safety net so a forgotten
	// fixture does not linger forever.
	select {
	case <-stop:
	case <-time.After(2 * time.Minute):
	}

	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()
	if err := server.Shutdown(ctx); err != nil {
		log.Printf("shutting down server: %v", err)
	}
}

-- prometheus.sh --
env GOCACHE=$TMPDIR go run prometheus.go
5 changes: 4 additions & 1 deletion docs/changelog.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
# Changelog

## v0.12.1
## v0.13.0

### Changed

- `prometheus` config block now allows specifying failover URIs using the `failover` field.
If failover URIs are set and the main URI fails to respond, pint will attempt to use them
in the order specified until one of them works.
- Renamed `pint/parse` to `yaml/parse` and added missing documentation for it.

## v0.12.0
Expand Down
13 changes: 10 additions & 3 deletions docs/configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -103,15 +103,22 @@ Syntax:

```js
prometheus "$name" {
uri = "https://..."
timeout = "60s"
paths = ["...", ...]
uri = "https://..."
failover = ["https://...", ...]
timeout = "60s"
paths = ["...", ...]
}
```

- `$name` - each defined server should have a unique name that can be used in check
definitions.
- `uri` - base URI of this Prometheus server, used for API requests and queries.
- `failover` - list of URIs to try (in the order they are specified) if `uri` doesn't respond
to requests or returns an error. This allows configuring failover Prometheus servers
to avoid CI failures in case the main Prometheus server is unreachable.
It's highly recommended that all URIs point to Prometheus servers with identical
configuration, otherwise pint checks might return unreliable results and potential
false positives.
- `timeout` - timeout to be used for API requests.
- `paths` - optional path filter; if specified, only paths matching one of the listed regex
patterns will use this Prometheus server for checks.
Expand Down
4 changes: 2 additions & 2 deletions internal/checks/alerts_count.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ const (
AlertsCheckName = "alerts/count"
)

func NewAlertsCheck(prom *promapi.Prometheus, lookBack, step, resolve time.Duration) AlertsCheck {
func NewAlertsCheck(prom *promapi.FailoverGroup, lookBack, step, resolve time.Duration) AlertsCheck {
return AlertsCheck{
prom: prom,
lookBack: lookBack,
Expand All @@ -24,7 +24,7 @@ func NewAlertsCheck(prom *promapi.Prometheus, lookBack, step, resolve time.Durat
}

type AlertsCheck struct {
prom *promapi.Prometheus
prom *promapi.FailoverGroup
lookBack time.Duration
step time.Duration
resolve time.Duration
Expand Down
17 changes: 8 additions & 9 deletions internal/checks/alerts_count_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ import (
"time"

"github.com/cloudflare/pint/internal/checks"
"github.com/cloudflare/pint/internal/promapi"

"github.com/rs/zerolog"
)
Expand Down Expand Up @@ -86,17 +85,17 @@ func TestAlertsCheck(t *testing.T) {
{
description: "ignores recording rules",
content: "- record: foo\n expr: up == 0\n",
checker: checks.NewAlertsCheck(promapi.NewPrometheus("prom", "http://localhost", time.Second*5), time.Hour*24, time.Minute, time.Minute*5),
checker: checks.NewAlertsCheck(simpleProm("prom", "http://localhost", time.Second*5), time.Hour*24, time.Minute, time.Minute*5),
},
{
description: "ignores rules with syntax errors",
content: "- alert: Foo Is Down\n expr: sum(\n",
checker: checks.NewAlertsCheck(promapi.NewPrometheus("prom", "http://localhost", time.Second*5), time.Hour*24, time.Minute, time.Minute*5),
checker: checks.NewAlertsCheck(simpleProm("prom", "http://localhost", time.Second*5), time.Hour*24, time.Minute, time.Minute*5),
},
{
description: "bad request",
content: content,
checker: checks.NewAlertsCheck(promapi.NewPrometheus("prom", srv.URL+"/400/", time.Second*5), time.Hour*24, time.Minute, time.Minute*5),
checker: checks.NewAlertsCheck(simpleProm("prom", srv.URL+"/400/", time.Second*5), time.Hour*24, time.Minute, time.Minute*5),
problems: []checks.Problem{
{
Fragment: `up{job="foo"} == 0`,
Expand All @@ -110,7 +109,7 @@ func TestAlertsCheck(t *testing.T) {
{
description: "empty response",
content: content,
checker: checks.NewAlertsCheck(promapi.NewPrometheus("prom", srv.URL+"/empty/", time.Second*5), time.Hour*24, time.Minute, time.Minute*5),
checker: checks.NewAlertsCheck(simpleProm("prom", srv.URL+"/empty/", time.Second*5), time.Hour*24, time.Minute, time.Minute*5),
problems: []checks.Problem{
{
Fragment: `up{job="foo"} == 0`,
Expand All @@ -124,7 +123,7 @@ func TestAlertsCheck(t *testing.T) {
{
description: "multiple alerts",
content: content,
checker: checks.NewAlertsCheck(promapi.NewPrometheus("prom", srv.URL+"/alerts/", time.Second*5), time.Hour*24, time.Minute, time.Minute*5),
checker: checks.NewAlertsCheck(simpleProm("prom", srv.URL+"/alerts/", time.Second*5), time.Hour*24, time.Minute, time.Minute*5),
problems: []checks.Problem{
{
Fragment: `up{job="foo"} == 0`,
Expand All @@ -138,7 +137,7 @@ func TestAlertsCheck(t *testing.T) {
{
description: "for: 10m",
content: "- alert: Foo Is Down\n for: 10m\n expr: up{job=\"foo\"} == 0\n",
checker: checks.NewAlertsCheck(promapi.NewPrometheus("prom", srv.URL+"/alerts/", time.Second*5), time.Hour*24, time.Minute*6, time.Minute*10),
checker: checks.NewAlertsCheck(simpleProm("prom", srv.URL+"/alerts/", time.Second*5), time.Hour*24, time.Minute*6, time.Minute*10),
problems: []checks.Problem{
{
Fragment: `up{job="foo"} == 0`,
Expand All @@ -155,7 +154,7 @@ func TestAlertsCheck(t *testing.T) {
- alert: foo
expr: '{__name__="up", job="foo"} == 0'
`,
checker: checks.NewAlertsCheck(promapi.NewPrometheus("prom", srv.URL+"/alerts/", time.Second*5), time.Hour*24, time.Minute*6, time.Minute*10),
checker: checks.NewAlertsCheck(simpleProm("prom", srv.URL+"/alerts/", time.Second*5), time.Hour*24, time.Minute*6, time.Minute*10),
problems: []checks.Problem{
{
Fragment: `{__name__="up", job="foo"} == 0`,
Expand All @@ -172,7 +171,7 @@ func TestAlertsCheck(t *testing.T) {
- alert: foo
expr: '{__name__=~"(up|foo)", job="foo"} == 0'
`,
checker: checks.NewAlertsCheck(promapi.NewPrometheus("prom", srv.URL+"/alerts/", time.Second*5), time.Hour*24, time.Minute*6, time.Minute*10),
checker: checks.NewAlertsCheck(simpleProm("prom", srv.URL+"/alerts/", time.Second*5), time.Hour*24, time.Minute*6, time.Minute*10),
problems: []checks.Problem{
{
Fragment: `{__name__=~"(up|foo)", job="foo"} == 0`,
Expand Down
11 changes: 11 additions & 0 deletions internal/checks/base_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,13 @@ package checks_test
import (
"context"
"testing"
"time"

"github.com/google/go-cmp/cmp"

"github.com/cloudflare/pint/internal/checks"
"github.com/cloudflare/pint/internal/parser"
"github.com/cloudflare/pint/internal/promapi"
)

type checkTest struct {
Expand Down Expand Up @@ -97,3 +99,12 @@ func TestParseSeverity(t *testing.T) {
})
}
}

// simpleProm returns a FailoverGroup called name whose only member is a
// Prometheus server built from the same name, the given uri and timeout.
func simpleProm(name, uri string, timeout time.Duration) *promapi.FailoverGroup {
	servers := []*promapi.Prometheus{promapi.NewPrometheus(name, uri, timeout)}
	return promapi.NewFailoverGroup(name, servers)
}
4 changes: 2 additions & 2 deletions internal/checks/promql_rate.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,12 @@ const (
RateCheckName = "promql/rate"
)

func NewRateCheck(prom *promapi.Prometheus) RateCheck {
func NewRateCheck(prom *promapi.FailoverGroup) RateCheck {
return RateCheck{prom: prom}
}

type RateCheck struct {
prom *promapi.Prometheus
prom *promapi.FailoverGroup
}

func (c RateCheck) String() string {
Expand Down
Loading

0 comments on commit 5b03212

Please sign in to comment.