Skip to content

Commit

Permalink
Add pint_prometheus_query_errors_total metric
Browse files Browse the repository at this point in the history
  • Loading branch information
prymitive committed Feb 11, 2022
1 parent ecc0138 commit a10c5b4
Show file tree
Hide file tree
Showing 7 changed files with 210 additions and 1 deletion.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
### Added

- Added `promql/regexp` check that will warn about unnecessary regexp matchers.
- Added `pint_prometheus_queries_total` and `pint_prometheus_query_errors_total`
metrics when running `pint watch`.

## v0.10.1

Expand Down
150 changes: 150 additions & 0 deletions cmd/pint/tests/0054_watch_metrics_prometheus.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
# Start the fake Prometheus server (prometheus.go) in the background and wait
# until it has written prometheus.pid, which it does only after it has bound
# its listener.
exec bash -x ./prometheus.sh &
exec bash -c 'while [ ! -f prometheus.pid ]; do sleep 1 ; done'

# Start the helper that scrapes pint's /metrics endpoint into curl.txt and then
# kills both pint and the fake Prometheus server.
exec bash -x ./test.sh &

# Run pint in the foreground; it keeps running until test.sh kills it using
# pint.pid. NOTE(review): pint.ok presumably asserts a successful exit - confirm
# against the test harness.
pint.ok watch --listen=:6054 --pidfile=pint.pid rules
# The scraped (value-stripped) metrics must match the expected listing exactly.
cmp curl.txt metrics.txt

-- test.sh --
# Wait before scraping - presumably long enough for pint to start and finish at
# least one check iteration; TODO confirm 15s is sufficient on slow runners.
sleep 15
# Scrape pint's metrics and keep only lines mentioning pint_ (this includes the
# "# HELP" / "# TYPE" lines). The perl substitution drops the trailing numeric
# sample value from lines that start with a letter, so only metric names and
# labels are compared.
curl -s http://127.0.0.1:6054/metrics | grep 'pint_' | perl -pe "s/^([a-zA-Z].+)[ ]([0-9\.\-\+eE]+)$/\1/g" > curl.txt
# Stop pint and the fake Prometheus server using their pid files.
cat pint.pid | xargs kill
cat prometheus.pid | xargs kill

-- rules/1.yml --
- record: broken
expr: foo / count())

- record: aggregate
expr: sum(foo) without(job)

- alert: comparison
expr: foo

-- .pint.hcl --
prometheus "prom1" {
uri = "http://127.0.0.1:7054"
timeout = "5s"
}

prometheus "prom2" {
uri = "http://127.0.0.1:1054"
timeout = "5s"
}

-- metrics.txt --
# HELP pint_check_duration_seconds How long did a check took to complete
# TYPE pint_check_duration_seconds summary
pint_check_duration_seconds_sum{check="alerts/for"}
pint_check_duration_seconds_count{check="alerts/for"}
pint_check_duration_seconds_sum{check="alerts/template"}
pint_check_duration_seconds_count{check="alerts/template"}
pint_check_duration_seconds_sum{check="promql/comparison"}
pint_check_duration_seconds_count{check="promql/comparison"}
pint_check_duration_seconds_sum{check="promql/fragile"}
pint_check_duration_seconds_count{check="promql/fragile"}
pint_check_duration_seconds_sum{check="promql/rate"}
pint_check_duration_seconds_count{check="promql/rate"}
pint_check_duration_seconds_sum{check="promql/regexp"}
pint_check_duration_seconds_count{check="promql/regexp"}
pint_check_duration_seconds_sum{check="promql/series"}
pint_check_duration_seconds_count{check="promql/series"}
pint_check_duration_seconds_sum{check="promql/syntax"}
pint_check_duration_seconds_count{check="promql/syntax"}
pint_check_duration_seconds_sum{check="promql/vector_matching"}
pint_check_duration_seconds_count{check="promql/vector_matching"}
# HELP pint_check_iterations_total Total number of completed check iterations since pint start
# TYPE pint_check_iterations_total counter
pint_check_iterations_total
# HELP pint_problem Prometheus rule problem reported by pint
# TYPE pint_problem gauge
pint_problem{kind="alerting",name="comparison",problem="failed to query prom1 prometheus config: failed to query Prometheus config: server_error: server error: 500",reporter="promql/rate",severity="bug"}
pint_problem{kind="alerting",name="comparison",problem="failed to query prom2 prometheus config: failed to query Prometheus config: Get \"http://127.0.0.1:1054/api/v1/status/config\": dial tcp 127.0.0.1:1054: connect: connection refused",reporter="promql/rate",severity="bug"}
pint_problem{kind="alerting",name="comparison",problem="query using prom1 failed with: bad_response: Unmarshal: there are bytes left after unmarshal, error found in #10 byte of ...|ry\"\n }Fatal error|..., bigger context ...|pe\":\"bad_data\",\n \"error\":\"bogus query\"\n }Fatal error|...",reporter="promql/series",severity="bug"}
pint_problem{kind="alerting",name="comparison",problem="query using prom2 failed with: Post \"http://127.0.0.1:1054/api/v1/query\": dial tcp 127.0.0.1:1054: connect: connection refused",reporter="promql/series",severity="bug"}
pint_problem{kind="recording",name="aggregate",problem="failed to query prom1 prometheus config: failed to query Prometheus config: server_error: server error: 500",reporter="promql/rate",severity="bug"}
pint_problem{kind="recording",name="aggregate",problem="failed to query prom2 prometheus config: failed to query Prometheus config: Get \"http://127.0.0.1:1054/api/v1/status/config\": dial tcp 127.0.0.1:1054: connect: connection refused",reporter="promql/rate",severity="bug"}
pint_problem{kind="recording",name="aggregate",problem="query using prom1 failed with: bad_response: Unmarshal: there are bytes left after unmarshal, error found in #10 byte of ...|ry\"\n }Fatal error|..., bigger context ...|pe\":\"bad_data\",\n \"error\":\"bogus query\"\n }Fatal error|...",reporter="promql/series",severity="bug"}
pint_problem{kind="recording",name="aggregate",problem="query using prom2 failed with: Post \"http://127.0.0.1:1054/api/v1/query\": dial tcp 127.0.0.1:1054: connect: connection refused",reporter="promql/series",severity="bug"}
pint_problem{kind="recording",name="broken",problem="syntax error: no arguments for aggregate expression provided",reporter="promql/syntax",severity="fatal"}
# HELP pint_problems Total number of problems reported by pint
# TYPE pint_problems gauge
pint_problems
# HELP pint_prometheus_queries_total Total number of all prometheus queries
# TYPE pint_prometheus_queries_total counter
pint_prometheus_queries_total{endpoint="/api/v1/query",name="prom1"}
pint_prometheus_queries_total{endpoint="/api/v1/query",name="prom2"}
pint_prometheus_queries_total{endpoint="/api/v1/status/config",name="prom1"}
pint_prometheus_queries_total{endpoint="/api/v1/status/config",name="prom2"}
# HELP pint_prometheus_query_errors_total Total number of failed prometheus queries
# TYPE pint_prometheus_query_errors_total counter
pint_prometheus_query_errors_total{endpoint="/api/v1/query",name="prom1",reason="api/bad_response"}
pint_prometheus_query_errors_total{endpoint="/api/v1/query",name="prom2",reason="connection/error"}
pint_prometheus_query_errors_total{endpoint="/api/v1/status/config",name="prom1",reason="api/server_error"}
pint_prometheus_query_errors_total{endpoint="/api/v1/status/config",name="prom2",reason="connection/error"}
# HELP pint_version Version information
# TYPE pint_version gauge
pint_version{version="unknown"}
-- prometheus.go --
package main

import (
"context"
"io"
"log"
"net"
"net/http"
"os"
"os/signal"
"strconv"
"syscall"
"time"
)

// main runs a fake Prometheus API server on 127.0.0.1:7054 whose endpoints
// always fail, so that the test can observe how pint reports query errors.
//
// Once the listener is bound it writes its own pid to prometheus.pid, which
// the test script uses both as a readiness signal and to stop the server.
// The process exits after receiving SIGINT or SIGTERM.
func main() {
	// Config endpoint: always answers 500 with a plain-text body.
	http.HandleFunc("/api/v1/status/config", func(w http.ResponseWriter, r *http.Request) {
		w.WriteHeader(500)
		// Best effort: a write failure only means the client went away.
		_, _ = io.WriteString(w, "Fatal error")
	})

	// Query endpoint: answers 400 with a JSON error document followed by
	// trailing garbage, so the body as a whole is not valid JSON.
	http.HandleFunc("/api/v1/query", func(w http.ResponseWriter, r *http.Request) {
		// Headers must be set before WriteHeader; changes made to the header
		// map afterwards are not sent to the client.
		w.Header().Set("Content-Type", "application/json")
		w.WriteHeader(400)
		_, _ = w.Write([]byte(`{
"status":"error",
"errorType":"bad_data",
"error":"bogus query"
}`))
		// Deliberately appended after the JSON document to corrupt the response.
		_, _ = io.WriteString(w, "Fatal error")
	})

	// Bind before writing the pid file so the file's existence implies the
	// server is accepting connections.
	listener, err := net.Listen("tcp", "127.0.0.1:7054")
	if err != nil {
		log.Fatal(err)
	}

	server := &http.Server{
		Addr: "127.0.0.1:7054",
	}

	go func() {
		// Serve returns once Shutdown is called; its error is not actionable here.
		_ = server.Serve(listener)
	}()

	pid := os.Getpid()
	err = os.WriteFile("prometheus.pid", []byte(strconv.Itoa(pid)), 0644)
	if err != nil {
		log.Fatal(err)
	}

	// os.Interrupt is SIGINT, so listing syscall.SIGINT as well is redundant.
	stop := make(chan os.Signal, 1)
	signal.Notify(stop, os.Interrupt, syscall.SIGTERM)
	<-stop

	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()
	if err := server.Shutdown(ctx); err != nil {
		log.Printf("server shutdown failed: %v", err)
	}
}

-- prometheus.sh --
# Build and run the fake Prometheus server, pointing the Go build cache at
# $TMPDIR. NOTE(review): presumably because the test sandbox has no usable
# default cache directory - confirm.
env GOCACHE=$TMPDIR go run prometheus.go
2 changes: 2 additions & 0 deletions cmd/pint/watch.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import (
"github.com/cloudflare/pint/internal/checks"
"github.com/cloudflare/pint/internal/config"
"github.com/cloudflare/pint/internal/discovery"
"github.com/cloudflare/pint/internal/promapi"
"github.com/cloudflare/pint/internal/reporter"

"github.com/prometheus/client_golang/prometheus"
Expand Down Expand Up @@ -125,6 +126,7 @@ func actionWatch(c *cli.Context) (err error) {
prometheus.MustRegister(checkDuration)
prometheus.MustRegister(checkIterationsTotal)
prometheus.MustRegister(pintVersion)
promapi.RegisterMetrics()
pintVersion.WithLabelValues(version).Set(1)
http.Handle("/metrics", promhttp.Handler())
listen := c.String(listenFlag)
Expand Down
3 changes: 3 additions & 0 deletions internal/promapi/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,14 +36,17 @@ func (p *Prometheus) Config(ctx context.Context) (*PrometheusConfig, error) {
ctx, cancel := context.WithTimeout(ctx, p.timeout)
defer cancel()

prometheusQueriesTotal.WithLabelValues(p.name, "/api/v1/status/config")
resp, err := p.api.Config(ctx)
if err != nil {
log.Error().Err(err).Str("uri", p.uri).Msg("Failed to query Prometheus configuration")
prometheusQueryErrorsTotal.WithLabelValues(p.name, "/api/v1/status/config", errReason(err))
return nil, fmt.Errorf("failed to query Prometheus config: %w", err)
}

var cfg PrometheusConfig
if err = yaml.Unmarshal([]byte(resp.YAML), &cfg); err != nil {
prometheusQueryErrorsTotal.WithLabelValues(p.name, "/api/v1/status/config", errReason(err))
return nil, fmt.Errorf("failed to decode config data in %s response: %w", p.uri, err)
}

Expand Down
46 changes: 46 additions & 0 deletions internal/promapi/metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
package promapi

import (
"errors"
"fmt"
"net"

v1 "github.com/prometheus/client_golang/api/prometheus/v1"
"github.com/prometheus/client_golang/prometheus"
)

// prometheusQueriesTotal counts Prometheus API queries, labelled by the
// configured server name and the API endpoint that was called.
var prometheusQueriesTotal = prometheus.NewCounterVec(
	prometheus.CounterOpts{
		Name: "pint_prometheus_queries_total",
		Help: "Total number of all prometheus queries",
	},
	[]string{"name", "endpoint"},
)

// prometheusQueryErrorsTotal counts failed Prometheus API queries, labelled by
// the configured server name, the API endpoint and the failure reason.
var prometheusQueryErrorsTotal = prometheus.NewCounterVec(
	prometheus.CounterOpts{
		Name: "pint_prometheus_query_errors_total",
		Help: "Total number of failed prometheus queries",
	},
	[]string{"name", "endpoint", "reason"},
)

// RegisterMetrics registers this package's query counters with the default
// Prometheus registry. It panics if either collector cannot be registered.
func RegisterMetrics() {
	prometheus.MustRegister(
		prometheusQueriesTotal,
		prometheusQueryErrorsTotal,
	)
}

// errReason turns a query error into a short reason string:
//
//   - "connection/timeout" when the error chain contains a net.Error that
//     reports a timeout,
//   - "api/<type>" when the error chain contains a *v1.Error, using its Type,
//   - "connection/error" for everything else.
func errReason(err error) string {
	var (
		netErr net.Error
		apiErr *v1.Error
	)

	// Cases are evaluated top to bottom, so a timeout takes precedence over
	// an API error.
	switch {
	case errors.As(err, &netErr) && netErr.Timeout():
		return "connection/timeout"
	case errors.As(err, &apiErr):
		return fmt.Sprintf("api/%s", apiErr.Type)
	default:
		return "connection/error"
	}
}
5 changes: 4 additions & 1 deletion internal/promapi/query.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ func (p *Prometheus) Query(ctx context.Context, expr string) (*QueryResult, erro
ctx, cancel := context.WithTimeout(ctx, p.timeout)
defer cancel()

prometheusQueriesTotal.WithLabelValues(p.name, "/api/v1/query")
start := time.Now()
result, _, err := p.api.Query(ctx, expr, start)
duration := time.Since(start)
Expand All @@ -45,6 +46,7 @@ func (p *Prometheus) Query(ctx context.Context, expr string) (*QueryResult, erro
Str("uri", p.uri).
Str("query", expr).
Msg("Query failed")
prometheusQueryErrorsTotal.WithLabelValues(p.name, "/api/v1/query", errReason(err))
return nil, err
}

Expand All @@ -57,7 +59,8 @@ func (p *Prometheus) Query(ctx context.Context, expr string) (*QueryResult, erro
vectorVal := result.(model.Vector)
qr.Series = vectorVal
default:
log.Error().Err(err).Str("uri", p.uri).Str("query", expr).Msgf("Query returned unknown result type: %v", result.Type())
log.Error().Str("uri", p.uri).Str("query", expr).Msgf("Query returned unknown result type: %v", result.Type())
prometheusQueryErrorsTotal.WithLabelValues(p.name, "/api/v1/query", "unknown result type")
return nil, fmt.Errorf("unknown result type: %v", result.Type())
}
log.Debug().Str("uri", p.uri).Str("query", expr).Int("series", len(qr.Series)).Msg("Parsed response")
Expand Down
3 changes: 3 additions & 0 deletions internal/promapi/range.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ func (p *Prometheus) RangeQuery(ctx context.Context, expr string, start, end tim
rctx, cancel := context.WithTimeout(ctx, p.timeout)
defer cancel()

prometheusQueriesTotal.WithLabelValues(p.name, "/api/v1/query_range")
r := v1.Range{
Start: start,
End: end,
Expand All @@ -59,6 +60,7 @@ func (p *Prometheus) RangeQuery(ctx context.Context, expr string, start, end tim
Msg("Range query completed")
if err != nil {
log.Error().Err(err).Str("uri", p.uri).Str("query", expr).Msg("Range query failed")
prometheusQueryErrorsTotal.WithLabelValues(p.name, "/api/v1/query_range", errReason(err))
if delta, retryOK := canRetry(err, end.Sub(start)); retryOK {
if delta < step*2 {
log.Error().Str("uri", p.uri).Str("query", expr).Msg("No more retries possible")
Expand All @@ -82,6 +84,7 @@ func (p *Prometheus) RangeQuery(ctx context.Context, expr string, start, end tim
qr.Samples = samples
default:
log.Error().Err(err).Str("uri", p.uri).Str("query", expr).Msgf("Range query returned unknown result type: %v", result.Type())
prometheusQueryErrorsTotal.WithLabelValues(p.name, "/api/v1/query_range", "unknown result type")
return nil, fmt.Errorf("unknown result type: %v", result.Type())
}
log.Debug().Str("uri", p.uri).Str("query", expr).Int("samples", len(qr.Samples)).Msg("Parsed range response")
Expand Down

0 comments on commit a10c5b4

Please sign in to comment.