Stateless ruler restores alert state #5230

Merged 4 commits on Nov 11, 2022
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -14,6 +14,7 @@ We use *breaking :warning:* to mark changes that are not backward compatible (re

- [#5844](https://github.com/thanos-io/thanos/pull/5844) Query Frontend: Fixes @ modifier time range when splitting queries by interval.
- [#5854](https://github.com/thanos-io/thanos/pull/5854) Query Frontend: Handles `lookback_delta` param in query frontend.
- [#5230](https://github.com/thanos-io/thanos/pull/5230) Rule: Stateless ruler supports restoring `for` state from query API servers. The query API servers should be able to access the remote write storage.

### Added

3 changes: 3 additions & 0 deletions cmd/thanos/config.go
@@ -182,6 +182,7 @@ type queryConfig struct {
dnsSDInterval time.Duration
httpMethod string
dnsSDResolver string
step time.Duration
}

func (qc *queryConfig) registerFlag(cmd extkingpin.FlagClause) *queryConfig {
@@ -198,6 +199,8 @@ func (qc *queryConfig) registerFlag(cmd extkingpin.FlagClause) *queryConfig {
Default("POST").EnumVar(&qc.httpMethod, "GET", "POST")
cmd.Flag("query.sd-dns-resolver", "Resolver to use. Possible options: [golang, miekgdns]").
Default("golang").Hidden().StringVar(&qc.dnsSDResolver)
cmd.Flag("query.default-step", "Default range query step to use. This is only used in stateless Ruler and alert state restoration.").
@GiedriusS (Member) commented on Jul 11, 2022:

Why is this so small by default? 🤔

@yeya24 (Contributor, Author) replied on Jul 12, 2022:

I am not sure whether we want to expose this to users or not. The step can be calculated simply as max((maxt - mint) / 250, 1s), but in the E2E tests we might still want to modify it. (A sketch of this calculation follows this file's diff.)

For the default value of 1s, I just reuse the default query step set in query.go. Do you have a suggested value?

Default("1s").DurationVar(&qc.step)
return qc
}

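A minimal sketch of the step calculation mentioned in the review thread above, assuming millisecond timestamps as used by the range query API; the helper name is illustrative and the 250-point budget comes from the comment, not from this diff's code:

```go
// Sketch only: derive a range-query step from the query window instead of a
// fixed default, per max((maxt - mint) / 250, 1s) from the review comment.
package main

import (
	"fmt"
	"time"
)

// defaultStep spreads the range over at most ~250 points, with a 1s floor.
func defaultStep(mintMillis, maxtMillis int64) time.Duration {
	step := time.Duration((maxtMillis-mintMillis)/250) * time.Millisecond
	if step < time.Second {
		step = time.Second
	}
	return step
}

func main() {
	now := time.Now()
	mint := now.Add(-time.Hour).UnixMilli()
	fmt.Println(defaultStep(mint, now.UnixMilli())) // ~14.4s for a 1h window
}
```
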
71 changes: 35 additions & 36 deletions cmd/thanos/rule.go
@@ -85,12 +85,15 @@ type ruleConfig struct {

rwConfig *extflag.PathOrContent

resendDelay time.Duration
evalInterval time.Duration
ruleFiles []string
objStoreConfig *extflag.PathOrContent
dataDir string
lset labels.Labels
resendDelay time.Duration
evalInterval time.Duration
outageTolerance time.Duration
forGracePeriod time.Duration
ruleFiles []string
objStoreConfig *extflag.PathOrContent
dataDir string
lset labels.Labels
ignoredLabelNames []string
}

func (rc *ruleConfig) registerFlag(cmd extkingpin.FlagClause) {
@@ -126,6 +129,12 @@ func registerRule(app *extkingpin.App) {
Default("1m").DurationVar(&conf.resendDelay)
cmd.Flag("eval-interval", "The default evaluation interval to use.").
Default("1m").DurationVar(&conf.evalInterval)
cmd.Flag("for-outage-tolerance", "Max time to tolerate prometheus outage for restoring \"for\" state of alert.").
Default("1h").DurationVar(&conf.outageTolerance)
cmd.Flag("for-grace-period", "Minimum duration between alert and restored \"for\" state. This is maintained only for alerts with configured \"for\" time greater than grace period.").
Default("10m").DurationVar(&conf.forGracePeriod)
cmd.Flag("restore-ignored-label", "Label names to be ignored when restoring alerts from the remote storage. This is only used in stateless mode.").
StringsVar(&conf.ignoredLabelNames)

conf.rwConfig = extflag.RegisterPathOrContent(cmd, "remote-write.config", "YAML config for the remote-write configurations, that specify servers where samples should be sent to (see https://prometheus.io/docs/prometheus/latest/configuration/configuration/#remote_write). This automatically enables stateless mode for ruler and no series will be stored in the ruler's TSDB. If an empty config (or file) is provided, the flag is ignored and ruler is run with its own TSDB.", extflag.WithEnvSubstitution())

@@ -321,7 +330,10 @@ func runRule(
extprom.WrapRegistererWithPrefix("thanos_rule_query_apis_", reg),
dns.ResolverType(conf.query.dnsSDResolver),
)
var queryClients []*httpconfig.Client
var (
queryClients []*httpconfig.Client
promClients []*promclient.Client
)
queryClientMetrics := extpromhttp.NewClientMetrics(extprom.WrapRegistererWith(prometheus.Labels{"client": "query"}, reg))
for _, cfg := range queryCfg {
cfg.HTTPClientConfig.ClientMetrics = queryClientMetrics
Expand All @@ -335,6 +347,7 @@ func runRule(
return err
}
queryClients = append(queryClients, queryClient)
promClients = append(promClients, promclient.NewClient(queryClient, logger, "thanos-rule"))
// Discover and resolve query addresses.
addDiscoveryGroups(g, queryClient, conf.query.dnsSDInterval)
}
@@ -377,7 +390,10 @@ func runRule(
}
fanoutStore := storage.NewFanout(logger, agentDB, remoteStore)
appendable = fanoutStore
queryable = fanoutStore
// Use a separate queryable to restore the ALERTS firing states.
// We cannot use remoteStore directly because it uses remote read for
// query. However, remote read is not implemented in Thanos Receiver.
queryable = thanosrules.NewPromClientsQueryable(logger, queryClients, promClients, conf.query.httpMethod, conf.query.step, conf.ignoredLabelNames)
} else {
tsdbDB, err = tsdb.Open(conf.dataDir, log.With(logger, "component", "tsdb"), reg, tsdbOpts, nil)
if err != nil {
@@ -495,14 +511,16 @@ func runRule(
reg,
conf.dataDir,
rules.ManagerOptions{
NotifyFunc: notifyFunc,
Logger: logger,
Appendable: appendable,
ExternalURL: nil,
Queryable: queryable,
ResendDelay: conf.resendDelay,
NotifyFunc: notifyFunc,
Logger: logger,
Appendable: appendable,
ExternalURL: nil,
Queryable: queryable,
ResendDelay: conf.resendDelay,
OutageTolerance: conf.outageTolerance,
ForGracePeriod: conf.forGracePeriod,
},
queryFuncCreator(logger, queryClients, metrics.duplicatedQuery, metrics.ruleEvalWarnings, conf.query.httpMethod),
queryFuncCreator(logger, queryClients, promClients, metrics.duplicatedQuery, metrics.ruleEvalWarnings, conf.query.httpMethod),
conf.lset,
// In our case the querying URL is the external URL because in Prometheus
// --web.external-url points to it i.e. it points at something where the user
@@ -774,24 +792,10 @@ func labelsTSDBToProm(lset labels.Labels) (res labels.Labels) {
return res
}

func removeDuplicateQueryEndpoints(logger log.Logger, duplicatedQueriers prometheus.Counter, urls []*url.URL) []*url.URL {
set := make(map[string]struct{})
deduplicated := make([]*url.URL, 0, len(urls))
for _, u := range urls {
if _, ok := set[u.String()]; ok {
level.Warn(logger).Log("msg", "duplicate query address is provided", "addr", u.String())
duplicatedQueriers.Inc()
continue
}
deduplicated = append(deduplicated, u)
set[u.String()] = struct{}{}
}
return deduplicated
}

func queryFuncCreator(
logger log.Logger,
queriers []*httpconfig.Client,
promClients []*promclient.Client,
duplicatedQuery prometheus.Counter,
ruleEvalWarnings *prometheus.CounterVec,
httpMethod string,
@@ -812,15 +816,10 @@ func queryFuncCreator(
panic(errors.Errorf("unknown partial response strategy %v", partialResponseStrategy).Error())
}

promClients := make([]*promclient.Client, 0, len(queriers))
for _, q := range queriers {
promClients = append(promClients, promclient.NewClient(q, logger, "thanos-rule"))
}

return func(ctx context.Context, q string, t time.Time) (promql.Vector, error) {
for _, i := range rand.Perm(len(queriers)) {
promClient := promClients[i]
endpoints := removeDuplicateQueryEndpoints(logger, duplicatedQuery, queriers[i].Endpoints())
endpoints := thanosrules.RemoveDuplicateQueryEndpoints(logger, duplicatedQuery, queriers[i].Endpoints())
for _, i := range rand.Perm(len(endpoints)) {
span, ctx := tracing.StartSpan(ctx, spanID)
v, warns, err := promClient.PromqlQueryInstant(ctx, endpoints[i], q, t, promclient.QueryOptions{
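The comment in runRule above explains that a separate queryable is needed so the rule manager can read alert series back during restoration. As a hedged sketch of how that queryable is expected to be exercised, the snippet below selects the ALERTS_FOR_STATE series for one alert over the outage-tolerance window; the series name and window follow Prometheus's restore logic, and the function name is illustrative rather than part of this PR:

```go
// Sketch only: roughly how the rule manager uses a storage.Queryable (such as
// the new promClientsQueryable) when restoring "for" state after a restart.
package main

import (
	"context"
	"fmt"
	"time"

	"github.com/prometheus/prometheus/model/labels"
	"github.com/prometheus/prometheus/storage"
)

// restoreForState reads back the ALERTS_FOR_STATE series for one alert so the
// manager can recompute how long the alert has already been pending.
func restoreForState(ctx context.Context, q storage.Queryable, alertname string, outageTolerance time.Duration) error {
	now := time.Now()
	querier, err := q.Querier(ctx, now.Add(-outageTolerance).UnixMilli(), now.UnixMilli())
	if err != nil {
		return err
	}
	defer querier.Close()

	set := querier.Select(false, nil,
		labels.MustNewMatcher(labels.MatchEqual, labels.MetricName, "ALERTS_FOR_STATE"),
		labels.MustNewMatcher(labels.MatchEqual, "alertname", alertname),
	)
	for set.Next() {
		fmt.Println("restored series:", set.At().Labels())
	}
	return set.Err()
}
```
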
13 changes: 13 additions & 0 deletions docs/components/rule.md
@@ -311,6 +311,12 @@ Flags:
prefix for the regular Alertmanager API path.
--data-dir="data/" data directory
--eval-interval=1m The default evaluation interval to use.
--for-grace-period=10m Minimum duration between alert and restored
"for" state. This is maintained only for alerts
with configured "for" time greater than grace
period.
--for-outage-tolerance=1h Max time to tolerate prometheus outage for
restoring "for" state of alert.
--grpc-address="0.0.0.0:10901"
Listen ip:port address for gRPC endpoints
(StoreAPI). Make sure this address is routable
@@ -385,6 +391,9 @@ Flags:
https://thanos.io/tip/components/rule.md/#configuration.
If defined, it takes precedence over the
'--query' and '--query.sd-files' flags.
--query.default-step=1s Default range query step to use. This is
only used in stateless Ruler and alert state
restoration.
--query.http-method=POST HTTP method to use when sending queries.
Possible options: [GET, POST]
--query.sd-dns-interval=30s
@@ -429,6 +438,10 @@ Flags:
https://thanos.io/tip/thanos/logging.md/#configuration
--resend-delay=1m Minimum amount of time to wait before resending
an alert to Alertmanager.
--restore-ignored-label=RESTORE-IGNORED-LABEL ...
Label names to be ignored when restoring alerts
from the remote storage. This is only used in
stateless mode.
--rule-file=rules/ ... Rule files that should be used by rule
manager. Can be in glob format (repeated).
Note that rules are not automatically detected,
23 changes: 23 additions & 0 deletions pkg/promclient/promclient.go
@@ -780,6 +780,29 @@ func (c *Client) RulesInGRPC(ctx context.Context, base *url.URL, typeRules strin
return m.Data.Groups, nil
}

// AlertsInGRPC returns the alerts from the Prometheus alerts API. It uses gRPC errors.
// NOTE: This method is tested in pkg/store/prometheus_test.go against Prometheus.
func (c *Client) AlertsInGRPC(ctx context.Context, base *url.URL) ([]*rulespb.AlertInstance, error) {
u := *base
u.Path = path.Join(u.Path, "/api/v1/alerts")

var m struct {
Data struct {
Alerts []*rulespb.AlertInstance `json:"alerts"`
} `json:"data"`
}

if err := c.get2xxResultWithGRPCErrors(ctx, "/prom_alerts HTTP[client]", &u, &m); err != nil {
return nil, err
}

// Prometheus does not support PartialResponseStrategy, and probably never will. Make it Abort by default.
for _, g := range m.Data.Alerts {
g.PartialResponseStrategy = storepb.PartialResponseStrategy_ABORT
}
return m.Data.Alerts, nil
}

// MetricMetadataInGRPC returns the metadata from Prometheus metric metadata API. It uses gRPC errors.
func (c *Client) MetricMetadataInGRPC(ctx context.Context, base *url.URL, metric string, limit int) (map[string][]metadatapb.Meta, error) {
u := *base
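As a small hedged usage sketch of the new AlertsInGRPC helper: given an already-constructed promclient.Client and a query API base URL (both assumed here, not shown in this diff), a caller could collect the names of active alerts like this:

```go
// Sketch only: list active alert names via the new AlertsInGRPC method.
// Constructing the *promclient.Client and the base URL is assumed elsewhere.
package main

import (
	"context"
	"net/url"

	"github.com/thanos-io/thanos/pkg/promclient"
)

func activeAlertNames(ctx context.Context, c *promclient.Client, base *url.URL) ([]string, error) {
	alerts, err := c.AlertsInGRPC(ctx, base)
	if err != nil {
		return nil, err
	}
	names := make([]string, 0, len(alerts))
	for _, a := range alerts {
		// Each AlertInstance carries its label set; "alertname" is the
		// conventional label holding the rule's name.
		for _, l := range a.Labels.Labels {
			if l.Name == "alertname" {
				names = append(names, l.Value)
			}
		}
	}
	return names, nil
}
```
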
150 changes: 150 additions & 0 deletions pkg/rules/queryable.go
@@ -0,0 +1,150 @@
// Copyright (c) The Thanos Authors.
// Licensed under the Apache License 2.0.

package rules

import (
"context"
"math/rand"
"net/url"
"strings"
"time"

"github.com/go-kit/log"
"github.com/go-kit/log/level"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
"github.com/prometheus/common/model"
"github.com/prometheus/prometheus/model/labels"
"github.com/prometheus/prometheus/storage"

"github.com/thanos-io/thanos/internal/cortex/querier/series"
"github.com/thanos-io/thanos/pkg/httpconfig"
"github.com/thanos-io/thanos/pkg/promclient"
"github.com/thanos-io/thanos/pkg/store/storepb"
)

type promClientsQueryable struct {
httpMethod string
step time.Duration

logger log.Logger
promClients []*promclient.Client
queryClients []*httpconfig.Client
ignoredLabelNames []string

duplicatedQuery prometheus.Counter
}
type promClientsQuerier struct {
ctx context.Context
mint, maxt int64
step int64
httpMethod string

logger log.Logger
promClients []*promclient.Client
queryClients []*httpconfig.Client
restoreIgnoreLabels []string
@GiedriusS (Member) commented on Jul 11, 2022:

Perhaps we could make this a bit more generic and rename it to ignoredLabelNames?

@yeya24 (Contributor, Author) replied:

Updated.

// We use a dummy counter here because the duplicated
// addresses are already tracked by the rule evaluation part.
duplicatedQuery prometheus.Counter
}

// NewPromClientsQueryable creates a queryable that queries queriers from Prometheus clients.
func NewPromClientsQueryable(logger log.Logger, queryClients []*httpconfig.Client, promClients []*promclient.Client,
httpMethod string, step time.Duration, ignoredLabelNames []string) *promClientsQueryable {
return &promClientsQueryable{
logger: logger,
queryClients: queryClients,
promClients: promClients,
duplicatedQuery: promauto.With(nil).NewCounter(prometheus.CounterOpts{}),
httpMethod: httpMethod,
step: step,
ignoredLabelNames: ignoredLabelNames,
}
}

// Querier returns a new Querier for the given time range.
func (q *promClientsQueryable) Querier(ctx context.Context, mint, maxt int64) (storage.Querier, error) {
return &promClientsQuerier{
ctx: ctx,
mint: mint,
maxt: maxt,
step: int64(q.step / time.Second),
httpMethod: q.httpMethod,
logger: q.logger,
queryClients: q.queryClients,
promClients: q.promClients,
restoreIgnoreLabels: q.ignoredLabelNames,
}, nil
}

// Select implements storage.Querier interface.
func (q *promClientsQuerier) Select(_ bool, _ *storage.SelectHints, matchers ...*labels.Matcher) storage.SeriesSet {
query := storepb.PromMatchersToString(matchers...)

for _, i := range rand.Perm(len(q.queryClients)) {
promClient := q.promClients[i]
endpoints := RemoveDuplicateQueryEndpoints(q.logger, q.duplicatedQuery, q.queryClients[i].Endpoints())
for _, i := range rand.Perm(len(endpoints)) {
m, warns, err := promClient.QueryRange(q.ctx, endpoints[i], query, q.mint, q.maxt, q.step, promclient.QueryOptions{
Deduplicate: true,
Method: q.httpMethod,
})

if err != nil {
level.Error(q.logger).Log("err", err, "query", query)
continue
}
if len(warns) > 0 {
level.Warn(q.logger).Log("warnings", strings.Join(warns, ", "), "query", query)
}
matrix := make([]*model.SampleStream, 0, m.Len())
for _, metric := range m {
for _, label := range q.restoreIgnoreLabels {
delete(metric.Metric, model.LabelName(label))
}

matrix = append(matrix, &model.SampleStream{
Metric: metric.Metric,
Values: metric.Values,
})
}

return series.MatrixToSeriesSet(matrix)
}
}
return storage.NoopSeriesSet()
}

// LabelValues implements storage.LabelQuerier interface.
func (q *promClientsQuerier) LabelValues(name string, matchers ...*labels.Matcher) ([]string, storage.Warnings, error) {
return nil, nil, nil
}

// LabelNames implements storage.LabelQuerier interface.
func (q *promClientsQuerier) LabelNames(matchers ...*labels.Matcher) ([]string, storage.Warnings, error) {
return nil, nil, nil
}

// Close implements storage.LabelQuerier interface.
func (q *promClientsQuerier) Close() error {
return nil
}

// RemoveDuplicateQueryEndpoints removes duplicate endpoints from the list of urls.
func RemoveDuplicateQueryEndpoints(logger log.Logger, duplicatedQueriers prometheus.Counter, urls []*url.URL) []*url.URL {
set := make(map[string]struct{})
deduplicated := make([]*url.URL, 0, len(urls))
for _, u := range urls {
if _, ok := set[u.String()]; ok {
level.Warn(logger).Log("msg", "duplicate query address is provided", "addr", u.String())
duplicatedQueriers.Inc()
continue
}
deduplicated = append(deduplicated, u)
set[u.String()] = struct{}{}
}
return deduplicated
}
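
A short hedged usage sketch of the now-exported RemoveDuplicateQueryEndpoints helper, showing that repeated query addresses are dropped and counted; the counter name and URLs here are illustrative:

```go
// Sketch only: duplicate query endpoints are removed and counted once each.
package main

import (
	"fmt"
	"net/url"

	"github.com/go-kit/log"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"

	thanosrules "github.com/thanos-io/thanos/pkg/rules"
)

func main() {
	mustParse := func(s string) *url.URL {
		u, err := url.Parse(s)
		if err != nil {
			panic(err)
		}
		return u
	}

	urls := []*url.URL{
		mustParse("http://query-a:9090"),
		mustParse("http://query-a:9090"), // duplicate, will be dropped
		mustParse("http://query-b:9090"),
	}
	dup := promauto.With(nil).NewCounter(prometheus.CounterOpts{Name: "duplicated_query_addresses_total"})

	deduped := thanosrules.RemoveDuplicateQueryEndpoints(log.NewNopLogger(), dup, urls)
	fmt.Println(len(deduped)) // 2
}
```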