From 7d8f413b0183a2dbbaf95bed01e8a901032f4640 Mon Sep 17 00:00:00 2001 From: arthur yang Date: Sat, 7 Dec 2019 22:51:11 +0800 Subject: [PATCH 01/18] returns error messages when trigger reload with http Signed-off-by: arthur yang --- cmd/thanos/rule.go | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/cmd/thanos/rule.go b/cmd/thanos/rule.go index b275ba8b36..19eb97b5f9 100644 --- a/cmd/thanos/rule.go +++ b/cmd/thanos/rule.go @@ -472,13 +472,14 @@ func runRule( } // Handle reload and termination interrupts. - reload := make(chan struct{}, 1) + reload := make(chan chan error, 1) { cancel := make(chan struct{}) - reload <- struct{}{} // Initial reload. + reload <- make(chan error, 1) // Initial reload. g.Add(func() error { for { + var rc chan error select { case <-cancel: return errors.New("canceled") @@ -492,6 +493,7 @@ func runRule( fs, err := filepath.Glob(pat) if err != nil { // The only error can be a bad pattern. + rc <- errors.Wrapf(err, "retrieving rule files failed. Ignoring file. pattern %s", pat) level.Error(logger).Log("msg", "retrieving rule files failed. Ignoring file.", "pattern", pat, "err", err) continue } @@ -503,6 +505,7 @@ func runRule( if err := ruleMgr.Update(evalInterval, files); err != nil { configSuccess.Set(0) + rc <- errors.Wrap(err, "reloading rules failed") level.Error(logger).Log("msg", "reloading rules failed", "err", err) continue } @@ -514,7 +517,7 @@ func runRule( for _, group := range ruleMgr.RuleGroups() { rulesLoaded.WithLabelValues(group.PartialResponseStrategy.String(), group.File(), group.Name()).Set(float64(len(group.Rules()))) } - + rc <- nil } }, func(error) { close(cancel) @@ -564,7 +567,11 @@ func runRule( } router.WithPrefix(webRoutePrefix).Post("/-/reload", func(w http.ResponseWriter, r *http.Request) { - reload <- struct{}{} + rc := make(chan error) + reload <- rc + if err := <-rc; err != nil { + http.Error(w, err.Error(), http.StatusInternalServerError) + } }) flagsMap := map[string]string{ From 3dea587d4ea4370703cb3c79f66db4dc1843d152 Mon Sep 17 00:00:00 2001 From: "yapo.yang" Date: Tue, 10 Dec 2019 14:11:57 +0800 Subject: [PATCH 02/18] use simple reloadRules function instead of magic chan error error Signed-off-by: yapo.yang --- cmd/thanos/rule.go | 100 ++++++++++++++++++++++++++------------------- 1 file changed, 59 insertions(+), 41 deletions(-) diff --git a/cmd/thanos/rule.go b/cmd/thanos/rule.go index 19eb97b5f9..84c9914bb7 100644 --- a/cmd/thanos/rule.go +++ b/cmd/thanos/rule.go @@ -472,52 +472,27 @@ func runRule( } // Handle reload and termination interrupts. - reload := make(chan chan error, 1) { cancel := make(chan struct{}) - reload <- make(chan error, 1) // Initial reload. - g.Add(func() error { + + //initialize rules + if err := reloadRules(logger, ruleFiles, ruleMgr, evalInterval, configSuccess, configSuccessTime, rulesLoaded); err != nil { + level.Error(logger).Log("msg", "initialize rules failed", "err", err) + //returns when initialize with invalid pattern error + if _, ok := err.(*errInvalidPattern); ok { + return err + } + } for { - var rc chan error select { - case <-cancel: - return errors.New("canceled") - case <-reload: case <-reloadSignal: - } - - level.Debug(logger).Log("msg", "configured rule files", "files", strings.Join(ruleFiles, ",")) - var files []string - for _, pat := range ruleFiles { - fs, err := filepath.Glob(pat) - if err != nil { - // The only error can be a bad pattern. - rc <- errors.Wrapf(err, "retrieving rule files failed. Ignoring file. pattern %s", pat) - level.Error(logger).Log("msg", "retrieving rule files failed. Ignoring file.", "pattern", pat, "err", err) - continue + if err := reloadRules(logger, ruleFiles, ruleMgr, evalInterval, configSuccess, configSuccessTime, rulesLoaded); err != nil { + level.Error(logger).Log("msg", "reload rules by sighup failed", "err", err) } - - files = append(files, fs...) - } - - level.Info(logger).Log("msg", "reload rule files", "numFiles", len(files)) - - if err := ruleMgr.Update(evalInterval, files); err != nil { - configSuccess.Set(0) - rc <- errors.Wrap(err, "reloading rules failed") - level.Error(logger).Log("msg", "reloading rules failed", "err", err) - continue - } - - configSuccess.Set(1) - configSuccessTime.SetToCurrentTime() - - rulesLoaded.Reset() - for _, group := range ruleMgr.RuleGroups() { - rulesLoaded.WithLabelValues(group.PartialResponseStrategy.String(), group.File(), group.Name()).Set(float64(len(group.Rules()))) + case <-cancel: + return errors.New("canceled") } - rc <- nil } }, func(error) { close(cancel) @@ -567,9 +542,8 @@ func runRule( } router.WithPrefix(webRoutePrefix).Post("/-/reload", func(w http.ResponseWriter, r *http.Request) { - rc := make(chan error) - reload <- rc - if err := <-rc; err != nil { + if err := reloadRules(logger, ruleFiles, ruleMgr, evalInterval, configSuccess, configSuccessTime, rulesLoaded); err != nil { + level.Error(logger).Log("msg", "reload rules by webhandler failed", "err", err) http.Error(w, err.Error(), http.StatusInternalServerError) } }) @@ -765,3 +739,47 @@ func addDiscoveryGroups(g *run.Group, c *http_util.Client, interval time.Duratio cancel() }) } + +type errInvalidPattern struct { + err error + pat string +} + +func (e *errInvalidPattern) Error() string { + return errors.Wrapf(e.err, "retrieving rule files failed. Ignoring file. pattern %s", e.pat).Error() +} +func reloadRules(logger log.Logger, + ruleFiles []string, + ruleMgr *thanosrule.Manager, + evalInterval time.Duration, + configSuccess prometheus.Gauge, + configSuccessTime prometheus.Gauge, + rulesLoaded *prometheus.GaugeVec) error { + level.Debug(logger).Log("msg", "configured rule files", "files", strings.Join(ruleFiles, ",")) + var files []string + for _, pat := range ruleFiles { + fs, err := filepath.Glob(pat) + if err != nil { + //check errInvalidPattern when initialize + return &errInvalidPattern{err, pat} + } + + files = append(files, fs...) + } + + level.Info(logger).Log("msg", "reload rule files", "numFiles", len(files)) + + if err := ruleMgr.Update(evalInterval, files); err != nil { + configSuccess.Set(0) + return errors.Wrap(err, "reloading rules failed") + } + + configSuccess.Set(1) + configSuccessTime.Set(float64(time.Now().UnixNano()) / 1e9) + + rulesLoaded.Reset() + for _, group := range ruleMgr.RuleGroups() { + rulesLoaded.WithLabelValues(group.PartialResponseStrategy.String(), group.File(), group.Name()).Set(float64(len(group.Rules()))) + } + return nil +} From 3ae2be2103ebb35f3186c51c742dbe07ab6ae94c Mon Sep 17 00:00:00 2001 From: "yapo.yang" Date: Tue, 10 Dec 2019 15:28:31 +0800 Subject: [PATCH 03/18] add tailing period for comment Signed-off-by: yapo.yang --- cmd/thanos/rule.go | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/cmd/thanos/rule.go b/cmd/thanos/rule.go index 84c9914bb7..9cc1e4dd03 100644 --- a/cmd/thanos/rule.go +++ b/cmd/thanos/rule.go @@ -475,11 +475,10 @@ func runRule( { cancel := make(chan struct{}) g.Add(func() error { - - //initialize rules + //initialize rules. if err := reloadRules(logger, ruleFiles, ruleMgr, evalInterval, configSuccess, configSuccessTime, rulesLoaded); err != nil { level.Error(logger).Log("msg", "initialize rules failed", "err", err) - //returns when initialize with invalid pattern error + //returns when initialize with invalid pattern error. if _, ok := err.(*errInvalidPattern); ok { return err } @@ -760,7 +759,7 @@ func reloadRules(logger log.Logger, for _, pat := range ruleFiles { fs, err := filepath.Glob(pat) if err != nil { - //check errInvalidPattern when initialize + //check errInvalidPattern when initialize. return &errInvalidPattern{err, pat} } From fef63bca4a93cf3e3eb163454283265484084e21 Mon Sep 17 00:00:00 2001 From: arthur yang Date: Fri, 21 Feb 2020 13:40:07 +0800 Subject: [PATCH 04/18] fix comment Signed-off-by: arthur yang --- cmd/thanos/rule.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cmd/thanos/rule.go b/cmd/thanos/rule.go index 9cc1e4dd03..1cb1773fb1 100644 --- a/cmd/thanos/rule.go +++ b/cmd/thanos/rule.go @@ -475,10 +475,10 @@ func runRule( { cancel := make(chan struct{}) g.Add(func() error { - //initialize rules. + // Initialize rules. if err := reloadRules(logger, ruleFiles, ruleMgr, evalInterval, configSuccess, configSuccessTime, rulesLoaded); err != nil { level.Error(logger).Log("msg", "initialize rules failed", "err", err) - //returns when initialize with invalid pattern error. + // Return when initialize with invalid pattern error. if _, ok := err.(*errInvalidPattern); ok { return err } @@ -759,7 +759,7 @@ func reloadRules(logger log.Logger, for _, pat := range ruleFiles { fs, err := filepath.Glob(pat) if err != nil { - //check errInvalidPattern when initialize. + // Check errInvalidPattern when initialize. return &errInvalidPattern{err, pat} } From 44f0e05f143580aea66d85c083625301fd97dd7e Mon Sep 17 00:00:00 2001 From: arthur yang Date: Fri, 21 Feb 2020 21:47:38 +0800 Subject: [PATCH 05/18] add white space for better code reading Signed-off-by: arthur yang --- cmd/thanos/rule.go | 1 + 1 file changed, 1 insertion(+) diff --git a/cmd/thanos/rule.go b/cmd/thanos/rule.go index 1cb1773fb1..993416b12c 100644 --- a/cmd/thanos/rule.go +++ b/cmd/thanos/rule.go @@ -747,6 +747,7 @@ type errInvalidPattern struct { func (e *errInvalidPattern) Error() string { return errors.Wrapf(e.err, "retrieving rule files failed. Ignoring file. pattern %s", e.pat).Error() } + func reloadRules(logger log.Logger, ruleFiles []string, ruleMgr *thanosrule.Manager, From 732452880dad6176a6ee51e5ddb4cd479b27fef8 Mon Sep 17 00:00:00 2001 From: arthur yang Date: Sat, 22 Feb 2020 10:02:05 +0800 Subject: [PATCH 06/18] collect thanos rule metrics into one struct Signed-off-by: arthur yang --- cmd/thanos/rule.go | 105 ++++++++++++++++++++++++++------------------- 1 file changed, 60 insertions(+), 45 deletions(-) diff --git a/cmd/thanos/rule.go b/cmd/thanos/rule.go index 993416b12c..9f3b372c27 100644 --- a/cmd/thanos/rule.go +++ b/cmd/thanos/rule.go @@ -200,6 +200,55 @@ func registerRule(m map[string]setupFunc, app *kingpin.Application) { } } +// RuleMetrics defines thanos rule metrics. +type RuleMetrics struct { + configSuccess prometheus.Gauge + configSuccessTime prometheus.Gauge + duplicatedQuery prometheus.Counter + rulesLoaded *prometheus.GaugeVec + ruleEvalWarnings *prometheus.CounterVec +} + +func newRuleMetrics(reg *prometheus.Registry) *RuleMetrics { + m := new(RuleMetrics) + + m.configSuccess = prometheus.NewGauge(prometheus.GaugeOpts{ + Name: "thanos_rule_config_last_reload_successful", + Help: "Whether the last configuration reload attempt was successful.", + }) + m.configSuccessTime = prometheus.NewGauge(prometheus.GaugeOpts{ + Name: "thanos_rule_config_last_reload_success_timestamp_seconds", + Help: "Timestamp of the last successful configuration reload.", + }) + m.duplicatedQuery = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "thanos_rule_duplicated_query_addresses_total", + Help: "The number of times a duplicated query addresses is detected from the different configs in rule", + }) + m.rulesLoaded = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "thanos_rule_loaded_rules", + Help: "Loaded rules partitioned by file and group", + }, + []string{"strategy", "file", "group"}, + ) + m.ruleEvalWarnings = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "thanos_rule_evaluation_with_warnings_total", + Help: "The total number of rule evaluation that were successful but had warnings which can indicate partial error.", + }, []string{"strategy"}, + ) + m.ruleEvalWarnings.WithLabelValues(strings.ToLower(storepb.PartialResponseStrategy_ABORT.String())) + m.ruleEvalWarnings.WithLabelValues(strings.ToLower(storepb.PartialResponseStrategy_WARN.String())) + + reg.MustRegister(m.configSuccess) + reg.MustRegister(m.configSuccessTime) + reg.MustRegister(m.duplicatedQuery) + reg.MustRegister(m.rulesLoaded) + reg.MustRegister(m.ruleEvalWarnings) + + return m +} + // runRule runs a rule evaluation component that continuously evaluates alerting and recording // rules. It sends alert notifications and writes TSDB data for results like a regular Prometheus server. func runRule( @@ -239,39 +288,7 @@ func runRule( dnsSDResolver string, comp component.Component, ) error { - configSuccess := prometheus.NewGauge(prometheus.GaugeOpts{ - Name: "thanos_rule_config_last_reload_successful", - Help: "Whether the last configuration reload attempt was successful.", - }) - configSuccessTime := prometheus.NewGauge(prometheus.GaugeOpts{ - Name: "thanos_rule_config_last_reload_success_timestamp_seconds", - Help: "Timestamp of the last successful configuration reload.", - }) - duplicatedQuery := prometheus.NewCounter(prometheus.CounterOpts{ - Name: "thanos_rule_duplicated_query_addresses_total", - Help: "The number of times a duplicated query addresses is detected from the different configs in rule", - }) - rulesLoaded := prometheus.NewGaugeVec( - prometheus.GaugeOpts{ - Name: "thanos_rule_loaded_rules", - Help: "Loaded rules partitioned by file and group", - }, - []string{"strategy", "file", "group"}, - ) - ruleEvalWarnings := prometheus.NewCounterVec( - prometheus.CounterOpts{ - Name: "thanos_rule_evaluation_with_warnings_total", - Help: "The total number of rule evaluation that were successful but had warnings which can indicate partial error.", - }, []string{"strategy"}, - ) - ruleEvalWarnings.WithLabelValues(strings.ToLower(storepb.PartialResponseStrategy_ABORT.String())) - ruleEvalWarnings.WithLabelValues(strings.ToLower(storepb.PartialResponseStrategy_WARN.String())) - - reg.MustRegister(configSuccess) - reg.MustRegister(configSuccessTime) - reg.MustRegister(duplicatedQuery) - reg.MustRegister(rulesLoaded) - reg.MustRegister(ruleEvalWarnings) + metrics := newRuleMetrics(reg) var queryCfg []query.Config if len(queryConfigYAML) > 0 { @@ -435,7 +452,7 @@ func runRule( opts := opts opts.Registerer = extprom.WrapRegistererWith(prometheus.Labels{"strategy": strings.ToLower(s.String())}, reg) opts.Context = ctx - opts.QueryFunc = queryFunc(logger, queryClients, duplicatedQuery, ruleEvalWarnings, s) + opts.QueryFunc = queryFunc(logger, queryClients, metrics.duplicatedQuery, metrics.ruleEvalWarnings, s) mgr := rules.NewManager(&opts) ruleMgr.SetRuleManager(s, mgr) @@ -476,7 +493,7 @@ func runRule( cancel := make(chan struct{}) g.Add(func() error { // Initialize rules. - if err := reloadRules(logger, ruleFiles, ruleMgr, evalInterval, configSuccess, configSuccessTime, rulesLoaded); err != nil { + if err := reloadRules(logger, ruleFiles, ruleMgr, evalInterval, metrics); err != nil { level.Error(logger).Log("msg", "initialize rules failed", "err", err) // Return when initialize with invalid pattern error. if _, ok := err.(*errInvalidPattern); ok { @@ -486,7 +503,7 @@ func runRule( for { select { case <-reloadSignal: - if err := reloadRules(logger, ruleFiles, ruleMgr, evalInterval, configSuccess, configSuccessTime, rulesLoaded); err != nil { + if err := reloadRules(logger, ruleFiles, ruleMgr, evalInterval, metrics); err != nil { level.Error(logger).Log("msg", "reload rules by sighup failed", "err", err) } case <-cancel: @@ -541,7 +558,7 @@ func runRule( } router.WithPrefix(webRoutePrefix).Post("/-/reload", func(w http.ResponseWriter, r *http.Request) { - if err := reloadRules(logger, ruleFiles, ruleMgr, evalInterval, configSuccess, configSuccessTime, rulesLoaded); err != nil { + if err := reloadRules(logger, ruleFiles, ruleMgr, evalInterval, metrics); err != nil { level.Error(logger).Log("msg", "reload rules by webhandler failed", "err", err) http.Error(w, err.Error(), http.StatusInternalServerError) } @@ -752,9 +769,7 @@ func reloadRules(logger log.Logger, ruleFiles []string, ruleMgr *thanosrule.Manager, evalInterval time.Duration, - configSuccess prometheus.Gauge, - configSuccessTime prometheus.Gauge, - rulesLoaded *prometheus.GaugeVec) error { + metrics *RuleMetrics) error { level.Debug(logger).Log("msg", "configured rule files", "files", strings.Join(ruleFiles, ",")) var files []string for _, pat := range ruleFiles { @@ -770,16 +785,16 @@ func reloadRules(logger log.Logger, level.Info(logger).Log("msg", "reload rule files", "numFiles", len(files)) if err := ruleMgr.Update(evalInterval, files); err != nil { - configSuccess.Set(0) + metrics.configSuccess.Set(0) return errors.Wrap(err, "reloading rules failed") } - configSuccess.Set(1) - configSuccessTime.Set(float64(time.Now().UnixNano()) / 1e9) + metrics.configSuccess.Set(1) + metrics.configSuccessTime.Set(float64(time.Now().UnixNano()) / 1e9) - rulesLoaded.Reset() + metrics.rulesLoaded.Reset() for _, group := range ruleMgr.RuleGroups() { - rulesLoaded.WithLabelValues(group.PartialResponseStrategy.String(), group.File(), group.Name()).Set(float64(len(group.Rules()))) + metrics.rulesLoaded.WithLabelValues(group.PartialResponseStrategy.String(), group.File(), group.Name()).Set(float64(len(group.Rules()))) } return nil } From 1f6f3f5716854708c6f4bdd77a63b13d9a5428fb Mon Sep 17 00:00:00 2001 From: arthur yang Date: Sun, 23 Feb 2020 10:14:43 +0800 Subject: [PATCH 07/18] remove termination logic and keep log only Signed-off-by: arthur yang --- cmd/thanos/rule.go | 28 ++++++++++------------------ 1 file changed, 10 insertions(+), 18 deletions(-) diff --git a/cmd/thanos/rule.go b/cmd/thanos/rule.go index 9f3b372c27..1ec769350f 100644 --- a/cmd/thanos/rule.go +++ b/cmd/thanos/rule.go @@ -26,6 +26,7 @@ import ( "github.com/prometheus/prometheus/promql" "github.com/prometheus/prometheus/rules" "github.com/prometheus/prometheus/storage/tsdb" + tsdberrors "github.com/prometheus/prometheus/tsdb/errors" "github.com/prometheus/prometheus/util/strutil" "github.com/thanos-io/thanos/pkg/alert" "github.com/thanos-io/thanos/pkg/block/metadata" @@ -495,10 +496,6 @@ func runRule( // Initialize rules. if err := reloadRules(logger, ruleFiles, ruleMgr, evalInterval, metrics); err != nil { level.Error(logger).Log("msg", "initialize rules failed", "err", err) - // Return when initialize with invalid pattern error. - if _, ok := err.(*errInvalidPattern); ok { - return err - } } for { select { @@ -756,27 +753,22 @@ func addDiscoveryGroups(g *run.Group, c *http_util.Client, interval time.Duratio }) } -type errInvalidPattern struct { - err error - pat string -} - -func (e *errInvalidPattern) Error() string { - return errors.Wrapf(e.err, "retrieving rule files failed. Ignoring file. pattern %s", e.pat).Error() -} - func reloadRules(logger log.Logger, ruleFiles []string, ruleMgr *thanosrule.Manager, evalInterval time.Duration, metrics *RuleMetrics) error { level.Debug(logger).Log("msg", "configured rule files", "files", strings.Join(ruleFiles, ",")) - var files []string + var ( + errs tsdberrors.MultiError + files []string + ) for _, pat := range ruleFiles { fs, err := filepath.Glob(pat) if err != nil { - // Check errInvalidPattern when initialize. - return &errInvalidPattern{err, pat} + // The only error can be a bad pattern. + errs.Add(errors.Wrapf(err, "retrieving rule files failed. Ignoring file. pattern %s", pat)) + continue } files = append(files, fs...) @@ -786,7 +778,7 @@ func reloadRules(logger log.Logger, if err := ruleMgr.Update(evalInterval, files); err != nil { metrics.configSuccess.Set(0) - return errors.Wrap(err, "reloading rules failed") + errs.Add(errors.Wrap(err, "reloading rules failed")) } metrics.configSuccess.Set(1) @@ -796,5 +788,5 @@ func reloadRules(logger log.Logger, for _, group := range ruleMgr.RuleGroups() { metrics.rulesLoaded.WithLabelValues(group.PartialResponseStrategy.String(), group.File(), group.Name()).Set(float64(len(group.Rules()))) } - return nil + return errs.Err() } From 9048e0c01d458bc0d3ac7427a340f55c590b58da Mon Sep 17 00:00:00 2001 From: arthur yang Date: Sun, 23 Feb 2020 10:51:05 +0800 Subject: [PATCH 08/18] update changelog for #1848 Signed-off-by: arthur yang --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index bc68770465..94f359a819 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -29,6 +29,7 @@ We use *breaking* word for marking changes that are not backward compatible (rel - [#2049](https://github.com/thanos-io/thanos/pull/2049) Tracing: Support sampling on Elastic APM with new sample_rate setting. - [#2008](https://github.com/thanos-io/thanos/pull/2008) Querier, Receiver, Sidecar, Store: Add gRPC [health check](https://github.com/grpc/grpc/blob/master/doc/health-checking.md) endpoints. - [#2145](https://github.com/thanos-io/thanos/pull/2145) Tracing: track query sent to prometheus via remote read api. +- [#1848](https://github.com/thanos-io/thanos/pull/1848) Ruler: Return error messages when trigger reload with http ### Changed From dbfca4e27f79ecb678afc80bdbe7179c1b54e846 Mon Sep 17 00:00:00 2001 From: arthur yang Date: Sun, 23 Feb 2020 10:52:53 +0800 Subject: [PATCH 09/18] add tailing period Signed-off-by: arthur yang --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 94f359a819..54817a27a0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -29,7 +29,7 @@ We use *breaking* word for marking changes that are not backward compatible (rel - [#2049](https://github.com/thanos-io/thanos/pull/2049) Tracing: Support sampling on Elastic APM with new sample_rate setting. - [#2008](https://github.com/thanos-io/thanos/pull/2008) Querier, Receiver, Sidecar, Store: Add gRPC [health check](https://github.com/grpc/grpc/blob/master/doc/health-checking.md) endpoints. - [#2145](https://github.com/thanos-io/thanos/pull/2145) Tracing: track query sent to prometheus via remote read api. -- [#1848](https://github.com/thanos-io/thanos/pull/1848) Ruler: Return error messages when trigger reload with http +- [#1848](https://github.com/thanos-io/thanos/pull/1848) Ruler: Return error messages when trigger reload with http. ### Changed From cd8d601c7663014b2357afda2d2cf26a754666d8 Mon Sep 17 00:00:00 2001 From: arthur yang Date: Tue, 25 Feb 2020 08:13:17 +0800 Subject: [PATCH 10/18] check whether registry is nil Signed-off-by: arthur yang --- cmd/thanos/rule.go | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/cmd/thanos/rule.go b/cmd/thanos/rule.go index 1ec769350f..77bec2889e 100644 --- a/cmd/thanos/rule.go +++ b/cmd/thanos/rule.go @@ -240,13 +240,13 @@ func newRuleMetrics(reg *prometheus.Registry) *RuleMetrics { ) m.ruleEvalWarnings.WithLabelValues(strings.ToLower(storepb.PartialResponseStrategy_ABORT.String())) m.ruleEvalWarnings.WithLabelValues(strings.ToLower(storepb.PartialResponseStrategy_WARN.String())) - - reg.MustRegister(m.configSuccess) - reg.MustRegister(m.configSuccessTime) - reg.MustRegister(m.duplicatedQuery) - reg.MustRegister(m.rulesLoaded) - reg.MustRegister(m.ruleEvalWarnings) - + if reg != nil { + reg.MustRegister(m.configSuccess) + reg.MustRegister(m.configSuccessTime) + reg.MustRegister(m.duplicatedQuery) + reg.MustRegister(m.rulesLoaded) + reg.MustRegister(m.ruleEvalWarnings) + } return m } From 084682f5de65cbb32ec5b17ba4aa9b7948777b54 Mon Sep 17 00:00:00 2001 From: arthur yang Date: Fri, 28 Feb 2020 08:44:04 +0800 Subject: [PATCH 11/18] tailing period in metrics Signed-off-by: arthur yang --- cmd/thanos/rule.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmd/thanos/rule.go b/cmd/thanos/rule.go index 77bec2889e..7de2cef013 100644 --- a/cmd/thanos/rule.go +++ b/cmd/thanos/rule.go @@ -223,12 +223,12 @@ func newRuleMetrics(reg *prometheus.Registry) *RuleMetrics { }) m.duplicatedQuery = prometheus.NewCounter(prometheus.CounterOpts{ Name: "thanos_rule_duplicated_query_addresses_total", - Help: "The number of times a duplicated query addresses is detected from the different configs in rule", + Help: "The number of times a duplicated query addresses is detected from the different configs in rule.", }) m.rulesLoaded = prometheus.NewGaugeVec( prometheus.GaugeOpts{ Name: "thanos_rule_loaded_rules", - Help: "Loaded rules partitioned by file and group", + Help: "Loaded rules partitioned by file and group.", }, []string{"strategy", "file", "group"}, ) From 9583043c275fd6a7355da8dad8d92486b19982d4 Mon Sep 17 00:00:00 2001 From: arthur yang Date: Fri, 28 Feb 2020 08:53:45 +0800 Subject: [PATCH 12/18] cancel with context Signed-off-by: arthur yang --- cmd/thanos/rule.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cmd/thanos/rule.go b/cmd/thanos/rule.go index 7de2cef013..b92b3cb9dc 100644 --- a/cmd/thanos/rule.go +++ b/cmd/thanos/rule.go @@ -491,7 +491,7 @@ func runRule( // Handle reload and termination interrupts. { - cancel := make(chan struct{}) + ctx, cancel := context.WithCancel(context.Background()) g.Add(func() error { // Initialize rules. if err := reloadRules(logger, ruleFiles, ruleMgr, evalInterval, metrics); err != nil { @@ -503,12 +503,12 @@ func runRule( if err := reloadRules(logger, ruleFiles, ruleMgr, evalInterval, metrics); err != nil { level.Error(logger).Log("msg", "reload rules by sighup failed", "err", err) } - case <-cancel: + case <-ctx.Done(): return errors.New("canceled") } } }, func(error) { - close(cancel) + cancel() }) } From daca66163c07e19ef67f3461349c265462c8d3f2 Mon Sep 17 00:00:00 2001 From: arthur yang Date: Fri, 28 Feb 2020 09:32:30 +0800 Subject: [PATCH 13/18] return ctx.Err() instead of errors.New Signed-off-by: arthur yang --- cmd/thanos/rule.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/thanos/rule.go b/cmd/thanos/rule.go index b92b3cb9dc..5268bf00a4 100644 --- a/cmd/thanos/rule.go +++ b/cmd/thanos/rule.go @@ -504,7 +504,7 @@ func runRule( level.Error(logger).Log("msg", "reload rules by sighup failed", "err", err) } case <-ctx.Done(): - return errors.New("canceled") + return ctx.Err() } } }, func(error) { From 6def41383f47da0e3f6fdc7fba440c54a3dce5ff Mon Sep 17 00:00:00 2001 From: arthur yang Date: Sun, 1 Mar 2020 09:52:11 +0800 Subject: [PATCH 14/18] register thanos rule metrics with promauto Signed-off-by: arthur yang --- cmd/thanos/rule.go | 20 ++++++++------------ go.mod | 2 +- go.sum | 2 ++ 3 files changed, 11 insertions(+), 13 deletions(-) diff --git a/cmd/thanos/rule.go b/cmd/thanos/rule.go index 5268bf00a4..3998f1604b 100644 --- a/cmd/thanos/rule.go +++ b/cmd/thanos/rule.go @@ -20,6 +20,7 @@ import ( "github.com/opentracing/opentracing-go" "github.com/pkg/errors" "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promauto" "github.com/prometheus/common/model" "github.com/prometheus/common/route" "github.com/prometheus/prometheus/pkg/labels" @@ -213,26 +214,27 @@ type RuleMetrics struct { func newRuleMetrics(reg *prometheus.Registry) *RuleMetrics { m := new(RuleMetrics) - m.configSuccess = prometheus.NewGauge(prometheus.GaugeOpts{ + factory := promauto.With(reg) + m.configSuccess = factory.NewGauge(prometheus.GaugeOpts{ Name: "thanos_rule_config_last_reload_successful", Help: "Whether the last configuration reload attempt was successful.", }) - m.configSuccessTime = prometheus.NewGauge(prometheus.GaugeOpts{ + m.configSuccessTime = factory.NewGauge(prometheus.GaugeOpts{ Name: "thanos_rule_config_last_reload_success_timestamp_seconds", Help: "Timestamp of the last successful configuration reload.", }) - m.duplicatedQuery = prometheus.NewCounter(prometheus.CounterOpts{ + m.duplicatedQuery = factory.NewCounter(prometheus.CounterOpts{ Name: "thanos_rule_duplicated_query_addresses_total", Help: "The number of times a duplicated query addresses is detected from the different configs in rule.", }) - m.rulesLoaded = prometheus.NewGaugeVec( + m.rulesLoaded = factory.NewGaugeVec( prometheus.GaugeOpts{ Name: "thanos_rule_loaded_rules", Help: "Loaded rules partitioned by file and group.", }, []string{"strategy", "file", "group"}, ) - m.ruleEvalWarnings = prometheus.NewCounterVec( + m.ruleEvalWarnings = factory.NewCounterVec( prometheus.CounterOpts{ Name: "thanos_rule_evaluation_with_warnings_total", Help: "The total number of rule evaluation that were successful but had warnings which can indicate partial error.", @@ -240,13 +242,7 @@ func newRuleMetrics(reg *prometheus.Registry) *RuleMetrics { ) m.ruleEvalWarnings.WithLabelValues(strings.ToLower(storepb.PartialResponseStrategy_ABORT.String())) m.ruleEvalWarnings.WithLabelValues(strings.ToLower(storepb.PartialResponseStrategy_WARN.String())) - if reg != nil { - reg.MustRegister(m.configSuccess) - reg.MustRegister(m.configSuccessTime) - reg.MustRegister(m.duplicatedQuery) - reg.MustRegister(m.rulesLoaded) - reg.MustRegister(m.ruleEvalWarnings) - } + return m } diff --git a/go.mod b/go.mod index 4351bc37d1..780194f68f 100644 --- a/go.mod +++ b/go.mod @@ -38,7 +38,7 @@ require ( github.com/opentracing/opentracing-go v1.1.1-0.20200124165624-2876d2018785 github.com/pkg/errors v0.9.1 github.com/prometheus/alertmanager v0.20.0 - github.com/prometheus/client_golang v1.4.2-0.20200214154132-b25ce2693a6d + github.com/prometheus/client_golang v1.4.2-0.20200228153534-057bfe813ae5 github.com/prometheus/client_model v0.2.0 github.com/prometheus/common v0.9.1 github.com/prometheus/prometheus v1.8.2-0.20200110114423-1e64d757f711 // master ~ v2.15.2 diff --git a/go.sum b/go.sum index 787df65dae..19caf7967b 100644 --- a/go.sum +++ b/go.sum @@ -606,6 +606,8 @@ github.com/prometheus/client_golang v1.2.1 h1:JnMpQc6ppsNgw9QPAGF6Dod479itz7lvls github.com/prometheus/client_golang v1.2.1/go.mod h1:XMU6Z2MjaRKVu/dC1qupJI9SiNkDYzz3xecMgSW/F+U= github.com/prometheus/client_golang v1.4.2-0.20200214154132-b25ce2693a6d h1:6GpNaEnOxPO8IxMm5zmXdIpCGayuQmp7udttdxB2BbM= github.com/prometheus/client_golang v1.4.2-0.20200214154132-b25ce2693a6d/go.mod h1:e9GMxYsXl05ICDXkRhurwBS4Q3OK1iX/F2sw+iXX5zU= +github.com/prometheus/client_golang v1.4.2-0.20200228153534-057bfe813ae5 h1:x2ltTd52P16b4O8OY6iGQDKXVpOYfmC6TVm2lfvbEJE= +github.com/prometheus/client_golang v1.4.2-0.20200228153534-057bfe813ae5/go.mod h1:e9GMxYsXl05ICDXkRhurwBS4Q3OK1iX/F2sw+iXX5zU= github.com/prometheus/client_model v0.0.0-20170216185247-6f3806018612/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo= github.com/prometheus/client_model v0.0.0-20180712105110-5c3871d89910/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo= github.com/prometheus/client_model v0.0.0-20190115171406-56726106282f/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo= From 94a653a0d53318c7aa3ac5688a24b8c2fb0113a7 Mon Sep 17 00:00:00 2001 From: arthur yang Date: Sun, 1 Mar 2020 09:58:40 +0800 Subject: [PATCH 15/18] return errs before set success related metrics Signed-off-by: arthur yang --- cmd/thanos/rule.go | 1 + 1 file changed, 1 insertion(+) diff --git a/cmd/thanos/rule.go b/cmd/thanos/rule.go index 3998f1604b..a7e0ca5623 100644 --- a/cmd/thanos/rule.go +++ b/cmd/thanos/rule.go @@ -775,6 +775,7 @@ func reloadRules(logger log.Logger, if err := ruleMgr.Update(evalInterval, files); err != nil { metrics.configSuccess.Set(0) errs.Add(errors.Wrap(err, "reloading rules failed")) + return errs.Err() } metrics.configSuccess.Set(1) From 315e3c33a88e0c6e62045bc6a909c4f814929f16 Mon Sep 17 00:00:00 2001 From: arthur yang Date: Sun, 1 Mar 2020 12:15:10 +0800 Subject: [PATCH 16/18] revert go.sum go.mod change Signed-off-by: arthur yang --- go.mod | 2 +- go.sum | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/go.mod b/go.mod index 780194f68f..4351bc37d1 100644 --- a/go.mod +++ b/go.mod @@ -38,7 +38,7 @@ require ( github.com/opentracing/opentracing-go v1.1.1-0.20200124165624-2876d2018785 github.com/pkg/errors v0.9.1 github.com/prometheus/alertmanager v0.20.0 - github.com/prometheus/client_golang v1.4.2-0.20200228153534-057bfe813ae5 + github.com/prometheus/client_golang v1.4.2-0.20200214154132-b25ce2693a6d github.com/prometheus/client_model v0.2.0 github.com/prometheus/common v0.9.1 github.com/prometheus/prometheus v1.8.2-0.20200110114423-1e64d757f711 // master ~ v2.15.2 diff --git a/go.sum b/go.sum index 19caf7967b..787df65dae 100644 --- a/go.sum +++ b/go.sum @@ -606,8 +606,6 @@ github.com/prometheus/client_golang v1.2.1 h1:JnMpQc6ppsNgw9QPAGF6Dod479itz7lvls github.com/prometheus/client_golang v1.2.1/go.mod h1:XMU6Z2MjaRKVu/dC1qupJI9SiNkDYzz3xecMgSW/F+U= github.com/prometheus/client_golang v1.4.2-0.20200214154132-b25ce2693a6d h1:6GpNaEnOxPO8IxMm5zmXdIpCGayuQmp7udttdxB2BbM= github.com/prometheus/client_golang v1.4.2-0.20200214154132-b25ce2693a6d/go.mod h1:e9GMxYsXl05ICDXkRhurwBS4Q3OK1iX/F2sw+iXX5zU= -github.com/prometheus/client_golang v1.4.2-0.20200228153534-057bfe813ae5 h1:x2ltTd52P16b4O8OY6iGQDKXVpOYfmC6TVm2lfvbEJE= -github.com/prometheus/client_golang v1.4.2-0.20200228153534-057bfe813ae5/go.mod h1:e9GMxYsXl05ICDXkRhurwBS4Q3OK1iX/F2sw+iXX5zU= github.com/prometheus/client_model v0.0.0-20170216185247-6f3806018612/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo= github.com/prometheus/client_model v0.0.0-20180712105110-5c3871d89910/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo= github.com/prometheus/client_model v0.0.0-20190115171406-56726106282f/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo= From f397bed555977a5e0af1c82dbd3ed90830e86e12 Mon Sep 17 00:00:00 2001 From: arthur yang Date: Tue, 3 Mar 2020 21:30:47 +0800 Subject: [PATCH 17/18] reload webhandler/sighup in one for loop Signed-off-by: arthur yang --- cmd/thanos/rule.go | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/cmd/thanos/rule.go b/cmd/thanos/rule.go index a7e0ca5623..dae59869c1 100644 --- a/cmd/thanos/rule.go +++ b/cmd/thanos/rule.go @@ -246,6 +246,11 @@ func newRuleMetrics(reg *prometheus.Registry) *RuleMetrics { return m } +// reloadErr handlers webhandler and returns err message. +type reloadErr struct { + errMsg chan error +} + // runRule runs a rule evaluation component that continuously evaluates alerting and recording // rules. It sends alert notifications and writes TSDB data for results like a regular Prometheus server. func runRule( @@ -486,6 +491,7 @@ func runRule( } // Handle reload and termination interrupts. + reloadWebhandler := make(chan reloadErr) { ctx, cancel := context.WithCancel(context.Background()) g.Add(func() error { @@ -499,6 +505,12 @@ func runRule( if err := reloadRules(logger, ruleFiles, ruleMgr, evalInterval, metrics); err != nil { level.Error(logger).Log("msg", "reload rules by sighup failed", "err", err) } + case reloadMsg := <-reloadWebhandler: + err := reloadRules(logger, ruleFiles, ruleMgr, evalInterval, metrics) + if err != nil { + level.Error(logger).Log("msg", "reload rules by webhandler failed", "err", err) + } + reloadMsg.errMsg <- err case <-ctx.Done(): return ctx.Err() } @@ -551,8 +563,9 @@ func runRule( } router.WithPrefix(webRoutePrefix).Post("/-/reload", func(w http.ResponseWriter, r *http.Request) { - if err := reloadRules(logger, ruleFiles, ruleMgr, evalInterval, metrics); err != nil { - level.Error(logger).Log("msg", "reload rules by webhandler failed", "err", err) + reloadMsg := reloadErr{make(chan error)} + reloadWebhandler <- reloadMsg + if err := <-reloadMsg.errMsg; err != nil { http.Error(w, err.Error(), http.StatusInternalServerError) } }) From 56334f3eba6455d471f074af00894d9cf4d4879d Mon Sep 17 00:00:00 2001 From: "yapo.yang" Date: Wed, 4 Mar 2020 14:46:22 +0800 Subject: [PATCH 18/18] reload with chan chan error Signed-off-by: yapo.yang --- cmd/thanos/rule.go | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/cmd/thanos/rule.go b/cmd/thanos/rule.go index dae59869c1..297f509265 100644 --- a/cmd/thanos/rule.go +++ b/cmd/thanos/rule.go @@ -246,11 +246,6 @@ func newRuleMetrics(reg *prometheus.Registry) *RuleMetrics { return m } -// reloadErr handlers webhandler and returns err message. -type reloadErr struct { - errMsg chan error -} - // runRule runs a rule evaluation component that continuously evaluates alerting and recording // rules. It sends alert notifications and writes TSDB data for results like a regular Prometheus server. func runRule( @@ -491,7 +486,7 @@ func runRule( } // Handle reload and termination interrupts. - reloadWebhandler := make(chan reloadErr) + reloadWebhandler := make(chan chan error) { ctx, cancel := context.WithCancel(context.Background()) g.Add(func() error { @@ -510,7 +505,7 @@ func runRule( if err != nil { level.Error(logger).Log("msg", "reload rules by webhandler failed", "err", err) } - reloadMsg.errMsg <- err + reloadMsg <- err case <-ctx.Done(): return ctx.Err() } @@ -563,9 +558,9 @@ func runRule( } router.WithPrefix(webRoutePrefix).Post("/-/reload", func(w http.ResponseWriter, r *http.Request) { - reloadMsg := reloadErr{make(chan error)} + reloadMsg := make(chan error) reloadWebhandler <- reloadMsg - if err := <-reloadMsg.errMsg; err != nil { + if err := <-reloadMsg; err != nil { http.Error(w, err.Error(), http.StatusInternalServerError) } })