From 98f200d66858624b5084b934aa2f5d1d68c23dab Mon Sep 17 00:00:00 2001 From: Michael Hoffmann Date: Thu, 2 May 2024 14:04:52 +0200 Subject: [PATCH] Sidecar: wait for prometheus on startup Signed-off-by: Michael Hoffmann --- CHANGELOG.md | 2 + cmd/thanos/sidecar.go | 99 ++++++++++++++++++++++++++----------------- 2 files changed, 63 insertions(+), 38 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fb9de3966a..5e746aaa12 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,8 @@ We use *breaking :warning:* to mark changes that are not backward compatible (re ### Fixed +- [#7323](https://github.com/thanos-io/thanos/pull/7323) Sidecar: wait for prometheus on startup + ### Added - [#7317](https://github.com/thanos-io/thanos/pull/7317) Tracing: allow specifying resource attributes for the OTLP configuration. diff --git a/cmd/thanos/sidecar.go b/cmd/thanos/sidecar.go index 9b8c2feded..95ad4ba693 100644 --- a/cmd/thanos/sidecar.go +++ b/cmd/thanos/sidecar.go @@ -172,64 +172,87 @@ func runSidecar( Help: "Boolean indicator whether the sidecar can reach its Prometheus peer.", }) - ctx, cancel := context.WithCancel(context.Background()) - g.Add(func() error { - // Only check Prometheus's flags when upload is enabled. - if uploads { - // Check prometheus's flags to ensure same sidecar flags. - if err := validatePrometheus(ctx, m.client, logger, conf.shipper.ignoreBlockSize, m); err != nil { - return errors.Wrap(err, "validate Prometheus flags") - } - } + ctx := context.Background() + // Only check Prometheus's flags when upload is enabled. + if uploads { + // Check prometheus's flags to ensure same sidecar flags. + // We retry infinitely until we validated prometheus flags + err := runutil.Retry(conf.prometheus.getConfigInterval, ctx.Done(), func() error { + iterCtx, iterCancel := context.WithTimeout(context.Background(), conf.prometheus.getConfigTimeout) + defer iterCancel() - // We retry infinitely until we reach and fetch BuildVersion from our Prometheus. - err := runutil.Retry(2*time.Second, ctx.Done(), func() error { - if err := m.BuildVersion(ctx); err != nil { + if err := validatePrometheus(iterCtx, m.client, logger, conf.shipper.ignoreBlockSize, m); err != nil { level.Warn(logger).Log( - "msg", "failed to fetch prometheus version. Is Prometheus running? Retrying", + "msg", "failed to validate prometheus flags. Is Prometheus running? Retrying", "err", err, ) return err } level.Info(logger).Log( - "msg", "successfully loaded prometheus version", + "msg", "successfully validated prometheus flags", ) return nil }) if err != nil { - return errors.Wrap(err, "failed to get prometheus version") + return errors.Wrap(err, "failed to validate prometheus flags") } + } - // Blocking query of external labels before joining as a Source Peer into gossip. - // We retry infinitely until we reach and fetch labels from our Prometheus. - err = runutil.Retry(2*time.Second, ctx.Done(), func() error { - if err := m.UpdateLabels(ctx); err != nil { - level.Warn(logger).Log( - "msg", "failed to fetch initial external labels. Is Prometheus running? Retrying", - "err", err, - ) - promUp.Set(0) - statusProber.NotReady(err) - return err - } + // We retry infinitely until we reach and fetch BuildVersion from our Prometheus. + err := runutil.Retry(conf.prometheus.getConfigInterval, ctx.Done(), func() error { + iterCtx, iterCancel := context.WithTimeout(context.Background(), conf.prometheus.getConfigTimeout) + defer iterCancel() - level.Info(logger).Log( - "msg", "successfully loaded prometheus external labels", - "external_labels", m.Labels().String(), + if err := m.BuildVersion(iterCtx); err != nil { + level.Warn(logger).Log( + "msg", "failed to fetch prometheus version. Is Prometheus running? Retrying", + "err", err, ) - promUp.Set(1) - statusProber.Ready() - return nil - }) - if err != nil { - return errors.Wrap(err, "initial external labels query") + return err } - if len(m.Labels()) == 0 { - return errors.New("no external labels configured on Prometheus server, uniquely identifying external labels must be configured; see https://thanos.io/tip/thanos/storage.md#external-labels for details.") + level.Info(logger).Log( + "msg", "successfully loaded prometheus version", + ) + return nil + }) + if err != nil { + return errors.Wrap(err, "failed to get prometheus version") + } + + // Blocking query of external labels before joining as a Source Peer into gossip. + // We retry infinitely until we reach and fetch labels from our Prometheus. + err = runutil.Retry(conf.prometheus.getConfigInterval, ctx.Done(), func() error { + iterCtx, iterCancel := context.WithTimeout(context.Background(), conf.prometheus.getConfigTimeout) + defer iterCancel() + + if err := m.UpdateLabels(iterCtx); err != nil { + level.Warn(logger).Log( + "msg", "failed to fetch initial external labels. Is Prometheus running? Retrying", + "err", err, + ) + return err } + level.Info(logger).Log( + "msg", "successfully loaded prometheus external labels", + "external_labels", m.Labels().String(), + ) + return nil + }) + if err != nil { + return errors.Wrap(err, "initial external labels query") + } + + if len(m.Labels()) == 0 { + return errors.New("no external labels configured on Prometheus server, uniquely identifying external labels must be configured; see https://thanos.io/tip/thanos/storage.md#external-labels for details.") + } + promUp.Set(1) + statusProber.Ready() + + ctx, cancel := context.WithCancel(context.Background()) + g.Add(func() error { // Periodically query the Prometheus config. We use this as a heartbeat as well as for updating // the external labels we apply. return runutil.Repeat(conf.prometheus.getConfigInterval, ctx.Done(), func() error {