Skip to content

Commit

Permalink
fix(sidecar): Allow sidecar to not crash on bucket initialization #7585
Browse files Browse the repository at this point in the history
This commit allows the sidecar to continue serving the Prometheus read path if the object store is not available at startup.
Bucket creation will be attempted again on the next upload.

This commit adds a new metric, thanos_sidecar_shipper_up, which can be used to alert when bucket initialization fails.

Signed-off-by: Amaury Decrême <amaury.decreme@gmail.com>
  • Loading branch information
amaury-d committed Sep 10, 2024
1 parent 27412d2 commit 6ec7fdc
Show file tree
Hide file tree
Showing 5 changed files with 33 additions and 23 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
.envrc
.bin

/prometheus
/thanos
Expand Down
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ We use *breaking :warning:* to mark changes that are not backward compatible (re

### Fixed

- [#7585](https://github.com/thanos-io/thanos/pull/7585) fix(sidecar): Allow sidecar to not crash on startup if objstore is not available
- [#7511](https://github.com/thanos-io/thanos/pull/7511) Query Frontend: fix doubled gzip compression for response body.
- [#7592](https://github.com/thanos-io/thanos/pull/7592) Ruler: Only increment `thanos_rule_evaluation_with_warnings_total` metric for non PromQL warnings.
- [#7614](https://github.com/thanos-io/thanos/pull/7614) *: fix debug log formatting.
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
<p align="center"><img src="docs/img/Thanos-logo_fullmedium.png" alt="Thanos Logo"></p>

[![Latest Release](https://img.shields.io/github/release/thanos-io/thanos.svg?style=flat-square)](https://github.com/thanos-io/thanos/releases/latest) [![Go Report Card](https://goreportcard.com/badge/github.com/thanos-io/thanos)](https://goreportcard.com/report/github.com/thanos-io/thanos) [![Go Code reference](https://img.shields.io/badge/code%20reference-go.dev-darkblue.svg)](https://pkg.go.dev/github.com/thanos-io/thanos?tab=subdirectories) [![Slack](https://img.shields.io/badge/join%20slack-%23thanos-brightgreen.svg)](https://slack.cncf.io/) [![Netlify Status](https://api.netlify.com/api/v1/badges/664a5091-934c-4b0e-a7b6-bc12f822a590/deploy-status)](https://app.netlify.com/sites/thanos-io/deploys) [![CII Best Practices](https://bestpractices.coreinfrastructure.org/projects/3048/badge)](https://bestpractices.coreinfrastructure.org/projects/3048)
[![Latest Release](https://img.shields.io/github/release/thanos-io/thanos.svg?style=flat-square)](https://github.com/thanos-io/thanos/releases/latest) [![Go Report Card](https://goreportcard.com/badge/github.com/thanos-io/thanos)](https://goreportcard.com/report/github.com/thanos-io/thanos) [![Go Code reference](https://img.shields.io/badge/code%20reference-go.dev-darkblue.svg)](https://pkg.go.dev/github.com/thanos-io/thanos?tab=subdirectories) [![Slack](https://img.shields.io/badge/join%20slack-%23thanos-brightgreen.svg)](https://slack.cncf.io/) [![Netlify Status](https://api.netlify.com/api/v1/badges/664a5091-934c-4b0e-a7b6-bc12f822a590/deploy-status)](https://app.netlify.com/sites/thanos-io/deploys) [![OpenSSF Best Practices](https://www.bestpractices.dev/projects/3048/badge)](https://www.bestpractices.dev/en/projects/3048)

[![CI](https://github.com/thanos-io/thanos/workflows/CI/badge.svg)](https://github.com/thanos-io/thanos/actions?query=workflow%3ACI) [![CI](https://circleci.com/gh/thanos-io/thanos.svg?style=svg)](https://circleci.com/gh/thanos-io/thanos) [![go](https://github.com/thanos-io/thanos/workflows/go/badge.svg)](https://github.com/thanos-io/thanos/actions?query=workflow%3Ago) [![react](https://github.com/thanos-io/thanos/workflows/react/badge.svg)](https://github.com/thanos-io/thanos/actions?query=workflow%3Areact) [![docs](https://github.com/thanos-io/thanos/workflows/docs/badge.svg)](https://github.com/thanos-io/thanos/actions?query=workflow%3Adocs) [![Gitpod ready-to-code](https://img.shields.io/badge/Gitpod-ready--to--code-blue?logo=gitpod)](https://gitpod.io/#https://github.com/thanos-io/thanos) [![Open in GitHub Codespaces](https://github.com/codespaces/badge.svg)](https://github.com/codespaces/new?hide_repo_select=true&ref=main&repo=109162639)

Expand Down
50 changes: 29 additions & 21 deletions cmd/thanos/sidecar.go
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ func runSidecar(

limitMinTime: conf.limitMinTime,
client: promclient.NewWithTracingClient(logger, httpClient, "thanos-sidecar"),
shipper: nil,
}

confContentYaml, err := conf.objStore.Content()
Expand Down Expand Up @@ -365,29 +366,18 @@ func runSidecar(
})
}
if uploads {
// The background shipper continuously scans the data directory and uploads
// new blocks to Google Cloud Storage or an S3-compatible storage service.
bkt, err := client.NewBucket(logger, confContentYaml, component.Sidecar.String())
if err != nil {
return err
}
bkt = objstoretracing.WrapWithTraces(objstore.WrapWithMetrics(bkt, extprom.WrapRegistererWithPrefix("thanos_", reg), bkt.Name()))

// Ensure we close up everything properly.
defer func() {
if err != nil {
runutil.CloseWithLogOnErr(logger, bkt, "bucket client")
}
}()
shipperUp := promauto.With(reg).NewGauge(prometheus.GaugeOpts{
Name: "thanos_sidecar_shipper_up",
Help: "Boolean indicator whether the sidecar shipper is running.",
})
shipperUp.Set(0)

if err := promclient.IsWALDirAccessible(conf.tsdb.path); err != nil {
level.Error(logger).Log("err", err)
}

ctx, cancel := context.WithCancel(context.Background())
g.Add(func() error {
defer runutil.CloseWithLogOnErr(logger, bkt, "bucket client")

promReadyTimeout := conf.prometheus.readyTimeout
extLabelsCtx, cancel := context.WithTimeout(ctx, promReadyTimeout)
defer cancel()
Expand All @@ -402,15 +392,32 @@ func runSidecar(
}

uploadCompactedFunc := func() bool { return conf.shipper.uploadCompacted }
s := shipper.New(logger, reg, conf.tsdb.path, bkt, m.Labels, metadata.SidecarSource,
uploadCompactedFunc, conf.shipper.allowOutOfOrderUpload, metadata.HashFunc(conf.shipper.hashFunc), conf.shipper.metaFileName)

return runutil.Repeat(30*time.Second, ctx.Done(), func() error {
if uploaded, err := s.Sync(ctx); err != nil {
if m.shipper == nil {
bkt, err := client.NewBucket(logger, confContentYaml, component.Sidecar.String())
if err != nil {
level.Warn(logger).Log("err", err, "msg", "Failed to create bucket. Sidecar will start without upload feature and will retry later.")
return nil // Allow sidecar to start without bucket.
}
bkt = objstoretracing.WrapWithTraces(objstore.WrapWithMetrics(bkt, extprom.WrapRegistererWithPrefix("thanos_", reg), bkt.Name()))
shipperUp.Set(1)
level.Debug(logger).Log("msg", "Bucket created")

defer runutil.CloseWithLogOnErr(logger, bkt, "bucket client")

m.shipper = shipper.New(logger, reg, conf.tsdb.path, bkt, m.Labels, metadata.SidecarSource,
uploadCompactedFunc, conf.shipper.allowOutOfOrderUpload, metadata.HashFunc(conf.shipper.hashFunc), conf.shipper.metaFileName)
level.Debug(logger).Log("msg", "Shipper created")
}

// The background shipper continuously scans the data directory and uploads
// new blocks to Google Cloud Storage or an S3-compatible storage service.
if uploaded, err := m.shipper.Sync(ctx); err != nil {
level.Warn(logger).Log("err", err, "uploaded", uploaded)
}

minTime, _, err := s.Timestamps()
minTime, _, err := m.shipper.Timestamps()
if err != nil {
level.Warn(logger).Log("msg", "reading timestamps failed", "err", err)
return nil
Expand Down Expand Up @@ -474,7 +481,8 @@ type promMetadata struct {
promVersion string
limitMinTime thanosmodel.TimeOrDurationValue

client *promclient.Client
client *promclient.Client
shipper *shipper.Shipper
}

func (s *promMetadata) UpdateLabels(ctx context.Context) error {
Expand Down
2 changes: 1 addition & 1 deletion docs/components/tools.md
Original file line number Diff line number Diff line change
Expand Up @@ -950,7 +950,7 @@ Flags:
--rewrite.to-relabel-config-file=<file-path>
Path to YAML file that contains relabel configs
that will be applied to blocks
--tmp.dir="/tmp/thanos-rewrite"
--tmp.dir="/var/folders/q2/s4g6w4q96y97d2x3slxq0qq40000gp/T/thanos-rewrite"
Working directory for temporary files
--tracing.config=<content>
Alternative to 'tracing.config-file' flag
Expand Down

0 comments on commit 6ec7fdc

Please sign in to comment.