Skip to content

Commit

Permalink
compact: add schedule-delete and delete-delay
Browse files Browse the repository at this point in the history
Signed-off-by: khyatisoneji <khyatisoneji5@gmail.com>
  • Loading branch information
khyatisoneji committed Feb 22, 2020
1 parent 7515974 commit efe234b
Show file tree
Hide file tree
Showing 5 changed files with 163 additions and 13 deletions.
34 changes: 32 additions & 2 deletions cmd/thanos/compact.go
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,9 @@ func registerCompact(m map[string]setupFunc, app *kingpin.Application) {
compactionConcurrency := cmd.Flag("compact.concurrency", "Number of goroutines to use when compacting groups.").
Default("1").Int()

deleteDelay := modelDuration(cmd.Flag("delete-delay", fmt.Sprintf("Time before a block marked for deletion is deleted from bucket")).
Default("15m"))

selectorRelabelConf := regSelectorRelabelFlags(cmd)

m[component.Compact.String()] = func(g *run.Group, logger log.Logger, reg *prometheus.Registry, tracer opentracing.Tracer, _ <-chan struct{}, _ bool) error {
Expand All @@ -130,6 +133,7 @@ func registerCompact(m map[string]setupFunc, app *kingpin.Application) {
*dataDir,
objStoreConfig,
time.Duration(*consistencyDelay),
time.Duration(*deleteDelay),
*haltOnError,
*acceptMalformedIndex,
*wait,
Expand Down Expand Up @@ -158,6 +162,7 @@ func runCompact(
dataDir string,
objStoreConfig *extflag.PathOrContent,
consistencyDelay time.Duration,
deleteDelay time.Duration,
haltOnError bool,
acceptMalformedIndex bool,
wait bool,
Expand Down Expand Up @@ -187,7 +192,13 @@ func runCompact(
Name: "thanos_compactor_aborted_partial_uploads_deletion_attempts_total",
Help: "Total number of started deletions of blocks that are assumed aborted and only partially uploaded.",
})
reg.MustRegister(halted, retried, iterations, partialUploadDeleteAttempts)
deleteDelayMetric := prometheus.NewGaugeFunc(prometheus.GaugeOpts{
Name: "thanos_delete_delay_seconds",
Help: "Configured delete delay in seconds.",
}, func() float64 {
return deleteDelay.Seconds()
})
reg.MustRegister(halted, retried, iterations, partialUploadDeleteAttempts, deleteDelayMetric)

downsampleMetrics := newDownsampleMetrics(reg)

Expand Down Expand Up @@ -285,6 +296,7 @@ func runCompact(
return errors.Wrap(err, "clean working downsample directory")
}

blockDeletionScheduler := compact.NewScheduleBlockDelete(logger, compactDir, bkt, deleteDelay)
compactor, err := compact.NewBucketCompactor(logger, sy, comp, compactDir, bkt, concurrency)
if err != nil {
cancel()
Expand Down Expand Up @@ -330,10 +342,28 @@ func runCompact(
return errors.Wrap(err, fmt.Sprintf("retention failed"))
}

compact.BestEffortCleanAbortedPartialUploads(ctx, logger, metaFetcher, bkt, partialUploadDeleteAttempts)
compact.BestEffortCleanAbortedPartialUploads(ctx, logger, metaFetcher, blockDeletionScheduler, partialUploadDeleteAttempts)
return nil
}

g.Add(func() error {
if !wait {
return blockDeletionScheduler.ScheduleDelete(ctx)
}

// --wait=true is specified.
return runutil.Repeat(5*time.Minute, ctx.Done(), func() error {
err := blockDeletionScheduler.ScheduleDelete(ctx)
if err == nil {
return nil
}

return errors.Wrap(err, "error cleaning blocks")
})
}, func(error) {
cancel()
})

g.Add(func() error {
defer runutil.CloseWithLogOnErr(logger, bkt, "bucket client")

Expand Down
15 changes: 8 additions & 7 deletions docs/components/compact.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@ It is generally not semantically concurrency safe and must be deployed as a sing

It is also responsible for downsampling of data:

* creating 5m downsampling for blocks larger than **40 hours** (2d, 2w)
* creating 1h downsampling for blocks larger than **10 days** (2w).
- creating 5m downsampling for blocks larger than **40 hours** (2d, 2w)
- creating 1h downsampling for blocks larger than **10 days** (2w).

Example:

Expand All @@ -35,9 +35,9 @@ On-disk data is safe to delete between restarts and should be the first attempt
Resolution - distance between data points on your graphs. E.g.
* raw - the same as scrape interval at the moment of data ingestion
* 5m - data point is every 5 minutes
* 1h - data point is every 1h
- raw - the same as scrape interval at the moment of data ingestion
- 5m - data point is every 5 minutes
- 1h - data point is every 1h
Keep in mind that the initial goal of downsampling is not saving disk space (read further for elaboration on storage space consumption). The goal of downsampling is providing an opportunity to get fast results for range queries of big time intervals like months or years. In other words, if you set `--retention.resolution-raw` less than `--retention.resolution-5m` and `--retention.resolution-1h` - you might run into a problem of not being able to "zoom in" to your historical data.

Expand Down Expand Up @@ -66,7 +66,8 @@ compacting blocks from an instance even when a Prometheus instance goes down for

## Flags

[embedmd]:# (flags/compact.txt $)
[embedmd]: # "flags/compact.txt $"

```$
usage: thanos compact [<flags>]
Expand Down Expand Up @@ -144,5 +145,5 @@ Flags:
selecting blocks. It follows native Prometheus
relabel-config syntax. See format details:
https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config
--delete-delay=15m Time before a block marked for deletion is deleted from bucket.
```
5 changes: 2 additions & 3 deletions pkg/compact/clean.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@ import (
"github.com/oklog/ulid"
"github.com/prometheus/client_golang/prometheus"
"github.com/thanos-io/thanos/pkg/block"
"github.com/thanos-io/thanos/pkg/objstore"
)

const (
Expand All @@ -21,7 +20,7 @@ const (
PartialUploadThresholdAge = 2 * 24 * time.Hour
)

func BestEffortCleanAbortedPartialUploads(ctx context.Context, logger log.Logger, fetcher block.MetadataFetcher, bkt objstore.Bucket, deleteAttempts prometheus.Counter) {
func BestEffortCleanAbortedPartialUploads(ctx context.Context, logger log.Logger, fetcher block.MetadataFetcher, blockDeletionScheduler *ScheduleBlockDelete, deleteAttempts prometheus.Counter) {
level.Info(logger).Log("msg", "started cleaning of aborted partial uploads")
_, partial, err := fetcher.Fetch(ctx)
if err != nil {
Expand All @@ -41,7 +40,7 @@ func BestEffortCleanAbortedPartialUploads(ctx context.Context, logger log.Logger
}

deleteAttempts.Inc()
if err := block.Delete(ctx, logger, bkt, id); err != nil {
if err := blockDeletionScheduler.MarkBlockForDeletion(id); err != nil {
level.Warn(logger).Log("msg", "failed to delete aborted partial upload; skipping", "block", id, "thresholdAge", PartialUploadThresholdAge, "err", err)
return
}
Expand Down
10 changes: 9 additions & 1 deletion pkg/compact/clean_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ import (
"bytes"
"context"
"encoding/json"
"io/ioutil"
"os"
"path"
"testing"
"time"
Expand All @@ -28,6 +30,12 @@ func TestBestEffortCleanAbortedPartialUploads(t *testing.T) {
bkt := inmem.NewBucket()
logger := log.NewNopLogger()

// Create fresh, empty directory for actual test.
dir, err := ioutil.TempDir("", "test-clean")
testutil.Ok(t, err)
defer func() { testutil.Ok(t, os.RemoveAll(dir)) }()

blockDeletionScheduler := NewScheduleBlockDelete(logger, dir, nil, 15*time.Minute)
metaFetcher, err := block.NewMetaFetcher(nil, 32, bkt, "", nil)
testutil.Ok(t, err)

Expand Down Expand Up @@ -58,7 +66,7 @@ func TestBestEffortCleanAbortedPartialUploads(t *testing.T) {
testutil.Ok(t, bkt.Upload(ctx, path.Join(shouldIgnoreID2.String(), "chunks", "000001"), &fakeChunk))

deleteAttempts := prometheus.NewCounter(prometheus.CounterOpts{})
BestEffortCleanAbortedPartialUploads(ctx, logger, metaFetcher, bkt, deleteAttempts)
BestEffortCleanAbortedPartialUploads(ctx, logger, metaFetcher, blockDeletionScheduler, deleteAttempts)
testutil.Equals(t, 1.0, promtest.ToFloat64(deleteAttempts))

exists, err := bkt.Exists(ctx, path.Join(shouldDeleteID.String(), "chunks", "000001"))
Expand Down
112 changes: 112 additions & 0 deletions pkg/compact/schedule_delete.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
// Copyright (c) The Thanos Authors.
// Licensed under the Apache License 2.0.

package compact

import (
"context"
"encoding/json"
"io/ioutil"
"os"
"path/filepath"
"strings"
"time"

"github.com/go-kit/kit/log"
"github.com/go-kit/kit/log/level"
"github.com/oklog/ulid"
"github.com/pkg/errors"
"github.com/thanos-io/thanos/pkg/block"
"github.com/thanos-io/thanos/pkg/objstore"
"github.com/thanos-io/thanos/pkg/runutil"
)

// ScheduleDeleteFilename is the name of the local JSON marker file that records
// when a block was marked for deletion. MarkBlockForDeletion writes it under
// <dir>/<block ID>/, and ScheduleDelete picks up every walked path that ends
// with this name.
const ScheduleDeleteFilename = "compactor-meta.json"

// CompactorMeta is the JSON document stored in a ScheduleDeleteFilename file.
// It identifies a block and records when that block was marked for deletion.
type CompactorMeta struct {
// ID is the ULID of the block that was marked for deletion.
ID ulid.ULID `json:"id"`
// DeletionTime is the moment the block was marked (not the moment it is
// actually removed), expressed in Unix seconds: MarkBlockForDeletion fills
// it from time.Now().Unix().
DeletionTime int64 `json:"deletion_time"`
}

// ScheduleBlockDelete implements two-step block deletion: MarkBlockForDeletion
// drops a marker file for a block under dir, and ScheduleDelete later walks dir
// and deletes marked blocks from the bucket.
type ScheduleBlockDelete struct {
// dir is the local directory that holds the per-block marker files
// (<dir>/<block ID>/ScheduleDeleteFilename).
dir string
// logger receives warnings from this type and is passed on to block.Delete.
logger log.Logger
// deleteDelay is the configured delay between marking a block and deleting it.
deleteDelay time.Duration
// bkt is the object storage bucket that marked blocks are deleted from.
bkt objstore.Bucket
}

// NewScheduleBlockDelete returns a ScheduleBlockDelete that keeps its marker
// files under dir, deletes blocks from bkt, and uses deleteDelay as the delay
// between marking and deletion. It performs no I/O.
func NewScheduleBlockDelete(logger log.Logger, dir string, bkt objstore.Bucket, deleteDelay time.Duration) *ScheduleBlockDelete {
	scheduler := new(ScheduleBlockDelete)
	scheduler.logger = logger
	scheduler.dir = dir
	scheduler.bkt = bkt
	scheduler.deleteDelay = deleteDelay
	return scheduler
}

// ScheduleDelete walks s.dir and, for every marker file (a path ending in
// ScheduleDeleteFilename) whose block was marked more than deleteDelay ago,
// deletes the block from the bucket and then removes the block's local
// directory <dir>/<block ID> together with its marker.
//
// Markers that are not yet due are left untouched, so the method is meant to be
// called repeatedly. It returns the first error hit while reading or decoding a
// marker, deleting a block from the bucket, or removing the local directory;
// paths that cannot be visited are logged and skipped.
func (s *ScheduleBlockDelete) ScheduleDelete(ctx context.Context) error {
	return filepath.Walk(s.dir, func(path string, _ os.FileInfo, walkErr error) error {
		if walkErr != nil {
			// Stay best-effort: an unreadable entry must not abort the whole
			// scan. Entries that vanished are expected, because block
			// directories are removed below while the walk is in progress.
			if !os.IsNotExist(walkErr) {
				level.Warn(s.logger).Log("msg", "skipping path while scanning for blocks marked for deletion", "path", path, "err", walkErr)
			}
			return nil
		}
		if !strings.HasSuffix(path, ScheduleDeleteFilename) {
			return nil
		}

		compactorMetaBytes, err := ioutil.ReadFile(path)
		if err != nil {
			return errors.Wrap(err, "read compactor meta")
		}

		compactorMeta := CompactorMeta{}
		if err := json.Unmarshal(compactorMetaBytes, &compactorMeta); err != nil {
			return errors.Wrap(err, "unmarshal compactor meta")
		}

		// DeletionTime is stored in Unix seconds; compare durations directly
		// instead of mixing seconds with milliseconds.
		markedAt := time.Unix(compactorMeta.DeletionTime, 0)
		if time.Since(markedAt) <= s.deleteDelay {
			return nil
		}

		if err := block.Delete(ctx, s.logger, s.bkt, compactorMeta.ID); err != nil {
			return errors.Wrap(err, "delete block")
		}

		if err := os.RemoveAll(filepath.Join(s.dir, compactorMeta.ID.String())); err != nil {
			return errors.Wrap(err, "delete compactor-meta.json")
		}
		return nil
	})
}

// MarkBlockForDeletion records that block id should be deleted by writing a
// marker file <dir>/<id>/ScheduleDeleteFilename containing the block ID and the
// current time in Unix seconds.
//
// If the marker already exists, the original marking time is kept, a warning is
// logged and nil is returned. The block directory is created when missing,
// because blocks that were never downloaded have no local directory yet.
// Errors from creating the directory or from creating, encoding or closing the
// file are returned wrapped with context.
func (s *ScheduleBlockDelete) MarkBlockForDeletion(id ulid.ULID) error {
	path := filepath.Join(s.dir, id.String(), ScheduleDeleteFilename)
	if _, err := os.Stat(path); err == nil {
		level.Warn(s.logger).Log("msg", "compactor-meta already exists for block", "block", id.String())
		return nil
	}

	// os.Create does not create parent directories.
	if err := os.MkdirAll(filepath.Dir(path), 0750); err != nil {
		return errors.Wrap(err, "create directory for compactor meta")
	}

	f, err := os.Create(path)
	if err != nil {
		return errors.Wrap(err, "create compactor meta")
	}

	compactorMeta := &CompactorMeta{
		ID:           id,
		DeletionTime: time.Now().Unix(),
	}

	enc := json.NewEncoder(f)
	enc.SetIndent("", "\t")

	if err := enc.Encode(compactorMeta); err != nil {
		// The encode error is the one worth returning; a close failure is
		// only logged.
		runutil.CloseWithLogOnErr(s.logger, f, "write meta file close")
		return errors.Wrap(err, "encode compactor meta")
	}
	if err := f.Close(); err != nil {
		return errors.Wrap(err, "close compactor meta")
	}

	return nil
}

0 comments on commit efe234b

Please sign in to comment.