Skip to content

Commit 79cccfd

Browse files
authored
feat(blooms): Add task timing and sizing metrics (#15032)
1 parent a2b66d3 commit 79cccfd

File tree

5 files changed

+93
-24
lines changed

5 files changed

+93
-24
lines changed

pkg/bloombuild/planner/metrics.go

+9
Original file line number | Diff line number | Diff line change
@@ -40,6 +40,7 @@ type Metrics struct {
4040
tenantsDiscovered prometheus.Counter
4141
tenantTasksPlanned *prometheus.GaugeVec
4242
tenantTasksCompleted *prometheus.GaugeVec
43+
tenantTasksTiming *prometheus.HistogramVec
4344

4445
// Retention metrics
4546
retentionRunning prometheus.Gauge
@@ -166,6 +167,14 @@ func NewMetrics(
166167
Name: "tenant_tasks_completed",
167168
Help: "Number of tasks completed for a tenant during the current build iteration.",
168169
}, []string{"tenant", "status"}),
170+
tenantTasksTiming: promauto.With(r).NewHistogramVec(prometheus.HistogramOpts{
171+
Namespace: metricsNamespace,
172+
Subsystem: metricsSubsystem,
173+
Name: "tenant_tasks_time_seconds",
174+
Help: "Time spent building tasks for a tenant during the current build iteration.",
175+
// 60 buckets in steps of 1 minute: 1s --> 3541s (~59m); longer tasks fall into +Inf
176+
Buckets: prometheus.LinearBuckets(1, 60, 60),
177+
}, []string{"tenant", "status"}),
169178

170179
// Retention
171180
retentionRunning: promauto.With(r).NewGauge(prometheus.GaugeOpts{

pkg/bloombuild/planner/planner.go

+18-12
Original file line number | Diff line number | Diff line change
@@ -49,7 +49,8 @@ type Planner struct {
4949
tsdbStore common.TSDBStore
5050
bloomStore bloomshipper.StoreBase
5151

52-
tasksQueue *queue.Queue
52+
tasksQueue *queue.Queue
53+
planFactory *strategies.Factory
5354

5455
metrics *Metrics
5556
logger log.Logger
@@ -86,14 +87,15 @@ func New(
8687
}
8788

8889
p := &Planner{
89-
cfg: cfg,
90-
limits: limits,
91-
schemaCfg: schemaCfg,
92-
tsdbStore: tsdbStore,
93-
bloomStore: bloomStore,
94-
tasksQueue: tasksQueue,
95-
metrics: NewMetrics(r, tasksQueue.GetConnectedConsumersMetric),
96-
logger: logger,
90+
cfg: cfg,
91+
limits: limits,
92+
schemaCfg: schemaCfg,
93+
tsdbStore: tsdbStore,
94+
bloomStore: bloomStore,
95+
tasksQueue: tasksQueue,
96+
planFactory: strategies.NewFactory(limits, strategies.NewMetrics(r), logger),
97+
metrics: NewMetrics(r, tasksQueue.GetConnectedConsumersMetric),
98+
logger: logger,
9799
}
98100

99101
p.retentionManager = NewRetentionManager(
@@ -370,7 +372,7 @@ func (p *Planner) computeTasks(
370372
table config.DayTable,
371373
tenant string,
372374
) ([]*protos.Task, []bloomshipper.Meta, error) {
373-
strategy, err := strategies.NewStrategy(tenant, p.limits, p.logger)
375+
strategy, err := p.planFactory.GetStrategy(tenant)
374376
if err != nil {
375377
return nil, nil, fmt.Errorf("error creating strategy: %w", err)
376378
}
@@ -770,8 +772,10 @@ func (p *Planner) BuilderLoop(builder protos.PlannerForBuilder_BuilderLoopServer
770772
continue
771773
}
772774

775+
startTime := time.Now()
773776
result, err := p.forwardTaskToBuilder(builder, builderID, task)
774777
if err != nil {
778+
p.metrics.tenantTasksTiming.WithLabelValues(task.Tenant, statusFailure).Observe(time.Since(startTime).Seconds())
775779
maxRetries := p.limits.BloomTaskMaxRetries(task.Tenant)
776780
if maxRetries > 0 && int(task.timesEnqueued.Load()) >= maxRetries {
777781
p.tasksQueue.Release(task.ProtoTask)
@@ -811,10 +815,12 @@ func (p *Planner) BuilderLoop(builder protos.PlannerForBuilder_BuilderLoopServer
811815

812816
level.Debug(logger).Log(
813817
"msg", "task completed",
814-
"duration", time.Since(task.queueTime).Seconds(),
818+
"timeSinceEnqueued", time.Since(task.queueTime).Seconds(),
819+
"buildTime", time.Since(startTime).Seconds(),
815820
"retries", task.timesEnqueued.Load()-1, // -1 because the first enqueue is not a retry
816821
)
817822
p.tasksQueue.Release(task.ProtoTask)
823+
p.metrics.tenantTasksTiming.WithLabelValues(task.Tenant, statusSuccess).Observe(time.Since(startTime).Seconds())
818824

819825
// Send the result back to the task. The channel is buffered, so this should not block.
820826
task.resultsChannel <- result
@@ -866,7 +872,7 @@ func (p *Planner) forwardTaskToBuilder(
866872
case err := <-errCh:
867873
return nil, err
868874
case <-timeout:
869-
return nil, fmt.Errorf("timeout waiting for response from builder (%s)", builderID)
875+
return nil, fmt.Errorf("timeout (%s) waiting for response from builder (%s)", taskTimeout, builderID)
870876
}
871877
}
872878

pkg/bloombuild/planner/strategies/chunksize.go

+33-5
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,8 @@ import (
99
"github.com/dustin/go-humanize"
1010
"github.com/go-kit/log"
1111
"github.com/go-kit/log/level"
12+
"github.com/prometheus/client_golang/prometheus"
13+
"github.com/prometheus/client_golang/prometheus/promauto"
1214
"github.com/prometheus/common/model"
1315
"github.com/prometheus/prometheus/model/labels"
1416

@@ -21,22 +23,47 @@ import (
2123
"github.com/grafana/loki/v3/pkg/storage/stores/shipper/indexshipper/tsdb/index"
2224
)
2325

26+
const (
27+
metricsNamespace = "loki"
28+
metricsSubsystem = "bloomplanner"
29+
)
30+
31+
type ChunkSizeStrategyMetrics struct {
32+
tenantTaskSize *prometheus.HistogramVec
33+
}
34+
35+
func NewChunkSizeStrategyMetrics(r prometheus.Registerer) *ChunkSizeStrategyMetrics {
36+
return &ChunkSizeStrategyMetrics{
37+
tenantTaskSize: promauto.With(r).NewHistogramVec(prometheus.HistogramOpts{
38+
Namespace: metricsNamespace,
39+
Subsystem: metricsSubsystem,
40+
Name: "tenant_task_size_bytes",
41+
Help: "Size of tasks generated by the chunk size strategy",
42+
// 1GB --> 512GB
43+
Buckets: prometheus.ExponentialBuckets(1e9, 2, 10),
44+
}, []string{"tenant"}),
45+
}
46+
}
47+
2448
type ChunkSizeStrategyLimits interface {
2549
BloomTaskTargetSeriesChunksSizeBytes(tenantID string) uint64
2650
}
2751

2852
type ChunkSizeStrategy struct {
29-
limits ChunkSizeStrategyLimits
30-
logger log.Logger
53+
limits ChunkSizeStrategyLimits
54+
metrics *ChunkSizeStrategyMetrics
55+
logger log.Logger
3156
}
3257

3358
func NewChunkSizeStrategy(
3459
limits ChunkSizeStrategyLimits,
60+
metrics *ChunkSizeStrategyMetrics,
3561
logger log.Logger,
3662
) (*ChunkSizeStrategy, error) {
3763
return &ChunkSizeStrategy{
38-
limits: limits,
39-
logger: logger,
64+
limits: limits,
65+
metrics: metrics,
66+
logger: logger,
4067
}, nil
4168
}
4269

@@ -82,8 +109,9 @@ func (s *ChunkSizeStrategy) Plan(
82109
continue
83110
}
84111

85-
bounds := series.Bounds()
112+
s.metrics.tenantTaskSize.WithLabelValues(tenant).Observe(float64(series.Size()))
86113

114+
bounds := series.Bounds()
87115
blocks, err := getBlocksMatchingBounds(metas, bounds)
88116
if err != nil {
89117
return nil, fmt.Errorf("failed to get blocks matching bounds: %w", err)

pkg/bloombuild/planner/strategies/chunksize_test.go

+2-1
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,7 @@ import (
55
"testing"
66

77
"github.com/go-kit/log"
8+
"github.com/prometheus/client_golang/prometheus"
89
"github.com/stretchr/testify/require"
910

1011
"github.com/grafana/loki/v3/pkg/bloombuild/planner/plannertest"
@@ -228,7 +229,7 @@ func Test_ChunkSizeStrategy_Plan(t *testing.T) {
228229
logger := log.NewNopLogger()
229230
//logger := log.NewLogfmtLogger(os.Stdout)
230231

231-
strategy, err := NewChunkSizeStrategy(tc.limits, logger)
232+
strategy, err := NewChunkSizeStrategy(tc.limits, NewChunkSizeStrategyMetrics(prometheus.NewPedanticRegistry()), logger)
232233
require.NoError(t, err)
233234

234235
actual, err := strategy.Plan(context.Background(), plannertest.TestTable, "fake", tc.tsdbs, tc.originalMetas)

pkg/bloombuild/planner/strategies/factory.go

+31-6
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,7 @@ import (
55
"fmt"
66

77
"github.com/go-kit/log"
8+
"github.com/prometheus/client_golang/prometheus"
89

910
"github.com/grafana/loki/v3/pkg/bloombuild/common"
1011
"github.com/grafana/loki/v3/pkg/bloombuild/protos"
@@ -32,18 +33,42 @@ type PlanningStrategy interface {
3233
Plan(ctx context.Context, table config.DayTable, tenant string, tsdbs TSDBSet, metas []bloomshipper.Meta) ([]*protos.Task, error)
3334
}
3435

35-
func NewStrategy(
36-
tenantID string,
36+
type Metrics struct {
37+
*ChunkSizeStrategyMetrics
38+
}
39+
40+
func NewMetrics(reg prometheus.Registerer) *Metrics {
41+
return &Metrics{
42+
ChunkSizeStrategyMetrics: NewChunkSizeStrategyMetrics(reg),
43+
}
44+
}
45+
46+
type Factory struct {
47+
limits Limits
48+
logger log.Logger
49+
metrics *Metrics
50+
}
51+
52+
func NewFactory(
3753
limits Limits,
54+
metrics *Metrics,
3855
logger log.Logger,
39-
) (PlanningStrategy, error) {
40-
strategy := limits.BloomPlanningStrategy(tenantID)
56+
) *Factory {
57+
return &Factory{
58+
limits: limits,
59+
logger: logger,
60+
metrics: metrics,
61+
}
62+
}
63+
64+
func (f *Factory) GetStrategy(tenantID string) (PlanningStrategy, error) {
65+
strategy := f.limits.BloomPlanningStrategy(tenantID)
4166

4267
switch strategy {
4368
case SplitKeyspaceStrategyName:
44-
return NewSplitKeyspaceStrategy(limits, logger)
69+
return NewSplitKeyspaceStrategy(f.limits, f.logger)
4570
case SplitBySeriesChunkSizeStrategyName:
46-
return NewChunkSizeStrategy(limits, logger)
71+
return NewChunkSizeStrategy(f.limits, f.metrics.ChunkSizeStrategyMetrics, f.logger)
4772
default:
4873
return nil, fmt.Errorf("unknown bloom planning strategy (%s)", strategy)
4974
}

0 commit comments

Comments
 (0)