// Copyright (c) The Thanos Authors.
// Licensed under the Apache License 2.0.

package store

import (
	"context"
	"math"
	"strings"

	"github.com/go-kit/log/level"
	"github.com/pkg/errors"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/prometheus/model/labels"
	"github.com/prometheus/prometheus/storage"
	"github.com/prometheus/prometheus/tsdb/index"
	"golang.org/x/exp/slices"

	"github.com/thanos-io/thanos/pkg/block/indexheader"
)
var emptyLazyPostings = &lazyExpandedPostings{postings: nil, matchers: nil}

// lazyExpandedPostings contains expanded postings (series IDs). If lazy posting expansion is
// enabled, it might contain matchers that can be lazily applied during series filtering time.
type lazyExpandedPostings struct {
	postings []storage.SeriesRef
	matchers []*labels.Matcher
}

func newLazyExpandedPostings(ps []storage.SeriesRef, matchers ...*labels.Matcher) *lazyExpandedPostings {
	return &lazyExpandedPostings{
		postings: ps,
		matchers: matchers,
	}
}

func (p *lazyExpandedPostings) lazyExpanded() bool {
	return p != nil && len(p.matchers) > 0
}

func optimizePostingsFetchByDownloadedBytes(r *bucketIndexReader, postingGroups []*postingGroup, seriesMaxSize int64, seriesMatchRatio float64, lazyExpandedPostingSizeBytes prometheus.Counter) ([]*postingGroup, bool, error) {
	if len(postingGroups) <= 1 {
		return postingGroups, false, nil
	}
	// Collect the posting cardinality of each posting group.
	for _, pg := range postingGroups {
		// A posting group can have either add keys or remove keys, but not both at the same time.
		vals := pg.addKeys
		if len(pg.removeKeys) > 0 {
			vals = pg.removeKeys
		}
		rngs, err := r.block.indexHeaderReader.PostingsOffsets(pg.name, vals...)
		if err != nil {
			return nil, false, errors.Wrapf(err, "postings offsets for %s", pg.name)
		}
		for _, rng := range rngs {
			if rng == indexheader.NotFoundRange {
				continue
			}
			if rng.End <= rng.Start {
				level.Error(r.logger).Log("msg", "invalid index range, fallback to non lazy posting optimization")
				return postingGroups, false, nil
			}
			// Each range starts with the #entries field, which is 4 bytes.
			// We need to subtract it when calculating the number of postings.
			// https://github.com/prometheus/prometheus/blob/v2.46.0/tsdb/docs/format/index.md
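			// For example, a posting list whose range occupies 44 bytes encodes
			// (44 - 4) / 4 = 10 postings, since each posting is 4 bytes.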
			pg.cardinality += (rng.End - rng.Start - 4) / 4
		}
		// If the posting group adds keys, 0 cardinality means the posting doesn't exist.
		// If the posting group removes keys, finding no posting ranges is fine since removal is a noop.
		if len(pg.addKeys) > 0 && pg.cardinality == 0 {
			return nil, true, nil
		}
	}
	slices.SortFunc(postingGroups, func(a, b *postingGroup) int {
		if a.cardinality == b.cardinality {
			return strings.Compare(a.name, b.name)
		}
		return int(a.cardinality - b.cardinality)
	})
	/*
		Algorithm for choosing which postings we fetch right now and which postings we expand lazily.
		Sort posting groups by cardinality so we can iterate from the posting group with the smallest posting size.
		The algorithm focuses on fetching less data, counting both postings and series.

		We need to fetch at least 1 posting group with add keys in order to fetch series. So if we only fetch the
		first posting group with add keys, the bytes to download follow formula F1: P1 * 4 + P1 * S, where P1 is
		the number of postings in group 1 and S is the size per series. 4 is the byte size per posting.

		If we fetch 2 posting groups, we can intersect the two postings to (hopefully) reduce the series to download.
		Assuming each intersection matches series at ratio R (0 < R < 1), the bytes to download follow
		formula F2: P1 * 4 + P2 * 4 + P1 * S * R.
		Fetching 3 posting groups gives formula F3: P1 * 4 + P2 * 4 + P3 * 4 + P1 * S * R^2.

		Comparing F2 with F1:
			P1 * 4 + P2 * 4 + P1 * S * R < P1 * 4 + P1 * S
			=> P2 * 4 < P1 * S * (1 - R)
		The left-hand side is the additional posting group's size and the right-hand side is the series bytes
		we avoid fetching thanks to the additional intersection. F2 fetches less data than F1 exactly when the
		additional postings are smaller than the avoided series bytes.

		Comparing F3 with F2:
			P1 * 4 + P2 * 4 + P3 * 4 + P1 * S * R^2 < P1 * 4 + P2 * 4 + P1 * S * R
			=> P3 * 4 < P1 * S * R * (1 - R)
		Same shape as the previous inequality. Comparing F4 (cost to fetch up to 4 posting groups) with F3 gives
			P4 * 4 < P1 * S * R^2 * (1 - R)
		which generalizes to: Pn * 4 < P1 * S * R^(n - 2) * (1 - R).

		The idea of the algorithm: iterating posting groups in ascending order of cardinality, fetch the current
		posting group only if doing so downloads less data in total than stopping at the previous group;
		otherwise stop. This is safe because once we stop at one posting group, every posting group after it
		would only fetch more data: in Pn * 4 < P1 * S * R^(n - 2) * (1 - R), the left-hand side keeps increasing
		as we iterate to larger posting groups, while the right-hand side keeps decreasing since R < 1.
	*/
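	// A worked numeric example of the F2-vs-F1 inequality above (illustrative values only):
	// with S = 4096 bytes and R = 0.5, take P1 = 1,000 and P2 = 10,000. Fetching group 2 costs
	// P2 * 4 = 40,000 extra posting bytes but saves P1 * S * (1 - R) = 2,048,000 series bytes,
	// so group 2 is fetched eagerly. If instead P2 = 1,000,000, the 4,000,000 posting bytes
	// exceed the savings and group 2 (and every larger group) would be marked lazy.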
	var (
		negativeCardinalities int64
		i                     int
	)
	for i = range postingGroups {
		if postingGroups[i].addAll {
			negativeCardinalities += postingGroups[i].cardinality
			continue
		}
		break
	}
	// If the first posting group with add keys is already the last posting group,
	// then there is no need to set up lazy expanded posting groups.
	if i >= len(postingGroups)-1 {
		return postingGroups, false, nil
	}
	// Assume only a seriesMatchRatio fraction of postings matches in every posting group.
	seriesMatched := postingGroups[i].cardinality - int64(math.Ceil(float64(negativeCardinalities)*seriesMatchRatio))
	i++ // Start from the next posting group as we always need to fetch at least one posting group with add keys.
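	// For instance (illustrative values): with a preceding addAll group of cardinality 2,000,
	// a first add-key group of cardinality 10,000, and seriesMatchRatio = 0.5, the estimate
	// starts at 10,000 - ceil(2,000 * 0.5) = 9,000 matched series.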
	for i < len(postingGroups) {
		pg := postingGroups[i]
		var (
			underfetchedSeries     int64
			underfetchedSeriesSize int64
		)
		// Unlikely, but keep it as a safeguard: when the estimated number of matching series
		// drops to <= 0, stop early and mark the rest of the posting groups as lazy.
		if seriesMatched <= 0 {
			break
		}
		if pg.addAll {
			// For a posting group that has negative matchers, assume we can underfetch
			// min(pg.cardinality, current_series_matched) * match ratio series.
			if pg.cardinality > seriesMatched {
				underfetchedSeries = int64(math.Ceil(float64(seriesMatched) * seriesMatchRatio))
			} else {
				underfetchedSeries = int64(math.Ceil(float64(pg.cardinality) * seriesMatchRatio))
			}
			seriesMatched -= underfetchedSeries
			underfetchedSeriesSize = underfetchedSeries * seriesMaxSize
		} else {
			underfetchedSeriesSize = seriesMaxSize * int64(math.Ceil(float64(seriesMatched)*(1-seriesMatchRatio)))
			seriesMatched = int64(math.Ceil(float64(seriesMatched) * seriesMatchRatio))
		}
		// If we would fetch more posting bytes than the series bytes we avoid fetching,
		// stop here and lazily expand the rest of the matchers.
		if pg.cardinality*4 > underfetchedSeriesSize {
			break
		}
		i++
	}
	for i < len(postingGroups) {
		postingGroups[i].lazy = true
		lazyExpandedPostingSizeBytes.Add(float64(4 * postingGroups[i].cardinality))
		i++
	}
	return postingGroups, false, nil
}

func fetchLazyExpandedPostings(
	ctx context.Context,
	postingGroups []*postingGroup,
	r *bucketIndexReader,
	bytesLimiter BytesLimiter,
	addAllPostings bool,
	lazyExpandedPostingEnabled bool,
	lazyExpandedPostingSizeBytes prometheus.Counter,
	tenant string,
) (*lazyExpandedPostings, error) {
	var (
		err               error
		emptyPostingGroup bool
	)
	/*
		There are several cases where we skip the postings fetch optimization:
		- Lazy expanded postings are disabled.
		- Add all postings. This means we don't have a posting group with any add keys.
		- The block's estimated max series size is not set, so we have no way to estimate series bytes downloaded.
		- Only one effective posting group is available. We need to download postings from at least 1 posting group, so there is nothing to optimize.
	*/
	if lazyExpandedPostingEnabled && !addAllPostings &&
		r.block.estimatedMaxSeriesSize > 0 && len(postingGroups) > 1 {
		postingGroups, emptyPostingGroup, err = optimizePostingsFetchByDownloadedBytes(
			r,
			postingGroups,
			int64(r.block.estimatedMaxSeriesSize),
			0.5, // TODO(yeya24): Expose this as a flag.
			lazyExpandedPostingSizeBytes,
		)
		if err != nil {
			return nil, err
		}
		if emptyPostingGroup {
			return emptyLazyPostings, nil
		}
	}
	ps, matchers, err := fetchAndExpandPostingGroups(ctx, r, postingGroups, bytesLimiter, tenant)
	if err != nil {
		return nil, err
	}
	if len(ps) == 0 {
		return emptyLazyPostings, nil
	}
	return &lazyExpandedPostings{postings: ps, matchers: matchers}, nil
}

// keysToFetchFromPostingGroups returns label pairs (postings) to fetch
// and matchers to use for lazy posting expansion.
// The input `postingGroups` must be ordered by cardinality when lazy
// expansion is enabled, so we can stop at the first lazy posting group.
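// For example (illustrative values): given non-lazy groups {name: "job", addKeys: ["a", "b"]}
// and {name: "env", removeKeys: ["prod"]} followed by a lazy group, the returned keys are
// [job="a", job="b", env="prod"] and only the lazy group's matchers are returned.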
func keysToFetchFromPostingGroups(postingGroups []*postingGroup) ([]labels.Label, []*labels.Matcher) {
	var lazyMatchers []*labels.Matcher
	keys := make([]labels.Label, 0)
	i := 0
	for i < len(postingGroups) {
		pg := postingGroups[i]
		if pg.lazy {
			break
		}
		// Postings returned by fetchPostings will be in the same order as keys,
		// so it's important that we iterate them in the same order later.
		// We don't have any other way of pairing keys and fetched postings.
		for _, key := range pg.addKeys {
			keys = append(keys, labels.Label{Name: pg.name, Value: key})
		}
		for _, key := range pg.removeKeys {
			keys = append(keys, labels.Label{Name: pg.name, Value: key})
		}
		i++
	}
	if i < len(postingGroups) {
		lazyMatchers = make([]*labels.Matcher, 0)
		for i < len(postingGroups) {
			lazyMatchers = append(lazyMatchers, postingGroups[i].matchers...)
			i++
		}
	}
	return keys, lazyMatchers
}

func fetchAndExpandPostingGroups(ctx context.Context, r *bucketIndexReader, postingGroups []*postingGroup, bytesLimiter BytesLimiter, tenant string) ([]storage.SeriesRef, []*labels.Matcher, error) {
	keys, lazyMatchers := keysToFetchFromPostingGroups(postingGroups)
	fetchedPostings, closeFns, err := r.fetchPostings(ctx, keys, bytesLimiter, tenant)
	defer func() {
		for _, closeFn := range closeFns {
			closeFn()
		}
	}()
	if err != nil {
		return nil, nil, errors.Wrap(err, "get postings")
	}

	// Get "add" and "remove" postings from the groups. We iterate over postingGroups and their keys
	// again, in exactly the same order as when the groups were built, so we can simply use one
	// incrementing index to fetch postings from the returned slice.
	postingIndex := 0
	var groupAdds, groupRemovals []index.Postings
	for _, g := range postingGroups {
		if g.lazy {
			break
		}
		// We cannot add an empty set to groupAdds, since the sets are intersected.
		if len(g.addKeys) > 0 {
			toMerge := make([]index.Postings, 0, len(g.addKeys))
			for _, l := range g.addKeys {
				toMerge = append(toMerge, checkNilPosting(g.name, l, fetchedPostings[postingIndex]))
				postingIndex++
			}
			groupAdds = append(groupAdds, index.Merge(ctx, toMerge...))
		}
		for _, l := range g.removeKeys {
			groupRemovals = append(groupRemovals, checkNilPosting(g.name, l, fetchedPostings[postingIndex]))
			postingIndex++
		}
	}
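	// The final result intersects all add groups and subtracts the merged removals.
	// For example (illustrative series refs): ({1,3,5} ∩ {3,5,7}) \ {5} = {3}.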
	result := index.Without(index.Intersect(groupAdds...), index.Merge(ctx, groupRemovals...))

	if err := ctx.Err(); err != nil {
		return nil, nil, err
	}
	ps, err := ExpandPostingsWithContext(ctx, result)
	if err != nil {
		return nil, nil, errors.Wrap(err, "expand")
	}
	return ps, lazyMatchers, nil
}