Skip to content

Commit

Permalink
statistics: add more comments for the histogram bucket (#54519)
Browse files Browse the repository at this point in the history
  • Loading branch information
Rustin170506 authored Jul 9, 2024
1 parent e209c7d commit 2c72c39
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 7 deletions.
34 changes: 29 additions & 5 deletions pkg/statistics/builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -142,23 +142,36 @@ func BuildColumnHist(ctx sessionctx.Context, numBuckets, id int64, collector *Sa
}

// buildHist builds histogram from samples and other information.
// It stores the built histogram in hg and return corrXYSum used for calculating the correlation.
func buildHist(sc *stmtctx.StatementContext, hg *Histogram, samples []*SampleItem, count, ndv, numBuckets int64, memTracker *memory.Tracker) (corrXYSum float64, err error) {
// It stores the built histogram in hg and returns corrXYSum used for calculating the correlation.
func buildHist(
sc *stmtctx.StatementContext,
hg *Histogram,
samples []*SampleItem,
count, ndv, numBuckets int64,
memTracker *memory.Tracker,
) (corrXYSum float64, err error) {
sampleNum := int64(len(samples))
// As we use samples to build the histogram, the bucket count and repeat should be multiplied by a factor.
sampleFactor := float64(count) / float64(sampleNum)
// ndvFactor is a ratio that represents the average number of times each distinct value (NDV) should appear in the dataset.
// It is calculated as the total number of rows divided by the number of distinct values.
ndvFactor := float64(count) / float64(ndv)
if ndvFactor > sampleFactor {
ndvFactor = sampleFactor
}
// Since bucket count is increased by sampleFactor, so the actual max values per bucket is
// Since bucket count is increased by sampleFactor, the actual max values per bucket are
// floor(valuesPerBucket/sampleFactor)*sampleFactor, which may be less than valuesPerBucket,
// thus we need to add a sampleFactor to avoid building too many buckets.
valuesPerBucket := float64(count)/float64(numBuckets) + sampleFactor

bucketIdx := 0
var lastCount int64
corrXYSum = float64(0)
// The underlying idea is that when a value is sampled,
// it does not necessarily mean that the actual row count of this value reaches the sample factor.
// In extreme cases, it could be that this value only appears once, and that one row happens to be sampled.
// Therefore, if this value appears only once in the sample, we use a more conservative ndvFactor.
// However, if the calculated ndvFactor is larger than the sampleFactor, we still use the sampleFactor.
hg.AppendBucket(&samples[0].Value, &samples[0].Value, int64(sampleFactor), int64(ndvFactor))
bufferedMemSize := int64(0)
bufferedReleaseSize := int64(0)
Expand All @@ -168,7 +181,9 @@ func buildHist(sc *stmtctx.StatementContext, hg *Histogram, samples []*SampleIte
memTracker.Release(bufferedReleaseSize)
}
}()

var upper = new(types.Datum)
// Note: Start from 1 because we have already processed the first sample.
for i := int64(1); i < sampleNum; i++ {
corrXYSum += float64(i) * float64(samples[i].Ordinal)
hg.UpperToDatum(bucketIdx, upper)
Expand All @@ -184,22 +199,31 @@ func buildHist(sc *stmtctx.StatementContext, hg *Histogram, samples []*SampleIte
}
totalCount := float64(i+1) * sampleFactor
if cmp == 0 {
// The new item has the same value as current bucket value, to ensure that
// The new item has the same value as the current bucket value. To ensure that
// the same value is only stored in a single bucket, we do not increase bucketIdx even if it exceeds
// valuesPerBucket.
hg.Buckets[bucketIdx].Count = int64(totalCount)
// This means the value appears more than once in the sample, so we need to update the repeat count.
// Because we initialize the repeat count as ndvFactor, we need to directly reset it to 2*sampleFactor.
// Refer to the comments for the first bucket for the reason why we use ndvFactor here.
if hg.Buckets[bucketIdx].Repeat == int64(ndvFactor) {
// This is a special case, the value appears twice in the sample.
// repeat = 2 * sampleFactor
hg.Buckets[bucketIdx].Repeat = int64(2 * sampleFactor)
} else {
// repeat = 3 * sampleFactor
// repeat = 4 * sampleFactor
// ...
hg.Buckets[bucketIdx].Repeat += int64(sampleFactor)
}
} else if totalCount-float64(lastCount) <= valuesPerBucket {
// The bucket still have room to store a new item, update the bucket.
// The bucket still has room to store a new item, update the bucket.
hg.updateLastBucket(&samples[i].Value, int64(totalCount), int64(ndvFactor), false)
} else {
lastCount = hg.Buckets[bucketIdx].Count
// The bucket is full, store the item in the next bucket.
bucketIdx++
// Refer to the comments for the first bucket for the reason why we use ndvFactor here.
hg.AppendBucket(&samples[i].Value, &samples[i].Value, int64(totalCount), int64(ndvFactor))
}
}
Expand Down
9 changes: 7 additions & 2 deletions pkg/statistics/histogram.go
Original file line number Diff line number Diff line change
Expand Up @@ -87,9 +87,14 @@ const EmptyHistogramSize = int64(unsafe.Sizeof(Histogram{}))

// Bucket stores the bucket count and repeat.
type Bucket struct {
Count int64
// Count is the cumulative number of items up to and including this bucket.
Count int64
// Repeat is the number of times the upper-bound value of the bucket appears in the data.
// For example, in the range [x, y], Repeat indicates how many times y appears.
// It is used to estimate the row count of values equal to the upper bound of the bucket, similar to TopN.
Repeat int64
NDV int64
// NDV is the number of distinct values in the bucket.
NDV int64
}

// EmptyBucketSize is the size of empty bucket, 3*8=24 now.
Expand Down

0 comments on commit 2c72c39

Please sign in to comment.