From 858561434274dc8bda02fcfeb26a16456adf27b2 Mon Sep 17 00:00:00 2001 From: Haibin Xie Date: Wed, 22 May 2019 16:04:42 +0800 Subject: [PATCH] stats: merge non-overlapped feedback when update bucket count (#10476) --- statistics/feedback.go | 65 +++++++++++++++++++++++++++++++++++-- statistics/feedback_test.go | 19 +++++++++-- 2 files changed, 80 insertions(+), 4 deletions(-) diff --git a/statistics/feedback.go b/statistics/feedback.go index 622c0ce031d95..3423eced30d84 100644 --- a/statistics/feedback.go +++ b/statistics/feedback.go @@ -401,12 +401,13 @@ func (b *BucketFeedback) splitBucket(newNumBkts int, totalCount float64, originB // Split the bucket. bounds := b.getBoundaries(newNumBkts + 1) bkts := make([]bucket, 0, len(bounds)-1) + sc := &stmtctx.StatementContext{TimeZone: time.UTC} for i := 1; i < len(bounds); i++ { newBkt := bucket{&bounds[i-1], bounds[i].Copy(), 0, 0} // get bucket count _, ratio := getOverlapFraction(feedback{b.lower, b.upper, int64(originBucketCount), 0}, newBkt) countInNewBkt := originBucketCount * ratio - countInNewBkt = b.refineBucketCount(newBkt, countInNewBkt) + countInNewBkt = b.refineBucketCount(sc, newBkt, countInNewBkt) // do not split if the count of result bucket is too small. if countInNewBkt < minBucketFraction*totalCount { bounds[i] = bounds[i-1] @@ -448,11 +449,71 @@ func getOverlapFraction(fb feedback, bkt bucket) (float64, float64) { return overlap, ratio } +// mergeFullyContainedFeedback merges the max fraction of non-overlapped feedbacks that are fully contained in the bucket. +func (b *BucketFeedback) mergeFullyContainedFeedback(sc *stmtctx.StatementContext, bkt bucket) (float64, float64, bool) { + var feedbacks []feedback + // Get all the fully contained feedbacks. + for _, fb := range b.feedback { + res, err := outOfRange(sc, bkt.lower, bkt.upper, fb.lower) + if res != 0 || err != nil { + return 0, 0, false + } + res, err = outOfRange(sc, bkt.lower, bkt.upper, fb.upper) + if res != 0 || err != nil { + return 0, 0, false + } + feedbacks = append(feedbacks, fb) + } + if len(feedbacks) == 0 { + return 0, 0, false + } + // Sort feedbacks by end point and start point incrementally, then pick every feedback that is not overlapped + // with the previous chosen feedbacks. + var existsErr bool + sort.Slice(feedbacks, func(i, j int) bool { + res, err := feedbacks[i].upper.CompareDatum(sc, feedbacks[j].upper) + if err != nil { + existsErr = true + } + if existsErr || res != 0 { + return res < 0 + } + res, err = feedbacks[i].lower.CompareDatum(sc, feedbacks[j].lower) + if err != nil { + existsErr = true + } + return res < 0 + }) + if existsErr { + return 0, 0, false + } + previousEnd := &types.Datum{} + var sumFraction, sumCount float64 + for _, fb := range feedbacks { + res, err := previousEnd.CompareDatum(sc, fb.lower) + if err != nil { + return 0, 0, false + } + if res <= 0 { + fraction, _ := getOverlapFraction(fb, bkt) + sumFraction += fraction + sumCount += float64(fb.count) + previousEnd = fb.upper + } + } + return sumFraction, sumCount, true +} + // refineBucketCount refine the newly split bucket count. It uses the feedback that overlaps most // with the bucket to get the bucket count. -func (b *BucketFeedback) refineBucketCount(bkt bucket, defaultCount float64) float64 { +func (b *BucketFeedback) refineBucketCount(sc *stmtctx.StatementContext, bkt bucket, defaultCount float64) float64 { bestFraction := minBucketFraction count := defaultCount + sumFraction, sumCount, ok := b.mergeFullyContainedFeedback(sc, bkt) + if ok && sumFraction > bestFraction { + bestFraction = sumFraction + count = sumCount / sumFraction + } for _, fb := range b.feedback { fraction, ratio := getOverlapFraction(fb, bkt) // choose the max overlap fraction diff --git a/statistics/feedback_test.go b/statistics/feedback_test.go index da19fc233342d..6f0cb05960f3f 100644 --- a/statistics/feedback_test.go +++ b/statistics/feedback_test.go @@ -70,9 +70,9 @@ func (s *testFeedbackSuite) TestUpdateHistogram(c *C) { defaultBucketCount = 7 defer func() { defaultBucketCount = originBucketCount }() c.Assert(UpdateHistogram(q.Hist(), q).ToString(0), Equals, - "column:0 ndv:10057 totColSize:0\n"+ + "column:0 ndv:10058 totColSize:0\n"+ "num: 10000 lower_bound: 0 upper_bound: 1 repeats: 0\n"+ - "num: 8 lower_bound: 2 upper_bound: 7 repeats: 0\n"+ + "num: 9 lower_bound: 2 upper_bound: 7 repeats: 0\n"+ "num: 11 lower_bound: 8 upper_bound: 19 repeats: 0\n"+ "num: 0 lower_bound: 20 upper_bound: 20 repeats: 0\n"+ "num: 18 lower_bound: 21 upper_bound: 39 repeats: 0\n"+ @@ -152,6 +152,21 @@ func (s *testFeedbackSuite) TestSplitBuckets(c *C) { "num: 0 lower_bound: 11 upper_bound: 1000000 repeats: 0") c.Assert(isNewBuckets, DeepEquals, []bool{true, true}) c.Assert(totalCount, Equals, int64(1)) + + // test merge the non-overlapped feedbacks. + h = NewHistogram(0, 0, 0, 0, types.NewFieldType(mysql.TypeLong), 5, 0) + appendBucket(h, 0, 10000) + feedbacks = feedbacks[:0] + feedbacks = append(feedbacks, newFeedback(0, 4000, 4000)) + feedbacks = append(feedbacks, newFeedback(4001, 9999, 1000)) + q = NewQueryFeedback(0, h, 0, false) + q.feedback = feedbacks + buckets, isNewBuckets, totalCount = splitBuckets(q.Hist(), q) + c.Assert(buildNewHistogram(q.Hist(), buckets).ToString(0), Equals, + "column:0 ndv:0 totColSize:0\n"+ + "num: 5001 lower_bound: 0 upper_bound: 10000 repeats: 0") + c.Assert(isNewBuckets, DeepEquals, []bool{false}) + c.Assert(totalCount, Equals, int64(5001)) } func (s *testFeedbackSuite) TestMergeBuckets(c *C) {