From e4e255200bd7a27715c7176e83e05bc788e8c9f0 Mon Sep 17 00:00:00 2001 From: Haibin Xie Date: Wed, 11 Sep 2019 17:02:45 +0800 Subject: [PATCH] stats: do not split excluded lower value ranges (#12009) --- statistics/feedback.go | 14 ++- statistics/feedback_test.go | 15 ++-- statistics/handle/update.go | 6 +- statistics/handle/update_test.go | 144 +++++++++++++++---------------- statistics/histogram.go | 57 ++++++------ statistics/histogram_test.go | 12 ++- 6 files changed, 128 insertions(+), 120 deletions(-) diff --git a/statistics/feedback.go b/statistics/feedback.go index 992bb3922325a..89e5b3c540924 100644 --- a/statistics/feedback.go +++ b/statistics/feedback.go @@ -314,15 +314,21 @@ func buildBucketFeedback(h *Histogram, feedback *QueryFeedback) (map[int]*Bucket if skip { continue } - idx, _ := h.Bounds.LowerBound(0, fb.Lower) + idx := h.Bounds.UpperBound(0, fb.Lower) bktIdx := 0 // The last bucket also stores the feedback that falls outside the upper bound. - if idx >= h.Bounds.NumRows()-2 { + if idx >= h.Bounds.NumRows()-1 { bktIdx = h.Len() - 1 + } else if h.Len() == 1 { + bktIdx = 0 } else { - bktIdx = idx / 2 + if idx == 0 { + bktIdx = 0 + } else { + bktIdx = (idx - 1) / 2 + } // Make sure that this feedback lies within the bucket. - if chunk.Compare(h.Bounds.GetRow(2*bktIdx+1), 0, fb.Upper) < 0 { + if chunk.Compare(h.Bounds.GetRow(2*(bktIdx+1)), 0, fb.Upper) < 0 { continue } } diff --git a/statistics/feedback_test.go b/statistics/feedback_test.go index 4bec1e93ce2ad..c2e5664b01f58 100644 --- a/statistics/feedback_test.go +++ b/statistics/feedback_test.go @@ -70,14 +70,13 @@ func (s *testFeedbackSuite) TestUpdateHistogram(c *C) { defaultBucketCount = 7 defer func() { defaultBucketCount = originBucketCount }() c.Assert(UpdateHistogram(q.Hist, q).ToString(0), Equals, - "column:0 ndv:10058 totColSize:0\n"+ - "num: 10000 lower_bound: 0 upper_bound: 1 repeats: 0\n"+ - "num: 9 lower_bound: 2 upper_bound: 7 repeats: 0\n"+ - "num: 11 lower_bound: 8 upper_bound: 19 repeats: 0\n"+ - "num: 0 lower_bound: 20 upper_bound: 20 repeats: 0\n"+ - "num: 18 lower_bound: 21 upper_bound: 39 repeats: 0\n"+ - "num: 18 lower_bound: 40 upper_bound: 58 repeats: 0\n"+ - "num: 2 lower_bound: 59 upper_bound: 60 repeats: 0") + "column:0 ndv:10053 totColSize:0\n"+ + "num: 10001 lower_bound: 0 upper_bound: 2 repeats: 0\n"+ + "num: 7 lower_bound: 2 upper_bound: 5 repeats: 0\n"+ + "num: 4 lower_bound: 5 upper_bound: 7 repeats: 0\n"+ + "num: 11 lower_bound: 10 upper_bound: 20 repeats: 0\n"+ + "num: 19 lower_bound: 30 upper_bound: 49 repeats: 0\n"+ + "num: 11 lower_bound: 50 upper_bound: 60 repeats: 0") } func (s *testFeedbackSuite) TestSplitBuckets(c *C) { diff --git a/statistics/handle/update.go b/statistics/handle/update.go index 28a284b0922fe..38a82f12c5292 100644 --- a/statistics/handle/update.go +++ b/statistics/handle/update.go @@ -778,11 +778,11 @@ func formatBuckets(hg *statistics.Histogram, lowBkt, highBkt, idxCols int) strin return hg.BucketToString(lowBkt, idxCols) } if lowBkt+1 == highBkt { - return fmt.Sprintf("%s, %s", hg.BucketToString(lowBkt, 0), hg.BucketToString(highBkt, 0)) + return fmt.Sprintf("%s, %s", hg.BucketToString(lowBkt, idxCols), hg.BucketToString(highBkt, idxCols)) } // do not care the middle buckets - return fmt.Sprintf("%s, (%d buckets, total count %d), %s", hg.BucketToString(lowBkt, 0), - highBkt-lowBkt-1, hg.Buckets[highBkt-1].Count-hg.Buckets[lowBkt].Count, hg.BucketToString(highBkt, 0)) + return fmt.Sprintf("%s, (%d buckets, total count %d), %s", hg.BucketToString(lowBkt, idxCols), + highBkt-lowBkt-1, hg.Buckets[highBkt-1].Count-hg.Buckets[lowBkt].Count, hg.BucketToString(highBkt, idxCols)) } func colRangeToStr(c *statistics.Column, ran *ranger.Range, actual int64, factor float64) string { diff --git a/statistics/handle/update_test.go b/statistics/handle/update_test.go index fc91d29a84cee..0843cb5efc872 100644 --- a/statistics/handle/update_test.go +++ b/statistics/handle/update_test.go @@ -684,18 +684,18 @@ func (s *testStatsSuite) TestSplitRange(c *C) { { points: []int64{0, 1, 3, 8, 8, 20}, exclude: []bool{true, false, true, false, true, false}, - result: "(0,1],(3,5],(5,7],(7,8],(8,20]", + result: "(0,1],(3,7),[7,8),[8,8],(8,10),[10,20]", }, { points: []int64{8, 10, 20, 30}, exclude: []bool{false, false, true, true}, - result: "[8,8],(8,10],(20,30)", + result: "[8,10),[10,10],(20,30)", }, { // test remove invalid range points: []int64{8, 9}, exclude: []bool{false, true}, - result: "[8,8]", + result: "[8,9)", }, } for _, t := range tests { @@ -743,25 +743,25 @@ func (s *testStatsSuite) TestQueryFeedback(c *C) { // test primary key feedback sql: "select * from t where t.a <= 5", hist: "column:1 ndv:4 totColSize:0\n" + - "num: 1 lower_bound: -9223372036854775808 upper_bound: 1 repeats: 0\n" + - "num: 1 lower_bound: 2 upper_bound: 2 repeats: 1\n" + - "num: 2 lower_bound: 3 upper_bound: 5 repeats: 0", + "num: 1 lower_bound: -9223372036854775808 upper_bound: 2 repeats: 0\n" + + "num: 2 lower_bound: 2 upper_bound: 4 repeats: 0\n" + + "num: 1 lower_bound: 4 upper_bound: 4 repeats: 1", idxCols: 0, }, { // test index feedback by double read sql: "select * from t use index(idx) where t.b <= 5", hist: "index:1 ndv:2\n" + - "num: 2 lower_bound: -inf upper_bound: 2 repeats: 0\n" + - "num: 2 lower_bound: 3 upper_bound: 6 repeats: 0", + "num: 3 lower_bound: -inf upper_bound: 5 repeats: 0\n" + + "num: 1 lower_bound: 5 upper_bound: 5 repeats: 1", idxCols: 1, }, { // test index feedback by single read sql: "select b from t use index(idx) where t.b <= 5", hist: "index:1 ndv:2\n" + - "num: 2 lower_bound: -inf upper_bound: 2 repeats: 0\n" + - "num: 2 lower_bound: 3 upper_bound: 6 repeats: 0", + "num: 3 lower_bound: -inf upper_bound: 5 repeats: 0\n" + + "num: 1 lower_bound: 5 upper_bound: 5 repeats: 1", idxCols: 1, }, } @@ -855,7 +855,7 @@ func (s *testStatsSuite) TestQueryFeedbackForPartition(c *C) { // test primary key feedback sql: "select * from t where t.a <= 5", hist: "column:1 ndv:2 totColSize:0\n" + - "num: 1 lower_bound: -9223372036854775808 upper_bound: 1 repeats: 0\n" + + "num: 1 lower_bound: -9223372036854775808 upper_bound: 2 repeats: 0\n" + "num: 1 lower_bound: 2 upper_bound: 5 repeats: 0", idxCols: 0, }, @@ -987,8 +987,8 @@ func (s *testStatsSuite) TestUpdateStatsByLocalFeedback(c *C) { c.Assert(tbl.Columns[tblInfo.Columns[0].ID].ToString(0), Equals, "column:1 ndv:3 totColSize:0\n"+ "num: 1 lower_bound: 1 upper_bound: 1 repeats: 1\n"+ - "num: 1 lower_bound: 2 upper_bound: 2 repeats: 1\n"+ - "num: 2 lower_bound: 3 upper_bound: 9223372036854775807 repeats: 0") + "num: 2 lower_bound: 2 upper_bound: 4 repeats: 0\n"+ + "num: 1 lower_bound: 4 upper_bound: 9223372036854775807 repeats: 0") sc := &stmtctx.StatementContext{TimeZone: time.Local} low, err := codec.EncodeKey(sc, nil, types.NewIntDatum(5)) c.Assert(err, IsNil) @@ -996,8 +996,8 @@ func (s *testStatsSuite) TestUpdateStatsByLocalFeedback(c *C) { c.Assert(tbl.Indices[tblInfo.Indices[0].ID].CMSketch.QueryBytes(low), Equals, uint64(2)) c.Assert(tbl.Indices[tblInfo.Indices[0].ID].ToString(1), Equals, "index:1 ndv:2\n"+ - "num: 2 lower_bound: -inf upper_bound: 2 repeats: 0\n"+ - "num: 2 lower_bound: 3 upper_bound: 6 repeats: 0") + "num: 2 lower_bound: -inf upper_bound: 5 repeats: 0\n"+ + "num: 1 lower_bound: 5 upper_bound: 5 repeats: 1") // Test that it won't cause panic after update. testKit.MustQuery("select * from t use index(idx) where b > 0") @@ -1038,8 +1038,8 @@ func (s *testStatsSuite) TestUpdatePartitionStatsByLocalFeedback(c *C) { c.Assert(tbl.Columns[tblInfo.Columns[0].ID].ToString(0), Equals, "column:1 ndv:3 totColSize:0\n"+ "num: 1 lower_bound: 1 upper_bound: 1 repeats: 1\n"+ - "num: 1 lower_bound: 2 upper_bound: 2 repeats: 1\n"+ - "num: 2 lower_bound: 3 upper_bound: 9223372036854775807 repeats: 0") + "num: 2 lower_bound: 2 upper_bound: 4 repeats: 0\n"+ + "num: 1 lower_bound: 4 upper_bound: 9223372036854775807 repeats: 0") } type logHook struct { @@ -1112,13 +1112,13 @@ func (s *testStatsSuite) TestLogDetailedInfo(c *C) { }{ { sql: "select * from t where t.a <= 15", - result: "[stats-feedback] test.t, column=a, rangeStr=range: [-inf,7), actual: 8, expected: 7, buckets: {num: 8 lower_bound: 0 upper_bound: 7 repeats: 1}" + + result: "[stats-feedback] test.t, column=a, rangeStr=range: [-inf,8), actual: 8, expected: 8, buckets: {num: 8 lower_bound: 0 upper_bound: 7 repeats: 1, num: 8 lower_bound: 8 upper_bound: 15 repeats: 1}" + "[stats-feedback] test.t, column=a, rangeStr=range: [8,15), actual: 8, expected: 7, buckets: {num: 8 lower_bound: 8 upper_bound: 15 repeats: 1}", }, { sql: "select * from t use index(idx) where t.b <= 15", - result: "[stats-feedback] test.t, index=idx, rangeStr=range: [-inf,7), actual: 8, expected: 7, histogram: {num: 8 lower_bound: 0 upper_bound: 7 repeats: 1}" + - "[stats-feedback] test.t, index=idx, rangeStr=range: [8,15), actual: 8, expected: 7, histogram: {num: 8 lower_bound: 8 upper_bound: 15 repeats: 1}", + result: "[stats-feedback] test.t, index=idx, rangeStr=range: [-inf,8), actual: 8, expected: 8, histogram: {num: 8 lower_bound: 0 upper_bound: 7 repeats: 1, num: 8 lower_bound: 8 upper_bound: 15 repeats: 1}" + + "[stats-feedback] test.t, index=idx, rangeStr=range: [8,16), actual: 8, expected: 8, histogram: {num: 8 lower_bound: 8 upper_bound: 15 repeats: 1, num: 4 lower_bound: 16 upper_bound: 19 repeats: 1}", }, { sql: "select b from t use index(idx_ba) where b = 1 and a <= 5", @@ -1297,8 +1297,8 @@ func (s *testStatsSuite) TestIndexQueryFeedback(c *C) { { sql: "select * from t use index(idx_ab) where a = 1 and b < 21", hist: "index:1 ndv:20\n" + - "num: 16 lower_bound: -inf upper_bound: 7 repeats: 0\n" + - "num: 16 lower_bound: 8 upper_bound: 15 repeats: 0\n" + + "num: 15 lower_bound: -inf upper_bound: 8 repeats: 0\n" + + "num: 15 lower_bound: 8 upper_bound: 16 repeats: 0\n" + "num: 8 lower_bound: 16 upper_bound: 21 repeats: 0", rangeID: tblInfo.Indices[0].ID, idxID: tblInfo.Indices[1].ID, @@ -1308,9 +1308,9 @@ func (s *testStatsSuite) TestIndexQueryFeedback(c *C) { { sql: "select * from t use index(idx_ac) where a = 1 and c < 21", hist: "column:3 ndv:20 totColSize:20\n" + - "num: 13 lower_bound: -9223372036854775808 upper_bound: 6 repeats: 0\n" + - "num: 13 lower_bound: 7 upper_bound: 13 repeats: 0\n" + - "num: 12 lower_bound: 14 upper_bound: 21 repeats: 0", + "num: 15 lower_bound: -9223372036854775808 upper_bound: 7 repeats: 0\n" + + "num: 14 lower_bound: 7 upper_bound: 14 repeats: 0\n" + + "num: 13 lower_bound: 14 upper_bound: 21 repeats: 0", rangeID: tblInfo.Columns[2].ID, idxID: tblInfo.Indices[2].ID, idxCols: 0, @@ -1319,57 +1319,57 @@ func (s *testStatsSuite) TestIndexQueryFeedback(c *C) { { sql: "select * from t use index(idx_ad) where a = 1 and d < 21", hist: "column:4 ndv:20 totColSize:160\n" + - "num: 13 lower_bound: -10000000000000 upper_bound: 6 repeats: 0\n" + - "num: 12 lower_bound: 7 upper_bound: 13 repeats: 0\n" + - "num: 10 lower_bound: 14 upper_bound: 21 repeats: 0", + "num: 15 lower_bound: -10000000000000 upper_bound: 7 repeats: 0\n" + + "num: 14 lower_bound: 7 upper_bound: 14 repeats: 0\n" + + "num: 13 lower_bound: 14 upper_bound: 21 repeats: 0", rangeID: tblInfo.Columns[3].ID, idxID: tblInfo.Indices[3].ID, idxCols: 0, - eqCount: 32, + eqCount: 35, }, { sql: "select * from t use index(idx_ae) where a = 1 and e < 21", hist: "column:5 ndv:20 totColSize:160\n" + - "num: 13 lower_bound: -100000000000000000000000 upper_bound: 6 repeats: 0\n" + - "num: 12 lower_bound: 7 upper_bound: 13 repeats: 0\n" + - "num: 10 lower_bound: 14 upper_bound: 21 repeats: 0", + "num: 15 lower_bound: -100000000000000000000000 upper_bound: 7 repeats: 0\n" + + "num: 14 lower_bound: 7 upper_bound: 14 repeats: 0\n" + + "num: 13 lower_bound: 14 upper_bound: 21 repeats: 0", rangeID: tblInfo.Columns[4].ID, idxID: tblInfo.Indices[4].ID, idxCols: 0, - eqCount: 32, + eqCount: 35, }, { sql: "select * from t use index(idx_af) where a = 1 and f < 21", hist: "column:6 ndv:20 totColSize:200\n" + - "num: 13 lower_bound: -999999999999999.99 upper_bound: 6.00 repeats: 0\n" + - "num: 12 lower_bound: 7.00 upper_bound: 13.00 repeats: 0\n" + - "num: 10 lower_bound: 14.00 upper_bound: 21.00 repeats: 0", + "num: 15 lower_bound: -999999999999999.99 upper_bound: 7.00 repeats: 0\n" + + "num: 14 lower_bound: 7.00 upper_bound: 14.00 repeats: 0\n" + + "num: 13 lower_bound: 14.00 upper_bound: 21.00 repeats: 0", rangeID: tblInfo.Columns[5].ID, idxID: tblInfo.Indices[5].ID, idxCols: 0, - eqCount: 32, + eqCount: 35, }, { sql: "select * from t use index(idx_ag) where a = 1 and g < 21", hist: "column:7 ndv:20 totColSize:98\n" + - "num: 13 lower_bound: -838:59:59 upper_bound: 00:00:06 repeats: 0\n" + - "num: 12 lower_bound: 00:00:07 upper_bound: 00:00:13 repeats: 0\n" + - "num: 10 lower_bound: 00:00:14 upper_bound: 00:00:21 repeats: 0", + "num: 15 lower_bound: -838:59:59 upper_bound: 00:00:07 repeats: 0\n" + + "num: 14 lower_bound: 00:00:07 upper_bound: 00:00:14 repeats: 0\n" + + "num: 13 lower_bound: 00:00:14 upper_bound: 00:00:21 repeats: 0", rangeID: tblInfo.Columns[6].ID, idxID: tblInfo.Indices[6].ID, idxCols: 0, - eqCount: 32, + eqCount: 35, }, { sql: `select * from t use index(idx_ah) where a = 1 and h < "1000-01-21"`, hist: "column:8 ndv:20 totColSize:180\n" + - "num: 13 lower_bound: 1000-01-01 upper_bound: 1000-01-07 repeats: 0\n" + - "num: 12 lower_bound: 1000-01-08 upper_bound: 1000-01-14 repeats: 0\n" + - "num: 10 lower_bound: 1000-01-15 upper_bound: 1000-01-21 repeats: 0", + "num: 15 lower_bound: 1000-01-01 upper_bound: 1000-01-08 repeats: 0\n" + + "num: 14 lower_bound: 1000-01-08 upper_bound: 1000-01-15 repeats: 0\n" + + "num: 13 lower_bound: 1000-01-15 upper_bound: 1000-01-21 repeats: 0", rangeID: tblInfo.Columns[7].ID, idxID: tblInfo.Indices[7].ID, idxCols: 0, - eqCount: 32, + eqCount: 35, }, } for i, t := range tests { @@ -1465,9 +1465,9 @@ func (s *testStatsSuite) TestAbnormalIndexFeedback(c *C) { // The real count of `a = 1` is 0. sql: "select * from t where a = 1 and b < 21", hist: "column:2 ndv:20 totColSize:20\n" + - "num: 4 lower_bound: -9223372036854775808 upper_bound: 6 repeats: 0\n" + - "num: 3 lower_bound: 7 upper_bound: 13 repeats: 0\n" + - "num: 6 lower_bound: 14 upper_bound: 19 repeats: 1", + "num: 5 lower_bound: -9223372036854775808 upper_bound: 7 repeats: 0\n" + + "num: 4 lower_bound: 7 upper_bound: 14 repeats: 0\n" + + "num: 4 lower_bound: 14 upper_bound: 21 repeats: 0", rangeID: tblInfo.Columns[1].ID, idxID: tblInfo.Indices[0].ID, eqCount: 3, @@ -1476,9 +1476,9 @@ func (s *testStatsSuite) TestAbnormalIndexFeedback(c *C) { // The real count of `b > 10` is 0. sql: "select * from t where a = 2 and b > 10", hist: "column:2 ndv:20 totColSize:20\n" + - "num: 4 lower_bound: -9223372036854775808 upper_bound: 6 repeats: 0\n" + - "num: 2 lower_bound: 7 upper_bound: 13 repeats: 0\n" + - "num: 6 lower_bound: 14 upper_bound: 19 repeats: 1", + "num: 5 lower_bound: -9223372036854775808 upper_bound: 7 repeats: 0\n" + + "num: 6 lower_bound: 7 upper_bound: 14 repeats: 0\n" + + "num: 7 lower_bound: 14 upper_bound: 9223372036854775807 repeats: 0", rangeID: tblInfo.Columns[1].ID, idxID: tblInfo.Indices[0].ID, eqCount: 3, @@ -1528,27 +1528,27 @@ func (s *testStatsSuite) TestFeedbackRanges(c *C) { colID int64 }{ { - sql: "select * from t where a <= 50 or (a > 130 and a < 140)", - hist: "column:1 ndv:20 totColSize:0\n" + - "num: 8 lower_bound: 0 upper_bound: 7 repeats: 1\n" + - "num: 8 lower_bound: 8 upper_bound: 15 repeats: 1\n" + - "num: 4 lower_bound: 16 upper_bound: 19 repeats: 1", + sql: "select * from t use index() where a <= 50 or (a > 130 and a < 140)", + hist: "column:1 ndv:30 totColSize:0\n" + + "num: 8 lower_bound: -128 upper_bound: 8 repeats: 0\n" + + "num: 8 lower_bound: 8 upper_bound: 16 repeats: 0\n" + + "num: 14 lower_bound: 16 upper_bound: 50 repeats: 0", colID: 1, }, { - sql: "select * from t where a >= 10", - hist: "column:1 ndv:20 totColSize:0\n" + - "num: 8 lower_bound: 0 upper_bound: 7 repeats: 1\n" + - "num: 8 lower_bound: 8 upper_bound: 15 repeats: 1\n" + - "num: 4 lower_bound: 16 upper_bound: 19 repeats: 1", + sql: "select * from t use index() where a >= 10", + hist: "column:1 ndv:30 totColSize:0\n" + + "num: 8 lower_bound: -128 upper_bound: 8 repeats: 0\n" + + "num: 8 lower_bound: 8 upper_bound: 16 repeats: 0\n" + + "num: 14 lower_bound: 16 upper_bound: 127 repeats: 0", colID: 1, }, { sql: "select * from t use index(idx) where a = 1 and (b <= 50 or (b > 130 and b < 140))", hist: "column:2 ndv:20 totColSize:20\n" + - "num: 9 lower_bound: -128 upper_bound: 6 repeats: 0\n" + - "num: 7 lower_bound: 7 upper_bound: 13 repeats: 1\n" + - "num: 6 lower_bound: 14 upper_bound: 19 repeats: 1", + "num: 8 lower_bound: -128 upper_bound: 7 repeats: 0\n" + + "num: 8 lower_bound: 7 upper_bound: 14 repeats: 0\n" + + "num: 7 lower_bound: 14 upper_bound: 51 repeats: 0", colID: 2, }, } @@ -1603,32 +1603,32 @@ func (s *testStatsSuite) TestUnsignedFeedbackRanges(c *C) { { sql: "select * from t where a <= 50", hist: "column:1 ndv:30 totColSize:0\n" + - "num: 8 lower_bound: 0 upper_bound: 7 repeats: 0\n" + - "num: 8 lower_bound: 8 upper_bound: 15 repeats: 0\n" + + "num: 8 lower_bound: 0 upper_bound: 8 repeats: 0\n" + + "num: 8 lower_bound: 8 upper_bound: 16 repeats: 0\n" + "num: 14 lower_bound: 16 upper_bound: 50 repeats: 0", tblName: "t", }, { sql: "select count(*) from t", hist: "column:1 ndv:30 totColSize:0\n" + - "num: 8 lower_bound: 0 upper_bound: 7 repeats: 0\n" + - "num: 8 lower_bound: 8 upper_bound: 15 repeats: 0\n" + + "num: 8 lower_bound: 0 upper_bound: 8 repeats: 0\n" + + "num: 8 lower_bound: 8 upper_bound: 16 repeats: 0\n" + "num: 14 lower_bound: 16 upper_bound: 255 repeats: 0", tblName: "t", }, { sql: "select * from t1 where a <= 50", hist: "column:1 ndv:30 totColSize:0\n" + - "num: 8 lower_bound: 0 upper_bound: 7 repeats: 0\n" + - "num: 8 lower_bound: 8 upper_bound: 15 repeats: 0\n" + + "num: 8 lower_bound: 0 upper_bound: 8 repeats: 0\n" + + "num: 8 lower_bound: 8 upper_bound: 16 repeats: 0\n" + "num: 14 lower_bound: 16 upper_bound: 50 repeats: 0", tblName: "t1", }, { sql: "select count(*) from t1", hist: "column:1 ndv:30 totColSize:0\n" + - "num: 8 lower_bound: 0 upper_bound: 7 repeats: 0\n" + - "num: 8 lower_bound: 8 upper_bound: 15 repeats: 0\n" + + "num: 8 lower_bound: 0 upper_bound: 8 repeats: 0\n" + + "num: 8 lower_bound: 8 upper_bound: 16 repeats: 0\n" + "num: 14 lower_bound: 16 upper_bound: 18446744073709551615 repeats: 0", tblName: "t1", }, diff --git a/statistics/histogram.go b/statistics/histogram.go index 659793dfaa3b4..f529c803d20e0 100644 --- a/statistics/histogram.go +++ b/statistics/histogram.go @@ -428,41 +428,43 @@ func (hg *Histogram) typeMatch(ranges []*ranger.Range) bool { return true } -// SplitRange splits the range according to the histogram upper bound. Note that we treat last bucket's upper bound -// as inf, so all the split Ranges will totally fall in one of the (-inf, u(0)], (u(0), u(1)],...(u(n-3), u(n-2)], -// (u(n-2), +inf), where n is the number of buckets, u(i) is the i-th bucket's upper bound. +// SplitRange splits the range according to the histogram lower bound. Note that we treat first bucket's lower bound +// as -inf and last bucket's upper bound as +inf, so all the split ranges will totally fall in one of the (-inf, l(1)), +// [l(1), l(2)),...[l(n-2), l(n-1)), [l(n-1), +inf), where n is the number of buckets, l(i) is the i-th bucket's lower bound. func (hg *Histogram) SplitRange(sc *stmtctx.StatementContext, oldRanges []*ranger.Range, encoded bool) ([]*ranger.Range, bool) { if !hg.typeMatch(oldRanges) { return oldRanges, false } + // Treat the only buckets as (-inf, +inf), so we do not need split it. + if hg.Len() == 1 { + return oldRanges, true + } ranges := make([]*ranger.Range, 0, len(oldRanges)) for _, ran := range oldRanges { ranges = append(ranges, ran.Clone()) } split := make([]*ranger.Range, 0, len(ranges)) for len(ranges) > 0 { - // Find the last bound that greater or equal to the LowVal. + // Find the first bound that greater than the LowVal. idx := hg.Bounds.UpperBound(0, &ranges[0].LowVal[0]) - if !ranges[0].LowExclude && idx > 0 { - cmp := chunk.Compare(hg.Bounds.GetRow(idx-1), 0, &ranges[0].LowVal[0]) - if cmp == 0 { - idx-- - } - } - // Treat last bucket's upper bound as inf, so we do not need split any more. - if idx >= hg.Bounds.NumRows()-2 { + // Treat last bucket's upper bound as +inf, so we do not need split any more. + if idx >= hg.Bounds.NumRows()-1 { split = append(split, ranges...) break } - // Get the corresponding upper bound. - if idx%2 == 0 { + // Treat first buckets's lower bound as -inf, just increase it to the next lower bound. + if idx == 0 { + idx = 2 + } + // Get the next lower bound. + if idx%2 == 1 { idx++ } - upperBound := hg.Bounds.GetRow(idx) + lowerBound := hg.Bounds.GetRow(idx) var i int - // Find the first range that need to be split by the upper bound. + // Find the first range that need to be split by the lower bound. for ; i < len(ranges); i++ { - if chunk.Compare(upperBound, 0, &ranges[i].HighVal[0]) < 0 { + if chunk.Compare(lowerBound, 0, &ranges[i].HighVal[0]) <= 0 { break } } @@ -471,17 +473,20 @@ func (hg *Histogram) SplitRange(sc *stmtctx.StatementContext, oldRanges []*range if len(ranges) == 0 { break } - // Split according to the upper bound. - cmp := chunk.Compare(upperBound, 0, &ranges[0].LowVal[0]) - if cmp > 0 || (cmp == 0 && !ranges[0].LowExclude) { - upper := upperBound.GetDatum(0, hg.Tp) - split = append(split, &ranger.Range{ + // Split according to the lower bound. + cmp := chunk.Compare(lowerBound, 0, &ranges[0].LowVal[0]) + if cmp > 0 { + lower := lowerBound.GetDatum(0, hg.Tp) + newRange := &ranger.Range{ LowExclude: ranges[0].LowExclude, LowVal: []types.Datum{ranges[0].LowVal[0]}, - HighVal: []types.Datum{upper}, - HighExclude: false}) - ranges[0].LowVal[0] = upper - ranges[0].LowExclude = true + HighVal: []types.Datum{lower}, + HighExclude: true} + if validRange(sc, newRange, encoded) { + split = append(split, newRange) + } + ranges[0].LowVal[0] = lower + ranges[0].LowExclude = false if !validRange(sc, ranges[0], encoded) { ranges = ranges[1:] } diff --git a/statistics/histogram_test.go b/statistics/histogram_test.go index 68b8c55f2ce33..9bbe9e35a0137 100644 --- a/statistics/histogram_test.go +++ b/statistics/histogram_test.go @@ -50,11 +50,9 @@ func (s *testStatisticsSuite) TestNewHistogramBySelectivity(c *C) { node.Ranges = append(node.Ranges, &ranger.Range{LowVal: types.MakeDatums(25), HighVal: []types.Datum{types.MaxValueDatum()}}) intColResult := `column:1 ndv:16 totColSize:0 num: 30 lower_bound: 0 upper_bound: 2 repeats: 10 -num: 10 lower_bound: 3 upper_bound: 5 repeats: 10 -num: 20 lower_bound: 6 upper_bound: 8 repeats: 10 -num: 20 lower_bound: 9 upper_bound: 11 repeats: 0 +num: 20 lower_bound: 6 upper_bound: 8 repeats: 0 +num: 30 lower_bound: 9 upper_bound: 11 repeats: 0 num: 10 lower_bound: 12 upper_bound: 14 repeats: 0 -num: 20 lower_bound: 24 upper_bound: 26 repeats: 10 num: 30 lower_bound: 27 upper_bound: 29 repeats: 0` stringCol := &Column{} @@ -85,9 +83,9 @@ num: 30 lower_bound: 27 upper_bound: 29 repeats: 0` node2.Ranges = append(node2.Ranges, &ranger.Range{LowVal: types.MakeDatums("ggg"), HighVal: []types.Datum{types.MaxValueDatum()}}) stringColResult := `column:2 ndv:9 totColSize:0 num: 60 lower_bound: a upper_bound: aaaabbbb repeats: 0 -num: 60 lower_bound: bbbb upper_bound: fdsfdsfds repeats: 20 -num: 60 lower_bound: kkkkk upper_bound: ooooo repeats: 20 -num: 60 lower_bound: oooooo upper_bound: sssss repeats: 20 +num: 52 lower_bound: bbbb upper_bound: fdsfdsfds repeats: 0 +num: 54 lower_bound: kkkkk upper_bound: ooooo repeats: 0 +num: 60 lower_bound: oooooo upper_bound: sssss repeats: 0 num: 60 lower_bound: ssssssu upper_bound: yyyyy repeats: 0` newColl := coll.NewHistCollBySelectivity(sc, []*StatsNode{node, node2})