diff --git a/executor/analyze.go b/executor/analyze.go index d160dd13827ec..6d1cc07d827fe 100755 --- a/executor/analyze.go +++ b/executor/analyze.go @@ -335,6 +335,9 @@ func (e *AnalyzeIndexExec) buildStatsFromResult(result distsql.SelectResult, nee } } err := hist.ExtractTopN(cms, len(e.idxInfo.Columns), uint32(e.opts[ast.AnalyzeOptNumTopN])) + if needCMS && cms != nil { + cms.CalcDefaultValForAnalyze(uint64(hist.NDV)) + } return hist, cms, err } @@ -525,6 +528,7 @@ func (e *AnalyzeColumnsExec) buildStats(ranges []*ranger.Range) (hists []*statis return nil, nil, err } hists = append(hists, hg) + collectors[i].CMSketch.CalcDefaultValForAnalyze(uint64(hg.NDV)) cms = append(cms, collectors[i].CMSketch) } return hists, cms, nil @@ -1236,6 +1240,7 @@ func analyzeIndexIncremental(idxExec *analyzeIndexIncrementalExec) analyzeResult if err != nil { return analyzeResult{Err: err, job: idxExec.job} } + cms.CalcDefaultValForAnalyze(uint64(hist.NDV)) } result := analyzeResult{ PhysicalTableID: idxExec.physicalTableID, diff --git a/executor/analyze_test.go b/executor/analyze_test.go index 57400193a9367..04bcb17776325 100644 --- a/executor/analyze_test.go +++ b/executor/analyze_test.go @@ -592,3 +592,35 @@ func (s *testSuite1) TestHashInTopN(c *C) { } } } + +func (s *testSuite1) TestDefaultValForAnalyze(c *C) { + tk := testkit.NewTestKit(c, s.store) + tk.MustExec("drop database if exists test_default_val_for_analyze;") + tk.MustExec("create database test_default_val_for_analyze;") + tk.MustExec("use test_default_val_for_analyze") + + tk.MustExec("create table t (a int, key(a));") + for i := 0; i < 2048; i++ { + tk.MustExec("insert into t values (0)") + } + for i := 1; i < 4; i++ { + tk.MustExec("insert into t values (?)", i) + } + tk.MustExec("analyze table t with 0 topn;") + tk.MustQuery("explain select * from t where a = 1").Check(testkit.Rows("IndexReader_6 512.00 root index:IndexRangeScan_5", + "└─IndexRangeScan_5 512.00 cop[tikv] table:t, index:a(a) range:[1,1], keep 
order:false")) + tk.MustQuery("explain select * from t where a = 999").Check(testkit.Rows("IndexReader_6 0.00 root index:IndexRangeScan_5", + "└─IndexRangeScan_5 0.00 cop[tikv] table:t, index:a(a) range:[999,999], keep order:false")) + + tk.MustExec("drop table t;") + tk.MustExec("create table t (a int, key(a));") + for i := 0; i < 2048; i++ { + tk.MustExec("insert into t values (0)") + } + for i := 1; i < 2049; i++ { + tk.MustExec("insert into t values (?)", i) + } + tk.MustExec("analyze table t with 0 topn;") + tk.MustQuery("explain select * from t where a = 1").Check(testkit.Rows("IndexReader_6 1.00 root index:IndexRangeScan_5", + "└─IndexRangeScan_5 1.00 cop[tikv] table:t, index:a(a) range:[1,1], keep order:false")) +} diff --git a/statistics/cmsketch.go b/statistics/cmsketch.go index dd893e27b9f75..fe4625dd47199 100644 --- a/statistics/cmsketch.go +++ b/statistics/cmsketch.go @@ -288,23 +288,32 @@ func (c *CMSketch) QueryBytes(d []byte) uint64 { func (c *CMSketch) queryHashValue(h1, h2 uint64) uint64 { vals := make([]uint32, c.depth) min := uint32(math.MaxUint32) + // When res is already 0 before noise is eliminated, the result must stay 0 and the default value must not be used. + // So non-zero counters are offset by temp, which keeps them distinguishable from a true 0 after noise is eliminated. 
+ temp := uint32(1) for i := range c.table { j := (h1 + h2*uint64(i)) % uint64(c.width) if min > c.table[i][j] { min = c.table[i][j] } noise := (c.count - uint64(c.table[i][j])) / (uint64(c.width) - 1) - if uint64(c.table[i][j]) < noise { + if uint64(c.table[i][j]) == 0 { vals[i] = 0 + } else if uint64(c.table[i][j]) < noise { + vals[i] = temp } else { - vals[i] = c.table[i][j] - uint32(noise) + vals[i] = c.table[i][j] - uint32(noise) + temp } } sort.Sort(sortutil.Uint32Slice(vals)) res := vals[(c.depth-1)/2] + (vals[c.depth/2]-vals[(c.depth-1)/2])/2 - if res > min { - res = min + if res > min+temp { + res = min + temp } + if res == 0 { + return uint64(0) + } + res = res - temp if c.considerDefVal(uint64(res)) { return c.defaultValue } @@ -538,3 +547,16 @@ func (c *CMSketch) AppendTopN(data []byte, count uint64) { func (c *CMSketch) GetWidthAndDepth() (int32, int32) { return c.width, c.depth } + +// CalcDefaultValForAnalyze calculates the default value for Analyze. +// The value is count / NDV in CMSketch. This means count and NDV do not include TopN. +func (c *CMSketch) CalcDefaultValForAnalyze(NDV uint64) { + // If NDV <= TopN, all values should be in TopN. + // So we set c.defaultValue to 0 and return immediately. + if NDV <= uint64(len(c.topN)) { + c.defaultValue = 0 + return + } + remainNDV := NDV - uint64(len(c.topN)) + c.defaultValue = c.count / mathutil.MaxUint64(1, remainNDV) +}