Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

statistics: add default value of CMSketch for Analyze (#19455) #19927

Merged
merged 5 commits into from
Sep 21, 2020
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions executor/analyze.go
Original file line number Diff line number Diff line change
Expand Up @@ -335,6 +335,9 @@ func (e *AnalyzeIndexExec) buildStatsFromResult(result distsql.SelectResult, nee
}
}
err := hist.ExtractTopN(cms, len(e.idxInfo.Columns), uint32(e.opts[ast.AnalyzeOptNumTopN]))
if needCMS && cms != nil {
cms.CalcDefaultValForAnalyze(uint64(hist.NDV))
}
return hist, cms, err
}

Expand Down Expand Up @@ -525,6 +528,7 @@ func (e *AnalyzeColumnsExec) buildStats(ranges []*ranger.Range) (hists []*statis
return nil, nil, err
}
hists = append(hists, hg)
collectors[i].CMSketch.CalcDefaultValForAnalyze(uint64(hg.NDV))
cms = append(cms, collectors[i].CMSketch)
}
return hists, cms, nil
Expand Down Expand Up @@ -1236,6 +1240,7 @@ func analyzeIndexIncremental(idxExec *analyzeIndexIncrementalExec) analyzeResult
if err != nil {
return analyzeResult{Err: err, job: idxExec.job}
}
cms.CalcDefaultValForAnalyze(uint64(hist.NDV))
}
result := analyzeResult{
PhysicalTableID: idxExec.physicalTableID,
Expand Down
79 changes: 79 additions & 0 deletions executor/analyze_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -592,3 +592,82 @@ func (s *testSuite1) TestHashInTopN(c *C) {
}
}
}

// TestNormalAnalyzeOnCommonHandle creates three tables with the
// EnableClusteredIndex session variable switched on, analyzes them, and checks
// the sorted output of `show stats_buckets` for every column and index.
func (s *testSuite1) TestNormalAnalyzeOnCommonHandle(c *C) {
	tk := testkit.NewTestKit(c, s.store)
	tk.MustExec("use test")
	tk.MustExec("drop table if exists t1, t2, t3, t4")
	tk.Se.GetSessionVars().EnableClusteredIndex = true

	// Schema and data for the three tables, executed in order: an int primary
	// key, a varchar primary key, and a composite primary key with an extra
	// secondary index.
	setup := []string{
		"CREATE TABLE t1 (a int primary key, b int)",
		"insert into t1 values(1,1), (2,2), (3,3)",
		"CREATE TABLE t2 (a varchar(255) primary key, b int)",
		"insert into t2 values(\"111\",1), (\"222\",2), (\"333\",3)",
		"CREATE TABLE t3 (a int, b int, c int, primary key (a, b), key(c))",
		"insert into t3 values(1,1,1), (2,2,2), (3,3,3)",
	}
	for _, stmt := range setup {
		tk.MustExec(stmt)
	}

	tk.MustExec("analyze table t1, t2, t3")

	// The rows are sorted before comparison, so the expected rows below are
	// listed in sorted order as well.
	buckets := tk.MustQuery(`show stats_buckets where table_name in ("t1", "t2", "t3")`).Sort()
	buckets.Check(testkit.Rows(
		"test t1 a 0 0 1 1 1 1",
		"test t1 a 0 1 2 1 2 2",
		"test t1 a 0 2 3 1 3 3",
		"test t1 b 0 0 1 1 1 1",
		"test t1 b 0 1 2 1 2 2",
		"test t1 b 0 2 3 1 3 3",
		"test t2 PRIMARY 1 0 1 1 111 111",
		"test t2 PRIMARY 1 1 2 1 222 222",
		"test t2 PRIMARY 1 2 3 1 333 333",
		"test t2 a 0 0 1 1 111 111",
		"test t2 a 0 1 2 1 222 222",
		"test t2 a 0 2 3 1 333 333",
		"test t2 b 0 0 1 1 1 1",
		"test t2 b 0 1 2 1 2 2",
		"test t2 b 0 2 3 1 3 3",
		"test t3 PRIMARY 1 0 1 1 (1, 1) (1, 1)",
		"test t3 PRIMARY 1 1 2 1 (2, 2) (2, 2)",
		"test t3 PRIMARY 1 2 3 1 (3, 3) (3, 3)",
		"test t3 a 0 0 1 1 1 1",
		"test t3 a 0 1 2 1 2 2",
		"test t3 a 0 2 3 1 3 3",
		"test t3 b 0 0 1 1 1 1",
		"test t3 b 0 1 2 1 2 2",
		"test t3 b 0 2 3 1 3 3",
		"test t3 c 0 0 1 1 1 1",
		"test t3 c 0 1 2 1 2 2",
		"test t3 c 0 2 3 1 3 3",
		"test t3 c 1 0 1 1 1 1",
		"test t3 c 1 1 2 1 2 2",
		"test t3 c 1 2 3 1 3 3"))
}
qw4990 marked this conversation as resolved.
Show resolved Hide resolved

// TestDefaultValForAnalyze checks the row estimates printed by EXPLAIN for
// equality lookups on an indexed column after `analyze table t with 0 topn`.
// It runs against two data sets: 2048 rows of 0 plus the values 1..3, and
// 2048 rows of 0 plus the values 1..2048.
func (s *testSuite1) TestDefaultValForAnalyze(c *C) {
	tk := testkit.NewTestKit(c, s.store)
	for _, stmt := range []string{
		"drop database if exists test_default_val_for_analyze;",
		"create database test_default_val_for_analyze;",
		"use test_default_val_for_analyze",
	} {
		tk.MustExec(stmt)
	}

	// loadAndAnalyze creates table t, inserts 2048 rows holding 0 followed by
	// one row for each value in [1, last], and then analyzes the table.
	loadAndAnalyze := func(last int) {
		tk.MustExec("create table t (a int, key(a));")
		for n := 0; n < 2048; n++ {
			tk.MustExec("insert into t values (0)")
		}
		for v := 1; v <= last; v++ {
			tk.MustExec("insert into t values (?)", v)
		}
		tk.MustExec("analyze table t with 0 topn;")
	}

	// checkPlan runs the query and compares its output with the two expected
	// plan rows.
	checkPlan := func(query, reader, scan string) {
		tk.MustQuery(query).Check(testkit.Rows(reader, scan))
	}

	loadAndAnalyze(3)
	checkPlan("explain select * from t where a = 1",
		"IndexReader_6 512.00 root index:IndexRangeScan_5",
		"└─IndexRangeScan_5 512.00 cop[tikv] table:t, index:a(a) range:[1,1], keep order:false")
	checkPlan("explain select * from t where a = 999",
		"IndexReader_6 0.00 root index:IndexRangeScan_5",
		"└─IndexRangeScan_5 0.00 cop[tikv] table:t, index:a(a) range:[999,999], keep order:false")

	tk.MustExec("drop table t;")
	loadAndAnalyze(2048)
	checkPlan("explain select * from t where a = 1",
		"IndexReader_6 1.00 root index:IndexRangeScan_5",
		"└─IndexRangeScan_5 1.00 cop[tikv] table:t, index:a(a) range:[1,1], keep order:false")
}
30 changes: 26 additions & 4 deletions statistics/cmsketch.go
Original file line number Diff line number Diff line change
Expand Up @@ -288,23 +288,32 @@ func (c *CMSketch) QueryBytes(d []byte) uint64 {
func (c *CMSketch) queryHashValue(h1, h2 uint64) uint64 {
vals := make([]uint32, c.depth)
min := uint32(math.MaxUint32)
// If res is already 0 before the noise is eliminated, the default value must not be used.
// Every non-zero counter is therefore offset by temp, so a true 0 can be told apart from a value that only reached 0 after the noise was subtracted.
temp := uint32(1)
for i := range c.table {
j := (h1 + h2*uint64(i)) % uint64(c.width)
if min > c.table[i][j] {
min = c.table[i][j]
}
noise := (c.count - uint64(c.table[i][j])) / (uint64(c.width) - 1)
if uint64(c.table[i][j]) < noise {
if uint64(c.table[i][j]) == 0 {
vals[i] = 0
} else if uint64(c.table[i][j]) < noise {
vals[i] = temp
} else {
vals[i] = c.table[i][j] - uint32(noise)
vals[i] = c.table[i][j] - uint32(noise) + temp
}
}
sort.Sort(sortutil.Uint32Slice(vals))
res := vals[(c.depth-1)/2] + (vals[c.depth/2]-vals[(c.depth-1)/2])/2
if res > min {
res = min
if res > min+temp {
res = min + temp
}
if res == 0 {
return uint64(0)
}
res = res - temp
if c.considerDefVal(uint64(res)) {
return c.defaultValue
}
Expand Down Expand Up @@ -538,3 +547,16 @@ func (c *CMSketch) AppendTopN(data []byte, count uint64) {
// GetWidthAndDepth returns the sketch's width and depth, in that order.
func (c *CMSketch) GetWidthAndDepth() (int32, int32) {
	width, depth := c.width, c.depth
	return width, depth
}

// CalcDefaultValForAnalyze calculates the default value used for Analyze and
// stores it in c.defaultValue.
// The value is c.count divided by the number of distinct values that are not
// held in c.topN, i.e. NDV minus len(c.topN).
// When NDV does not exceed len(c.topN), every value should already be in TopN,
// so the default value is set to 0.
func (c *CMSketch) CalcDefaultValForAnalyze(NDV uint64) {
	topNCount := uint64(len(c.topN))
	if NDV > topNCount {
		// NDV-topNCount cannot underflow here because NDV > topNCount.
		c.defaultValue = c.count / mathutil.MaxUint64(1, NDV-topNCount)
		return
	}
	c.defaultValue = 0
}