*: improve NULL count estimation for single column index (#9474) #9979

Merged
6 commits merged on Apr 2, 2019
4 changes: 2 additions & 2 deletions cmd/explaintest/r/index_join.result
@@ -6,15 +6,15 @@ insert into t2 values(1, 1);
analyze table t1, t2;
explain select /*+ TIDB_INLJ(t1, t2) */ * from t1 join t2 on t1.a=t2.a;
id count task operator info
IndexJoin_14 1.25 root inner join, inner:IndexLookUp_13, outer key:test.t2.a, inner key:test.t1.a
IndexJoin_14 5.00 root inner join, inner:IndexLookUp_13, outer key:test.t2.a, inner key:test.t1.a
├─IndexLookUp_13 5.00 root
│ ├─IndexScan_11 5.00 cop table:t1, index:a, range: decided by [test.t2.a], keep order:false
│ └─TableScan_12 5.00 cop table:t1, keep order:false
└─TableReader_16 1.00 root data:TableScan_15
└─TableScan_15 1.00 cop table:t2, range:[-inf,+inf], keep order:false
explain select * from t1 join t2 on t1.a=t2.a;
id count task operator info
IndexJoin_14 1.25 root inner join, inner:IndexLookUp_13, outer key:test.t2.a, inner key:test.t1.a
IndexJoin_14 5.00 root inner join, inner:IndexLookUp_13, outer key:test.t2.a, inner key:test.t1.a
├─IndexLookUp_13 5.00 root
│ ├─IndexScan_11 5.00 cop table:t1, index:a, range: decided by [test.t2.a], keep order:false
│ └─TableScan_12 5.00 cop table:t1, keep order:false
98 changes: 76 additions & 22 deletions executor/analyze.go
@@ -150,8 +150,9 @@ func analyzeIndexPushdown(idxExec *AnalyzeIndexExec) statistics.AnalyzeResult {
Cms: []*statistics.CMSketch{cms},
IsIndex: 1,
}
result.Count = hist.NullCount
if hist.Len() > 0 {
result.Count = hist.Buckets[hist.Len()-1].Count
result.Count += hist.Buckets[hist.Len()-1].Count
}
return result
}
@@ -165,12 +166,16 @@ type AnalyzeIndexExec struct {
priority int
analyzePB *tipb.AnalyzeReq
result distsql.SelectResult
countNullRes distsql.SelectResult
maxNumBuckets uint64
}

func (e *AnalyzeIndexExec) open() error {
// fetchAnalyzeResult builds and dispatches the `kv.Request` from given ranges, and stores the `SelectResult`
// in corresponding fields based on the input `isNullRange` argument, which indicates if the range is the
// special null range for single-column index to get the null count.
func (e *AnalyzeIndexExec) fetchAnalyzeResult(ranges []*ranger.Range, isNullRange bool) error {
var builder distsql.RequestBuilder
kvReq, err := builder.SetIndexRanges(e.ctx.GetSessionVars().StmtCtx, e.physicalTableID, e.idxInfo.ID, ranger.FullRange()).
kvReq, err := builder.SetIndexRanges(e.ctx.GetSessionVars().StmtCtx, e.physicalTableID, e.idxInfo.ID, ranges).
SetAnalyzeRequest(e.analyzePB).
SetKeepOrder(true).
SetConcurrency(e.concurrency).
@@ -179,29 +184,51 @@ func (e *AnalyzeIndexExec) open() error {
return errors.Trace(err)
}
ctx := context.TODO()
e.result, err = distsql.Analyze(ctx, e.ctx.GetClient(), kvReq, e.ctx.GetSessionVars().KVVars, e.ctx.GetSessionVars().InRestrictedSQL)
result, err := distsql.Analyze(ctx, e.ctx.GetClient(), kvReq, e.ctx.GetSessionVars().KVVars, e.ctx.GetSessionVars().InRestrictedSQL)
if err != nil {
return errors.Trace(err)
}
e.result.Fetch(ctx)
result.Fetch(ctx)
if isNullRange {
e.countNullRes = result
} else {
e.result = result
}
return nil
}

func (e *AnalyzeIndexExec) buildStats() (hist *statistics.Histogram, cms *statistics.CMSketch, err error) {
if err = e.open(); err != nil {
return nil, nil, errors.Trace(err)
func (e *AnalyzeIndexExec) open() error {
ranges := ranger.FullRange()
// For single-column index, we do not load null rows from TiKV, so the built histogram would not include
// null values, and its `NullCount` would be set by result of another distsql call to get null rows.
// For multi-column index, we cannot define null for the rows, so we still use full range, and the rows
// containing null fields would exist in built histograms. Note that, the `NullCount` of histograms for
// multi-column index is always 0 then.
if len(e.idxInfo.Columns) == 1 {
ranges = ranger.FullNotNullRange()
}
defer func() {
if err1 := e.result.Close(); err1 != nil {
hist = nil
cms = nil
err = errors.Trace(err1)
err := e.fetchAnalyzeResult(ranges, false)
if err != nil {
return err
}
if len(e.idxInfo.Columns) == 1 {
ranges = ranger.NullRange()
err = e.fetchAnalyzeResult(ranges, true)
if err != nil {
return err
}
}()
hist = &statistics.Histogram{}
cms = statistics.NewCMSketch(defaultCMSketchDepth, defaultCMSketchWidth)
}
return nil
}

func (e *AnalyzeIndexExec) buildStatsFromResult(result distsql.SelectResult, needCMS bool) (*statistics.Histogram, *statistics.CMSketch, error) {
hist := &statistics.Histogram{}
var cms *statistics.CMSketch
if needCMS {
cms = statistics.NewCMSketch(defaultCMSketchDepth, defaultCMSketchWidth)
}
for {
data, err := e.result.NextRaw(context.TODO())
data, err := result.NextRaw(context.TODO())
if err != nil {
return nil, nil, errors.Trace(err)
}
@@ -215,15 +242,42 @@ func (e *AnalyzeIndexExec) buildStats() (hist *statistics.Histogram, cms *statis
}
hist, err = statistics.MergeHistograms(e.ctx.GetSessionVars().StmtCtx, hist, statistics.HistogramFromProto(resp.Hist), int(e.maxNumBuckets))
if err != nil {
return nil, nil, errors.Trace(err)
return nil, nil, err
}
if resp.Cms != nil {
err := cms.MergeCMSketch(statistics.CMSketchFromProto(resp.Cms))
if err != nil {
return nil, nil, errors.Trace(err)
if needCMS {
if resp.Cms == nil {
log.Warnf("nil CMS in response, table is %s, index is %s", e.idxInfo.Table.O, e.idxInfo.Name.O)
} else {
err := cms.MergeCMSketch(statistics.CMSketchFromProto(resp.Cms))
if err != nil {
return nil, nil, errors.Trace(err)
}
}
}
}
return hist, cms, nil
}

func (e *AnalyzeIndexExec) buildStats() (hist *statistics.Histogram, cms *statistics.CMSketch, err error) {
if err = e.open(); err != nil {
return nil, nil, err
}
defer func() {
err = closeAll(e.result, e.countNullRes)
}()
hist, cms, err = e.buildStatsFromResult(e.result, true)
if err != nil {
return nil, nil, err
}
if e.countNullRes != nil {
nullHist, _, err := e.buildStatsFromResult(e.countNullRes, false)
if err != nil {
return nil, nil, err
}
if l := nullHist.Len(); l > 0 {
hist.NullCount = nullHist.Buckets[l-1].Count
}
}
hist.ID = e.idxInfo.ID
return hist, cms, nil
}
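To make the counting change in executor/analyze.go concrete, here is a minimal sketch. The types below are simplified stand-ins, not the real statistics package types: for a single-column index the histogram is now built only over non-NULL rows, and the analyzed row count is the cumulative count of the last bucket plus the NullCount fetched by the separate null-range request.

```go
package main

import "fmt"

// Simplified stand-ins for the statistics package types used above; the real
// Histogram and Bucket in TiDB carry many more fields. This is only a sketch
// of the counting rule, not the actual implementation.
type bucket struct {
	Count int64 // cumulative row count up to and including this bucket
}

type histogram struct {
	Buckets   []bucket
	NullCount int64 // rows whose indexed column is NULL (single-column index only)
}

// totalIndexRows mirrors the logic in analyzeIndexPushdown after this PR:
// start from NullCount and add the cumulative count of the last bucket,
// which now covers only the non-NULL rows of a single-column index.
func totalIndexRows(h *histogram) int64 {
	total := h.NullCount
	if len(h.Buckets) > 0 {
		total += h.Buckets[len(h.Buckets)-1].Count
	}
	return total
}

func main() {
	// Four non-NULL rows end up in the histogram, one NULL row is counted
	// by the separate null-range request.
	h := &histogram{Buckets: []bucket{{Count: 2}, {Count: 4}}, NullCount: 1}
	fmt.Println(totalIndexRows(h)) // 5
}
```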
4 changes: 2 additions & 2 deletions executor/analyze_test.go
@@ -55,7 +55,7 @@ PARTITION BY RANGE ( a ) (
for _, def := range pi.Definitions {
statsTbl := handle.GetPartitionStats(table.Meta(), def.ID)
c.Assert(statsTbl.Pseudo, IsFalse)
c.Assert(len(statsTbl.Columns), Equals, 2)
c.Assert(len(statsTbl.Columns), Equals, 3)
c.Assert(len(statsTbl.Indices), Equals, 1)
for _, col := range statsTbl.Columns {
c.Assert(col.Len(), Greater, 0)
@@ -81,7 +81,7 @@ PARTITION BY RANGE ( a ) (
statsTbl := handle.GetPartitionStats(table.Meta(), def.ID)
if i == 0 {
c.Assert(statsTbl.Pseudo, IsFalse)
c.Assert(len(statsTbl.Columns), Equals, 2)
c.Assert(len(statsTbl.Columns), Equals, 3)
c.Assert(len(statsTbl.Indices), Equals, 1)
} else {
c.Assert(statsTbl.Pseudo, IsTrue)
53 changes: 52 additions & 1 deletion executor/show_stats_test.go
@@ -109,5 +109,56 @@ func (s *testSuite) TestShowStatsHasNullValue(c *C) {
tk.MustExec("create table t (a int, index idx(a))")
tk.MustExec("insert into t values(NULL)")
tk.MustExec("analyze table t")
tk.MustQuery("show stats_buckets").Check(testkit.Rows("test t idx 1 0 1 1 NULL NULL"))
// Null values are excluded from histogram for single-column index.
tk.MustQuery("show stats_buckets").Check(testkit.Rows())
tk.MustExec("insert into t values(1)")
tk.MustExec("analyze table t")
tk.MustQuery("show stats_buckets").Sort().Check(testkit.Rows(
"test t a 0 0 1 1 1 1",
"test t idx 1 0 1 1 1 1",
))
tk.MustExec("drop table t")
tk.MustExec("create table t (a int, b int, index idx(a, b))")
tk.MustExec("insert into t values(NULL, NULL)")
tk.MustExec("analyze table t")
tk.MustQuery("show stats_buckets").Check(testkit.Rows("test t idx 1 0 1 1 (NULL, NULL) (NULL, NULL)"))

tk.MustExec("drop table t")
tk.MustExec("create table t(a int, b int, c int, index idx_b(b), index idx_c_a(c, a))")
tk.MustExec("insert into t values(1,null,1),(2,null,2),(3,3,3),(4,null,4),(null,null,null)")
res := tk.MustQuery("show stats_histograms where table_name = 't'")
c.Assert(len(res.Rows()), Equals, 0)
tk.MustExec("analyze table t index idx_b")
res = tk.MustQuery("show stats_histograms where table_name = 't' and column_name = 'idx_b'")
c.Assert(len(res.Rows()), Equals, 1)
c.Assert(res.Rows()[0][6], Equals, "4")
res = tk.MustQuery("show stats_histograms where table_name = 't' and column_name = 'b'")
c.Assert(len(res.Rows()), Equals, 0)
tk.MustExec("analyze table t index idx_c_a")
res = tk.MustQuery("show stats_histograms where table_name = 't' and column_name = 'idx_c_a'")
c.Assert(len(res.Rows()), Equals, 1)
c.Assert(res.Rows()[0][6], Equals, "0")
res = tk.MustQuery("show stats_histograms where table_name = 't' and column_name = 'c'")
c.Assert(len(res.Rows()), Equals, 0)
res = tk.MustQuery("show stats_histograms where table_name = 't' and column_name = 'a'")
c.Assert(len(res.Rows()), Equals, 0)
tk.MustExec("truncate table t")
tk.MustExec("insert into t values(1,null,1),(2,null,2),(3,3,3),(4,null,4),(null,null,null)")
res = tk.MustQuery("show stats_histograms where table_name = 't'")
c.Assert(len(res.Rows()), Equals, 0)
tk.MustExec("analyze table t index")
res = tk.MustQuery("show stats_histograms where table_name = 't'").Sort()
c.Assert(len(res.Rows()), Equals, 2)
c.Assert(res.Rows()[0][6], Equals, "4")
c.Assert(res.Rows()[1][6], Equals, "0")
tk.MustExec("truncate table t")
tk.MustExec("insert into t values(1,null,1),(2,null,2),(3,3,3),(4,null,4),(null,null,null)")
tk.MustExec("analyze table t")
res = tk.MustQuery("show stats_histograms where table_name = 't'").Sort()
c.Assert(len(res.Rows()), Equals, 5)
c.Assert(res.Rows()[0][6], Equals, "1")
c.Assert(res.Rows()[1][6], Equals, "4")
c.Assert(res.Rows()[2][6], Equals, "1")
c.Assert(res.Rows()[3][6], Equals, "4")
c.Assert(res.Rows()[4][6], Equals, "0")
}
14 changes: 7 additions & 7 deletions planner/core/cbo_test.go
@@ -278,7 +278,7 @@ func (s *testAnalyzeSuite) TestIndexRead(c *C) {
},
{
sql: "select count(*) from t where e > 1 group by b",
best: "IndexLookUp(Index(t.b)[[NULL,+inf]], Table(t)->Sel([gt(test.t.e, 1)]))->StreamAgg",
best: "TableReader(Table(t)->Sel([gt(test.t.e, 1)])->HashAgg)->HashAgg",
Author's note for reviewers: before this PR, column b was not analyzed because an index exists on b, so its estimated NDV was about 8000. After this PR, the estimated NDV of b is 100, which is consistent with the data, and the plan changes because of this NDV change.

},
{
sql: "select count(e) from t where t.b <= 20",
@@ -453,7 +453,7 @@ func (s *testAnalyzeSuite) TestAnalyze(c *C) {
}{
{
sql: "analyze table t3",
best: "Analyze{Index(a),Table(b)}",
best: "Analyze{Index(a),Table(a, b)}",
},
// Test analyze full table.
{
@@ -676,11 +676,11 @@ func (s *testAnalyzeSuite) TestCorrelatedEstimation(c *C) {
" ├─TableReader_12 10.00 root data:TableScan_11",
" │ └─TableScan_11 10.00 cop table:t, range:[-inf,+inf], keep order:false",
" └─MaxOneRow_13 1.00 root ",
" └─Projection_14 0.80 root concat(cast(t1.a), \",\", cast(t1.b))",
" └─IndexLookUp_21 0.80 root ",
" ├─IndexScan_18 1.25 cop table:t1, index:c, range: decided by [eq(t1.c, test.t.c)], keep order:false",
" └─Selection_20 0.80 cop eq(t1.a, test.t.a)",
" └─TableScan_19 1.25 cop table:t, keep order:false",
" └─Projection_14 0.10 root concat(cast(t1.a), \",\", cast(t1.b))",
" └─IndexLookUp_21 0.10 root ",
" ├─IndexScan_18 1.00 cop table:t1, index:c, range: decided by [eq(t1.c, test.t.c)], keep order:false",
" └─Selection_20 0.10 cop eq(t1.a, test.t.a)",
" └─TableScan_19 1.00 cop table:t, keep order:false",
))
}

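As a rough illustration of the author's note above about why the NDV correction flips the TestIndexRead plan, the sketch below shows one ingredient of the choice: the estimated number of GROUP BY groups is bounded by the column's NDV. The input row count here is a hypothetical number for illustration only; TiDB's real cost model is more involved.

```go
package main

import "fmt"

// estimatedGroups is a rough illustration of one ingredient of the plan choice:
// the optimizer's estimate of the number of GROUP BY groups is bounded by the
// column's NDV and by the number of input rows. This is not TiDB's actual cost
// model; it only shows why the NDV correction in this PR can flip the plan.
func estimatedGroups(inputRows, ndv float64) float64 {
	if ndv < inputRows {
		return ndv
	}
	return inputRows
}

func main() {
	const inputRows = 10000.0 // hypothetical analyzed row count, for illustration only
	for _, ndv := range []float64{8000, 100} {
		fmt.Printf("NDV=%.0f -> estimated groups=%.0f\n", ndv, estimatedGroups(inputRows, ndv))
	}
	// With NDV around 8000 nearly every row is its own group, so an index scan
	// feeding a stream aggregate looked cheap; with NDV=100 a hash aggregate over
	// a table scan pushed to the coprocessor wins, matching the new expected plan.
}
```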
29 changes: 5 additions & 24 deletions planner/core/planbuilder.go
@@ -643,35 +643,16 @@ func (b *planBuilder) buildCheckIndexSchema(tn *ast.TableName, indexName string)
// getColsInfo returns the info of index columns, normal columns and primary key.
func getColsInfo(tn *ast.TableName) (indicesInfo []*model.IndexInfo, colsInfo []*model.ColumnInfo, pkCol *model.ColumnInfo) {
tbl := tn.TableInfo
// idxNames contains all the normal columns that can be analyzed more effectively, because those columns occur as index
// columns or primary key columns with integer type.
var idxNames []string
if tbl.PKIsHandle {
for _, col := range tbl.Columns {
if mysql.HasPriKeyFlag(col.Flag) {
idxNames = append(idxNames, col.Name.L)
pkCol = col
}
for _, col := range tbl.Columns {
if tbl.PKIsHandle && mysql.HasPriKeyFlag(col.Flag) {
pkCol = col
} else {
colsInfo = append(colsInfo, col)
}
}
for _, idx := range tn.TableInfo.Indices {
if idx.State == model.StatePublic {
indicesInfo = append(indicesInfo, idx)
if len(idx.Columns) == 1 {
idxNames = append(idxNames, idx.Columns[0].Name.L)
}
}
}
for _, col := range tbl.Columns {
isIndexCol := false
for _, idx := range idxNames {
if idx == col.Name.L {
isIndexCol = true
break
}
}
if !isIndexCol {
colsInfo = append(colsInfo, col)
}
}
return
12 changes: 11 additions & 1 deletion statistics/feedback.go
@@ -1073,6 +1073,7 @@ func dumpFeedbackForIndex(h *Handle, q *QueryFeedback, t *Table) error {
}

func (q *QueryFeedback) dumpRangeFeedback(h *Handle, ran *ranger.Range, rangeCount float64) error {
lowIsNull := ran.LowVal[0].IsNull()
if q.tp == indexType {
sc := &stmtctx.StatementContext{TimeZone: time.UTC}
lower, err := codec.EncodeKey(sc, nil, ran.LowVal[0])
@@ -1099,8 +1100,17 @@
ranges := q.hist.SplitRange([]*ranger.Range{ran})
counts := make([]float64, 0, len(ranges))
sum := 0.0
for _, r := range ranges {
for i, r := range ranges {
// Though after `SplitRange`, we may have ranges like `[l, r]`, we still use
// `betweenRowCount` to compute the estimation since the ranges of feedback are all in `[l, r)`
// form, that is to say, we ignore the exclusiveness of ranges from `SplitRange` and just use
// its result of boundary values.
count := q.hist.betweenRowCount(r.LowVal[0], r.HighVal[0])
// We have to include `NullCount` of histogram for [l, r) cases where l is null because `betweenRowCount`
// does not include null values of lower bound.
if i == 0 && lowIsNull {
count += float64(q.hist.NullCount)
}
sum += count
counts = append(counts, count)
}
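A minimal sketch of the feedback adjustment in dumpRangeFeedback above, using hypothetical simplified types rather than the real statistics.Histogram and ranger.Range: each sub-range produced by SplitRange is estimated with betweenRowCount, and the histogram's NullCount is folded into the first sub-range only when the original queried range began at NULL.

```go
package main

import "fmt"

// Hypothetical, simplified stand-ins for the histogram and range types used in
// statistics/feedback.go; only what is needed to illustrate the NULL handling.
type hist struct {
	nullCount float64
	// betweenRowCount estimates the rows in [lo, hi); NULLs are never included
	// because the single-column index histogram excludes them after this PR.
	betweenRowCount func(lo, hi int64) float64
}

// splitCounts mirrors the loop in dumpRangeFeedback: each sub-range gets a
// betweenRowCount estimate, and when the original range started at NULL, the
// histogram's NullCount is added to the first sub-range's count.
func splitCounts(h hist, ranges [][2]int64, lowIsNull bool) (counts []float64, sum float64) {
	for i, r := range ranges {
		count := h.betweenRowCount(r[0], r[1])
		if i == 0 && lowIsNull {
			count += h.nullCount
		}
		sum += count
		counts = append(counts, count)
	}
	return counts, sum
}

func main() {
	h := hist{
		nullCount:       3,
		betweenRowCount: func(lo, hi int64) float64 { return float64(hi-lo) * 0.5 }, // toy estimate
	}
	counts, sum := splitCounts(h, [][2]int64{{0, 4}, {4, 10}}, true)
	fmt.Println(counts, sum) // [5 3] 8
}
```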