Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

plan, stats: fix inconsistent row count estimation #7233

Merged
merged 5 commits into from
Aug 6, 2018
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 11 additions & 10 deletions cmd/explaintest/r/explain_easy.result
Original file line number Diff line number Diff line change
Expand Up @@ -112,13 +112,14 @@ MemTableScan_4 10000.00 root
explain select c2 = (select c2 from t2 where t1.c1 = t2.c1 order by c1 limit 1) from t1;
id count task operator info
Projection_12 10000.00 root eq(test.t1.c2, test.t2.c2)
└─Apply_14 10000.00 root left outer join, inner:Limit_22
└─Apply_14 10000.00 root left outer join, inner:Limit_21
├─TableReader_16 10000.00 root data:TableScan_15
│ └─TableScan_15 10000.00 cop table:t1, range:[-inf,+inf], keep order:false, stats:pseudo
└─Limit_22 1.00 root offset:0, count:1
└─IndexLookUp_47 0.00 root
├─IndexScan_45 0.00 cop table:t2, index:c1, range: decided by [eq(test.t1.c1, test.t2.c1)], keep order:true, stats:pseudo
└─TableScan_46 0.00 cop table:t2, keep order:false, stats:pseudo
└─Limit_21 1.00 root offset:0, count:1
└─IndexLookUp_43 1.00 root
├─Limit_42 1.00 cop offset:0, count:1
│ └─IndexScan_40 1.25 cop table:t2, index:c1, range: decided by [eq(test.t1.c1, test.t2.c1)], keep order:true, stats:pseudo
└─TableScan_41 1.00 cop table:t2, keep order:false
explain select * from t1 order by c1 desc limit 1;
id count task operator info
Limit_10 1.00 root offset:0, count:1
Expand Down Expand Up @@ -286,8 +287,8 @@ Projection_11 10000.00 root 9_aux_0
│ └─TableScan_14 10000.00 cop table:t, range:[-inf,+inf], keep order:false, stats:pseudo
└─StreamAgg_20 1.00 root funcs:count(1)
└─IndexJoin_32 10000.00 root inner join, inner:TableReader_31, outer key:s.a, inner key:t1.a
├─IndexReader_36 10.00 root index:IndexScan_35
│ └─IndexScan_35 10.00 cop table:s, index:b, range: decided by [eq(s.b, test.t.a)], keep order:false, stats:pseudo
├─IndexReader_36 10000.00 root index:IndexScan_35
│ └─IndexScan_35 10000.00 cop table:s, index:b, range: decided by [eq(s.b, test.t.a)], keep order:false, stats:pseudo
└─TableReader_31 10.00 root data:TableScan_30
└─TableScan_30 10.00 cop table:t1, range: decided by [s.a], keep order:false, stats:pseudo
explain select t.c in (select count(*) from t s use index(idx), t t1 where s.b = t.a and s.c = t1.a) from t;
Expand All @@ -298,9 +299,9 @@ Projection_11 10000.00 root 9_aux_0
│ └─TableScan_14 10000.00 cop table:t, range:[-inf,+inf], keep order:false, stats:pseudo
└─StreamAgg_20 1.00 root funcs:count(1)
└─IndexJoin_33 10000.00 root inner join, inner:TableReader_32, outer key:s.c, inner key:t1.a
├─IndexLookUp_38 10.00 root
│ ├─IndexScan_36 10.00 cop table:s, index:b, range: decided by [eq(s.b, test.t.a)], keep order:false, stats:pseudo
│ └─TableScan_37 10.00 cop table:t, keep order:false, stats:pseudo
├─IndexLookUp_38 10000.00 root
│ ├─IndexScan_36 10000.00 cop table:s, index:b, range: decided by [eq(s.b, test.t.a)], keep order:false, stats:pseudo
│ └─TableScan_37 10000.00 cop table:t, keep order:false, stats:pseudo
└─TableReader_32 10.00 root data:TableScan_31
└─TableScan_31 10.00 cop table:t1, range: decided by [s.c], keep order:false, stats:pseudo
drop table if exists t;
Expand Down
11 changes: 6 additions & 5 deletions cmd/explaintest/r/explain_easy_stats.result
Original file line number Diff line number Diff line change
Expand Up @@ -101,13 +101,14 @@ MemTableScan_4 10000.00 root
explain select c2 = (select c2 from t2 where t1.c1 = t2.c1 order by c1 limit 1) from t1;
id count task operator info
Projection_12 1999.00 root eq(test.t1.c2, test.t2.c2)
└─Apply_14 1999.00 root left outer join, inner:Limit_22
└─Apply_14 1999.00 root left outer join, inner:Limit_21
├─TableReader_16 1999.00 root data:TableScan_15
│ └─TableScan_15 1999.00 cop table:t1, range:[-inf,+inf], keep order:false
└─Limit_22 1.00 root offset:0, count:1
└─IndexLookUp_47 0.00 root
├─IndexScan_45 0.00 cop table:t2, index:c1, range: decided by [eq(test.t1.c1, test.t2.c1)], keep order:true
└─TableScan_46 0.00 cop table:t2, keep order:false
└─Limit_21 1.00 root offset:0, count:1
└─IndexLookUp_43 1.00 root
├─Limit_42 1.00 cop offset:0, count:1
│ └─IndexScan_40 1.25 cop table:t2, index:c1, range: decided by [eq(test.t1.c1, test.t2.c1)], keep order:true
└─TableScan_41 1.00 cop table:t2, keep order:false
explain select * from t1 order by c1 desc limit 1;
id count task operator info
Limit_10 1.00 root offset:0, count:1
Expand Down
10 changes: 6 additions & 4 deletions executor/analyze.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,11 +41,13 @@ type AnalyzeExec struct {
tasks []*analyzeTask
}

// MaxBucketSize is the maximum number of bucket that a histogram could contain.
var MaxBucketSize = int64(256)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is not a good design. Do not export a variable.


const (
maxSampleSize = 10000
maxRegionSampleSize = 1000
maxSketchSize = 10000
maxBucketSize = 256
defaultCMSketchDepth = 5
defaultCMSketchWidth = 2048
)
Expand Down Expand Up @@ -193,7 +195,7 @@ func (e *AnalyzeIndexExec) buildStats() (hist *statistics.Histogram, cms *statis
if err != nil {
return nil, nil, errors.Trace(err)
}
hist, err = statistics.MergeHistograms(e.ctx.GetSessionVars().StmtCtx, hist, statistics.HistogramFromProto(resp.Hist), maxBucketSize)
hist, err = statistics.MergeHistograms(e.ctx.GetSessionVars().StmtCtx, hist, statistics.HistogramFromProto(resp.Hist), int(MaxBucketSize))
if err != nil {
return nil, nil, errors.Trace(err)
}
Expand Down Expand Up @@ -322,7 +324,7 @@ func (e *AnalyzeColumnsExec) buildStats() (hists []*statistics.Histogram, cms []
}
sc := e.ctx.GetSessionVars().StmtCtx
if e.pkInfo != nil {
pkHist, err = statistics.MergeHistograms(sc, pkHist, statistics.HistogramFromProto(resp.PkHist), maxBucketSize)
pkHist, err = statistics.MergeHistograms(sc, pkHist, statistics.HistogramFromProto(resp.PkHist), int(MaxBucketSize))
if err != nil {
return nil, nil, errors.Trace(err)
}
Expand All @@ -348,7 +350,7 @@ func (e *AnalyzeColumnsExec) buildStats() (hists []*statistics.Histogram, cms []
return nil, nil, errors.Trace(err)
}
}
hg, err := statistics.BuildColumn(e.ctx, maxBucketSize, col.ID, collectors[i], &col.FieldType)
hg, err := statistics.BuildColumn(e.ctx, MaxBucketSize, col.ID, collectors[i], &col.FieldType)
if err != nil {
return nil, nil, errors.Trace(err)
}
Expand Down
4 changes: 2 additions & 2 deletions executor/builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -1280,7 +1280,7 @@ func (b *executorBuilder) buildAnalyzeIndexPushdown(task plan.AnalyzeIndexTask)
},
}
e.analyzePB.IdxReq = &tipb.AnalyzeIndexReq{
BucketSize: maxBucketSize,
BucketSize: MaxBucketSize,
NumColumns: int32(len(task.IndexInfo.Columns)),
}
depth := int32(defaultCMSketchDepth)
Expand Down Expand Up @@ -1316,7 +1316,7 @@ func (b *executorBuilder) buildAnalyzeColumnsPushdown(task plan.AnalyzeColumnsTa
depth := int32(defaultCMSketchDepth)
width := int32(defaultCMSketchWidth)
e.analyzePB.ColReq = &tipb.AnalyzeColumnsReq{
BucketSize: maxBucketSize,
BucketSize: MaxBucketSize,
SampleSize: maxRegionSampleSize,
SketchSize: maxSketchSize,
ColumnsInfo: model.ColumnsToProto(cols, task.TableInfo.PKIsHandle),
Expand Down
35 changes: 35 additions & 0 deletions plan/cbo_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
. "github.com/pingcap/check"
"github.com/pingcap/tidb/config"
"github.com/pingcap/tidb/domain"
"github.com/pingcap/tidb/executor"
"github.com/pingcap/tidb/kv"
"github.com/pingcap/tidb/plan"
"github.com/pingcap/tidb/session"
Expand Down Expand Up @@ -620,6 +621,40 @@ func (s *testAnalyzeSuite) TestCorrelatedEstimation(c *C) {
))
}

func (s *testAnalyzeSuite) TestInconsistentEstimation(c *C) {
defer testleak.AfterTest(c)()
store, dom, err := newStoreWithBootstrap()
c.Assert(err, IsNil)
tk := testkit.NewTestKit(c, store)
defer func() {
dom.Close()
store.Close()
}()
tk.MustExec("use test")
tk.MustExec("create table t(a int, b int, c int, index ab(a,b), index ac(a,c))")
tk.MustExec("insert into t values (1,1,1), (1000,1000,1000)")
for i := 0; i < 10; i++ {
tk.MustExec("insert into t values (5,5,5), (10,10,10)")
}
origin := executor.MaxBucketSize
defer func() { executor.MaxBucketSize = origin }()
executor.MaxBucketSize = 2
tk.MustExec("analyze table t")
// Force using the histogram to estimate.
tk.MustExec("update mysql.stats_histograms set stats_ver = 0")
dom.StatsHandle().Clear()
dom.StatsHandle().Update(dom.InfoSchema())
// Using the histogram (a, b) to estimate `a = 5` will get 1.22, while using the CM Sketch to estimate
// the `a = 5 and c = 5` will get 10, it is not consistent.
tk.MustQuery("explain select * from t use index(ab) where a = 5 and c = 5").
Check(testkit.Rows(
"IndexLookUp_8 10.00 root ",
"├─IndexScan_5 12.50 cop table:t, index:a, b, range:[5,5], keep order:false",
"└─Selection_7 10.00 cop eq(test.t.c, 5)",
" └─TableScan_6 12.50 cop table:t, keep order:false",
))
}

func newStoreWithBootstrap() (kv.Storage, *domain.Domain, error) {
store, err := mockstore.NewMockTikvStore()
if err != nil {
Expand Down
5 changes: 5 additions & 0 deletions plan/logical_plans.go
Original file line number Diff line number Diff line change
Expand Up @@ -443,6 +443,11 @@ func (ds *DataSource) deriveIndexPathStats(path *accessPath) (bool, error) {
path.countAfterAccess = ds.statisticTable.PseudoAvgCountPerValue()
}
}
// If the `countAfterAccess` is less than `stats.count`, there must be some inconsistent stats info.
// We prefer the `stats.count` because it could use more stats info to calculate the selectivity.
if path.countAfterAccess < ds.stats.count {
path.countAfterAccess = math.Min(ds.stats.count/selectionFactor, float64(ds.statisticTable.Count))
}
if path.indexFilters != nil {
selectivity, err := ds.statisticTable.Selectivity(ds.ctx, path.indexFilters)
if err != nil {
Expand Down
16 changes: 11 additions & 5 deletions statistics/selectivity.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ type exprSet struct {
mask int64
// ranges contains all the ranges we got.
ranges []*ranger.Range
// numCols is the number of columns contained in the index or column(which is always 1).
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If it is always 1, why don't use a const?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is always 1 for the column, while it could also greater than 1 for the index.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

gotcha.

numCols int
}

// The type of the exprSet.
Expand Down Expand Up @@ -177,7 +179,7 @@ func (coll *HistColl) Selectivity(ctx sessionctx.Context, exprs []expression.Exp
if err != nil {
return 0, errors.Trace(err)
}
sets = append(sets, &exprSet{tp: colType, ID: col.ID, mask: maskCovered, ranges: ranges})
sets = append(sets, &exprSet{tp: colType, ID: col.ID, mask: maskCovered, ranges: ranges, numCols: 1})
if mysql.HasPriKeyFlag(colInfo.Info.Flag) {
sets[len(sets)-1].tp = pkType
}
Expand All @@ -190,7 +192,7 @@ func (coll *HistColl) Selectivity(ctx sessionctx.Context, exprs []expression.Exp
if err != nil {
return 0, errors.Trace(err)
}
sets = append(sets, &exprSet{tp: indexType, ID: idxInfo.ID, mask: maskCovered, ranges: ranges})
sets = append(sets, &exprSet{tp: indexType, ID: idxInfo.ID, mask: maskCovered, ranges: ranges, numCols: len(idxInfo.Info.Columns)})
}
}
sets = getUsableSetsByGreedy(sets)
Expand Down Expand Up @@ -254,16 +256,20 @@ func getUsableSetsByGreedy(sets []*exprSet) (newBlocks []*exprSet) {
mask := int64(math.MaxInt64)
for {
// Choose the index that covers most.
bestID, bestCount, bestTp := -1, 0, colType
bestID, bestCount, bestTp, bestNumCols := -1, 0, colType, 0
for i, set := range sets {
set.mask &= mask
bits := popCount(set.mask)
// This set cannot cover any thing, just skip it.
if bits == 0 {
continue
}
if (bestTp == colType && set.tp < colType) || bestCount < bits {
bestID, bestCount, bestTp = i, bits, set.tp
// We greedy select the stats info based on:
// (1): The stats type, always prefer the primary key or index.
// (2): The number of expression that it covers, the more the better.
// (3): The number of columns that it contains, the less the better.
if (bestTp == colType && set.tp < colType) || bestCount < bits || (bestCount == bits && bestNumCols > set.numCols) {
Copy link
Contributor Author

@alivxxx alivxxx Aug 1, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is required by this PR, because after the change in logical_plans.go, it would cause failure in TestIndexRead, it would sometimes choose index (b,c).

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think set.tp != colType is better, because the type is not a scalar.

bestID, bestCount, bestTp, bestNumCols = i, bits, set.tp, set.numCols
}
}
if bestCount == 0 {
Expand Down