stats: fix row count estimation for null #6203

Merged 5 commits on Apr 4, 2018
4 changes: 1 addition & 3 deletions executor/analyze.go
@@ -27,7 +27,6 @@ import (
"github.com/pingcap/tidb/sessionctx/variable"
"github.com/pingcap/tidb/statistics"
"github.com/pingcap/tidb/tablecodec"
"github.com/pingcap/tidb/types"
"github.com/pingcap/tidb/util/chunk"
"github.com/pingcap/tidb/util/ranger"
tipb "github.com/pingcap/tipb/go-tipb"
@@ -176,9 +175,8 @@ type AnalyzeIndexExec struct {
}

func (e *AnalyzeIndexExec) open() error {
idxRange := &ranger.NewRange{LowVal: []types.Datum{types.MinNotNullDatum()}, HighVal: []types.Datum{types.MaxValueDatum()}}
var builder distsql.RequestBuilder
kvReq, err := builder.SetIndexRanges(e.ctx.GetSessionVars().StmtCtx, e.tblInfo.ID, e.idxInfo.ID, []*ranger.NewRange{idxRange}).
kvReq, err := builder.SetIndexRanges(e.ctx.GetSessionVars().StmtCtx, e.tblInfo.ID, e.idxInfo.ID, ranger.FullNewRange()).
SetAnalyzeRequest(e.analyzePB).
SetKeepOrder(true).
SetPriority(e.priority).
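
The range passed to the ANALYZE request is the substance of this change: the old lower bound of types.MinNotNullDatum() made the index scan skip entries whose key is NULL, so null rows never reached the statistics builder. A minimal sketch of what ranger.FullNewRange() is assumed to return (the real helper lives in util/ranger; the zero types.Datum encodes NULL):

// Sketch only, assuming the helper simply widens the scan to start at NULL.
func FullNewRange() []*NewRange {
	return []*NewRange{{
		LowVal:  []types.Datum{{}},                    // zero Datum is NULL, the smallest possible key
		HighVal: []types.Datum{types.MaxValueDatum()}, // unbounded above
	}}
}
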
41 changes: 41 additions & 0 deletions plan/cbo_test.go
@@ -541,6 +541,47 @@ func (s *testAnalyzeSuite) TestPreparedNullParam(c *C) {
cfg.PreparedPlanCache.Capacity = orgCapacity
}

func (s *testAnalyzeSuite) TestNullCount(c *C) {
defer testleak.AfterTest(c)()
store, dom, err := newStoreWithBootstrap()
c.Assert(err, IsNil)
testKit := testkit.NewTestKit(c, store)
defer func() {
dom.Close()
store.Close()
}()
testKit.MustExec("use test")
testKit.MustExec("drop table if exists t")
testKit.MustExec("create table t (a int, b int, index idx(a))")
testKit.MustExec("insert into t values (null, null), (null, null)")
testKit.MustExec("analyze table t")
testKit.MustQuery("explain select * from t where a is null").Check(testkit.Rows(
"TableScan_5 Selection_6 cop table:t, range:[-inf,+inf], keep order:false 2.00",
"Selection_6 TableScan_5 cop isnull(test.t.a) 2.00",
"TableReader_7 root data:Selection_6 2.00",
))
testKit.MustQuery("explain select * from t use index(idx) where a is null").Check(testkit.Rows(
"IndexScan_5 cop table:t, index:a, range:[<nil>,<nil>], keep order:false 2.00",
"TableScan_6 cop table:t, keep order:false 2.00",
"IndexLookUp_7 root index:IndexScan_5, table:TableScan_6 2.00",
))
h := dom.StatsHandle()
h.Clear()
h.Lease = 1
defer func() { h.Lease = 0 }()
c.Assert(h.Update(dom.InfoSchema()), IsNil)
testKit.MustQuery("explain select * from t where b = 1").Check(testkit.Rows(
"TableScan_5 Selection_6 cop table:t, range:[-inf,+inf], keep order:false 2.00",
"Selection_6 TableScan_5 cop eq(test.t.b, 1) 0.00",
"TableReader_7 root data:Selection_6 0.00",
))
testKit.MustQuery("explain select * from t where b < 1").Check(testkit.Rows(
"TableScan_5 Selection_6 cop table:t, range:[-inf,+inf], keep order:false 2.00",
"Selection_6 TableScan_5 cop lt(test.t.b, 1) 0.00",
"TableReader_7 root data:Selection_6 0.00",
))
}

func newStoreWithBootstrap() (kv.Storage, *domain.Domain, error) {
store, err := mockstore.NewMockTikvStore()
if err != nil {
2 changes: 1 addition & 1 deletion statistics/boostrap.go
@@ -111,7 +111,7 @@ func initStatsHistograms4Chunk(is infoschema.InfoSchema, tables statsCache, iter
continue
}
hist := NewHistogram(id, ndv, nullCount, version, &colInfo.FieldType, 0, totColSize)
table.Columns[hist.ID] = &Column{Histogram: *hist, Info: colInfo}
table.Columns[hist.ID] = &Column{Histogram: *hist, Info: colInfo, Count: nullCount}
}
}
}
2 changes: 1 addition & 1 deletion statistics/ddl_test.go
@@ -107,7 +107,7 @@ func (s *testStatsCacheSuite) TestDDLHistogram(c *C) {
statsTbl := do.StatsHandle().GetTableStats(tableInfo)
c.Assert(statsTbl.Pseudo, IsFalse)
sc := new(stmtctx.StatementContext)
c.Assert(statsTbl.ColumnIsInvalid(sc, tableInfo.Columns[2].ID), IsTrue)
c.Check(statsTbl.Columns[tableInfo.Columns[2].ID].NullCount, Equals, int64(2))
c.Check(statsTbl.Columns[tableInfo.Columns[2].ID].NDV, Equals, int64(0))

testKit.MustExec("alter table t add column c3 int NOT NULL")
19 changes: 15 additions & 4 deletions statistics/histogram.go
@@ -358,6 +358,10 @@ func (hg *Histogram) greaterAndEqRowCount(value types.Datum) float64 {

// lessRowCount estimates the row count where the column less than value.
func (hg *Histogram) lessRowCount(value types.Datum) float64 {
// all the values are null
if hg.Bounds == nil {
return 0
}
index, match := hg.Bounds.LowerBound(0, &value)
if index == hg.Bounds.NumRows() {
return hg.totalRowCount()
@@ -389,17 +393,17 @@ func (hg *Histogram) betweenRowCount(a, b types.Datum) float64 {
lessCountB := hg.lessRowCount(b)
// If lessCountA is not less than lessCountB, it may be that they fall to the same bucket and we cannot estimate
// the fraction, so we use `totalCount / NDV` to estimate the row count, but the result should not greater than lessCountB.
if lessCountA >= lessCountB {
if lessCountA >= lessCountB && hg.NDV > 0 {
return math.Min(lessCountB, hg.totalRowCount()/float64(hg.NDV))
}
return lessCountB - lessCountA
}

func (hg *Histogram) totalRowCount() float64 {
if hg.Len() == 0 {
return 0
return float64(hg.NullCount)
}
return float64(hg.Buckets[hg.Len()-1].Count)
return float64(hg.Buckets[hg.Len()-1].Count + hg.NullCount)
}

// mergeBuckets is used to merge every two neighbor buckets.
@@ -425,7 +429,7 @@ func (hg *Histogram) mergeBuckets(bucketIdx int) {

// getIncreaseFactor will return a factor of data increasing after the last analysis.
func (hg *Histogram) getIncreaseFactor(totalCount int64) float64 {
columnCount := int64(hg.totalRowCount()) + hg.NullCount
columnCount := int64(hg.totalRowCount())
if columnCount == 0 {
// avoid dividing by 0
return 1.0
@@ -615,10 +619,17 @@ func (c *Column) String() string {
}

func (c *Column) equalRowCount(sc *stmtctx.StatementContext, val types.Datum) (float64, error) {
if val.IsNull() {
return float64(c.NullCount), nil
}
if c.CMSketch != nil {
count, err := c.CMSketch.queryValue(sc, val)
return float64(count), errors.Trace(err)
}
// all the values are null
if c.Histogram.Bounds == nil {
return 0.0, nil
}
return c.Histogram.equalRowCount(val), nil
}

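
Taken together, the histogram changes make the estimates null-aware: the total row count now includes NullCount, `col is null` is answered directly from NullCount, and any non-NULL predicate on an all-NULL column estimates zero rows. A self-contained sketch of those rules (simplified types, not the TiDB ones), checked against the two-row all-NULL table used in TestNullCount:

package main

import "fmt"

// histogram is a toy model of statistics.Histogram for this PR's counting rules.
type histogram struct {
	nullCount int64
	buckets   []int64 // cumulative row counts, like Buckets[i].Count
}

// totalRowCount includes NULLs; with no buckets it is just the NULL count.
func (h histogram) totalRowCount() float64 {
	if len(h.buckets) == 0 {
		return float64(h.nullCount)
	}
	return float64(h.buckets[len(h.buckets)-1] + h.nullCount)
}

// equalRowCount answers `col = v` and `col is null`; histEstimate stands in for
// the real histogram or CM sketch lookup used when buckets exist.
func (h histogram) equalRowCount(isNull bool, histEstimate float64) float64 {
	if isNull {
		return float64(h.nullCount)
	}
	if len(h.buckets) == 0 { // every analyzed value was NULL
		return 0
	}
	return histEstimate
}

func main() {
	h := histogram{nullCount: 2} // table t: two (NULL, NULL) rows
	fmt.Println(h.totalRowCount())         // 2, matching the 2.00 scan estimates
	fmt.Println(h.equalRowCount(true, 0))  // 2, matching `a is null`
	fmt.Println(h.equalRowCount(false, 0)) // 0, matching `b = 1`
}
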
4 changes: 2 additions & 2 deletions statistics/table.go
@@ -131,7 +131,7 @@ func (h *Handle) columnStatsFromStorage(row types.Row, table *Table, tableInfo *
col = &Column{
Histogram: Histogram{ID: histID, NDV: distinct, NullCount: nullCount, tp: &colInfo.FieldType, LastUpdateVersion: histVer, TotColSize: totColSize},
Info: colInfo,
Count: count}
Count: count + nullCount}
break
}
if col == nil || col.LastUpdateVersion < histVer || loadAll {
@@ -252,7 +252,7 @@ func (t *Table) ColumnIsInvalid(sc *stmtctx.StatementContext, colID int64) bool
sc.SetHistogramsNotLoad()
histogramNeededColumns.insert(tableColumnID{tableID: t.TableID, columnID: colID})
}
return !ok || col.Len() == 0
return !ok || col.totalRowCount() == 0 || (col.NDV > 0 && col.Len() == 0)
}

// ColumnGreaterRowCount estimates the row count where the column greater than value.
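
The ColumnIsInvalid change is what lets the planner use these stats at all: a column whose values are all NULL has an empty histogram (Len() == 0) but a usable NullCount, so it should no longer be treated as invalid. A hedged restatement of the new condition with illustrative names (not the TiDB signature):

// Stats are unusable when they are missing, when the column truly has no rows,
// or when distinct non-NULL values exist but the histogram buckets are not loaded yet.
func columnStatsInvalid(found bool, totalRowCount float64, ndv int64, bucketLen int) bool {
	return !found || totalRowCount == 0 || (ndv > 0 && bucketLen == 0)
}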