From c4dc5195723e5d8ddbf029f0a7eff216ba615258 Mon Sep 17 00:00:00 2001 From: Haibin Xie Date: Mon, 2 Apr 2018 17:18:50 +0800 Subject: [PATCH 1/2] stats: fix row count estimation for null --- executor/analyze.go | 4 +--- plan/cbo_test.go | 26 ++++++++++++++++++++++++++ statistics/boostrap.go | 2 +- statistics/ddl_test.go | 2 +- statistics/histogram.go | 9 ++++++--- statistics/table.go | 4 ++-- 6 files changed, 37 insertions(+), 10 deletions(-) diff --git a/executor/analyze.go b/executor/analyze.go index acfaba16b5e0a..cdfb05639164f 100644 --- a/executor/analyze.go +++ b/executor/analyze.go @@ -27,7 +27,6 @@ import ( "github.com/pingcap/tidb/sessionctx/variable" "github.com/pingcap/tidb/statistics" "github.com/pingcap/tidb/tablecodec" - "github.com/pingcap/tidb/types" "github.com/pingcap/tidb/util/chunk" "github.com/pingcap/tidb/util/ranger" tipb "github.com/pingcap/tipb/go-tipb" @@ -176,9 +175,8 @@ type AnalyzeIndexExec struct { } func (e *AnalyzeIndexExec) open() error { - idxRange := &ranger.NewRange{LowVal: []types.Datum{types.MinNotNullDatum()}, HighVal: []types.Datum{types.MaxValueDatum()}} var builder distsql.RequestBuilder - kvReq, err := builder.SetIndexRanges(e.ctx.GetSessionVars().StmtCtx, e.tblInfo.ID, e.idxInfo.ID, []*ranger.NewRange{idxRange}). + kvReq, err := builder.SetIndexRanges(e.ctx.GetSessionVars().StmtCtx, e.tblInfo.ID, e.idxInfo.ID, ranger.FullNewRange()). SetAnalyzeRequest(e.analyzePB). SetKeepOrder(true). SetPriority(e.priority). 
diff --git a/plan/cbo_test.go b/plan/cbo_test.go index 2e89071d9abbb..d52d9b2dbfbb8 100644 --- a/plan/cbo_test.go +++ b/plan/cbo_test.go @@ -541,6 +541,32 @@ func (s *testAnalyzeSuite) TestPreparedNullParam(c *C) { cfg.PreparedPlanCache.Capacity = orgCapacity } +func (s *testAnalyzeSuite) TestNullCount(c *C) { + defer testleak.AfterTest(c)() + store, dom, err := newStoreWithBootstrap() + c.Assert(err, IsNil) + testKit := testkit.NewTestKit(c, store) + defer func() { + dom.Close() + store.Close() + }() + testKit.MustExec("use test") + testKit.MustExec("drop table if exists t") + testKit.MustExec("create table t (a int, b int, index idx(a))") + testKit.MustExec("insert into t values (null, 1), (null, 1)") + testKit.MustExec("analyze table t") + testKit.MustQuery("explain select * from t where a is null").Check(testkit.Rows( + "TableScan_5 Selection_6 cop table:t, range:[-inf,+inf], keep order:false 2.00", + "Selection_6 TableScan_5 cop isnull(test.t.a) 2.00", + "TableReader_7 root data:Selection_6 2.00", + )) + testKit.MustQuery("explain select * from t use index(idx) where a is null").Check(testkit.Rows( + "IndexScan_5 cop table:t, index:a, range:[,], keep order:false 2.00", + "TableScan_6 cop table:t, keep order:false 2.00", + "IndexLookUp_7 root index:IndexScan_5, table:TableScan_6 2.00", + )) +} + func newStoreWithBootstrap() (kv.Storage, *domain.Domain, error) { store, err := mockstore.NewMockTikvStore() if err != nil { diff --git a/statistics/boostrap.go b/statistics/boostrap.go index 8df521565221e..457492ddb156a 100644 --- a/statistics/boostrap.go +++ b/statistics/boostrap.go @@ -111,7 +111,7 @@ func initStatsHistograms4Chunk(is infoschema.InfoSchema, tables statsCache, iter continue } hist := NewHistogram(id, ndv, nullCount, version, &colInfo.FieldType, 0, totColSize) - table.Columns[hist.ID] = &Column{Histogram: *hist, Info: colInfo} + table.Columns[hist.ID] = &Column{Histogram: *hist, Info: colInfo, Count: nullCount} } } } diff --git 
a/statistics/ddl_test.go b/statistics/ddl_test.go index 46ee0f6ed12a1..50a17b55671c3 100644 --- a/statistics/ddl_test.go +++ b/statistics/ddl_test.go @@ -107,7 +107,7 @@ func (s *testStatsCacheSuite) TestDDLHistogram(c *C) { statsTbl := do.StatsHandle().GetTableStats(tableInfo.ID) c.Assert(statsTbl.Pseudo, IsFalse) sc := new(stmtctx.StatementContext) - c.Assert(statsTbl.ColumnIsInvalid(sc, tableInfo.Columns[2].ID), IsTrue) + c.Check(statsTbl.Columns[tableInfo.Columns[2].ID].NullCount, Equals, int64(2)) c.Check(statsTbl.Columns[tableInfo.Columns[2].ID].NDV, Equals, int64(0)) testKit.MustExec("alter table t add column c3 int NOT NULL") diff --git a/statistics/histogram.go b/statistics/histogram.go index d2fe37bfc263b..23b2b86b02c22 100644 --- a/statistics/histogram.go +++ b/statistics/histogram.go @@ -397,9 +397,9 @@ func (hg *Histogram) betweenRowCount(a, b types.Datum) float64 { func (hg *Histogram) totalRowCount() float64 { if hg.Len() == 0 { - return 0 + return float64(hg.NullCount) } - return float64(hg.Buckets[hg.Len()-1].Count) + return float64(hg.Buckets[hg.Len()-1].Count + hg.NullCount) } // mergeBuckets is used to merge every two neighbor buckets. @@ -425,7 +425,7 @@ func (hg *Histogram) mergeBuckets(bucketIdx int) { // getIncreaseFactor will return a factor of data increasing after the last analysis. 
func (hg *Histogram) getIncreaseFactor(totalCount int64) float64 { - columnCount := int64(hg.totalRowCount()) + hg.NullCount + columnCount := int64(hg.totalRowCount()) if columnCount == 0 { // avoid dividing by 0 return 1.0 @@ -615,6 +615,9 @@ func (c *Column) String() string { } func (c *Column) equalRowCount(sc *stmtctx.StatementContext, val types.Datum) (float64, error) { + if val.IsNull() { + return float64(c.NullCount), nil + } if c.CMSketch != nil { count, err := c.CMSketch.queryValue(sc, val) return float64(count), errors.Trace(err) diff --git a/statistics/table.go b/statistics/table.go index 8e48d10b28ddf..9943681f53a63 100644 --- a/statistics/table.go +++ b/statistics/table.go @@ -131,7 +131,7 @@ func (h *Handle) columnStatsFromStorage(row types.Row, table *Table, tableInfo * col = &Column{ Histogram: Histogram{ID: histID, NDV: distinct, NullCount: nullCount, tp: &colInfo.FieldType, LastUpdateVersion: histVer, TotColSize: totColSize}, Info: colInfo, - Count: count} + Count: count + nullCount} break } if col == nil || col.LastUpdateVersion < histVer || loadAll { @@ -252,7 +252,7 @@ func (t *Table) ColumnIsInvalid(sc *stmtctx.StatementContext, colID int64) bool sc.SetHistogramsNotLoad() histogramNeededColumns.insert(tableColumnID{tableID: t.TableID, columnID: colID}) } - return !ok || col.Len() == 0 + return !ok || (col.NDV > 0 && col.Len() == 0) } // ColumnGreaterRowCount estimates the row count where the column greater than value. 
From a59a7369a3757fe202102a3150183609a3187b6d Mon Sep 17 00:00:00 2001 From: Haibin Xie Date: Tue, 3 Apr 2018 15:40:16 +0800 Subject: [PATCH 2/2] fix ci --- plan/cbo_test.go | 17 ++++++++++++++++- statistics/histogram.go | 10 +++++++++- statistics/table.go | 2 +- 3 files changed, 26 insertions(+), 3 deletions(-) diff --git a/plan/cbo_test.go b/plan/cbo_test.go index d52d9b2dbfbb8..63ccd92dbf5e0 100644 --- a/plan/cbo_test.go +++ b/plan/cbo_test.go @@ -553,7 +553,7 @@ func (s *testAnalyzeSuite) TestNullCount(c *C) { testKit.MustExec("use test") testKit.MustExec("drop table if exists t") testKit.MustExec("create table t (a int, b int, index idx(a))") - testKit.MustExec("insert into t values (null, 1), (null, 1)") + testKit.MustExec("insert into t values (null, null), (null, null)") testKit.MustExec("analyze table t") testKit.MustQuery("explain select * from t where a is null").Check(testkit.Rows( "TableScan_5 Selection_6 cop table:t, range:[-inf,+inf], keep order:false 2.00", @@ -565,6 +565,21 @@ func (s *testAnalyzeSuite) TestNullCount(c *C) { "TableScan_6 cop table:t, keep order:false 2.00", "IndexLookUp_7 root index:IndexScan_5, table:TableScan_6 2.00", )) + h := dom.StatsHandle() + h.Clear() + h.Lease = 1 + defer func() { h.Lease = 0 }() + c.Assert(h.Update(dom.InfoSchema()), IsNil) + testKit.MustQuery("explain select * from t where b = 1").Check(testkit.Rows( + "TableScan_5 Selection_6 cop table:t, range:[-inf,+inf], keep order:false 2.00", + "Selection_6 TableScan_5 cop eq(test.t.b, 1) 0.00", + "TableReader_7 root data:Selection_6 0.00", + )) + testKit.MustQuery("explain select * from t where b < 1").Check(testkit.Rows( + "TableScan_5 Selection_6 cop table:t, range:[-inf,+inf], keep order:false 2.00", + "Selection_6 TableScan_5 cop lt(test.t.b, 1) 0.00", + "TableReader_7 root data:Selection_6 0.00", + )) } func newStoreWithBootstrap() (kv.Storage, *domain.Domain, error) { diff --git a/statistics/histogram.go b/statistics/histogram.go index 
23b2b86b02c22..7047f8eb2c9a1 100644 --- a/statistics/histogram.go +++ b/statistics/histogram.go @@ -358,6 +358,10 @@ func (hg *Histogram) greaterAndEqRowCount(value types.Datum) float64 { // lessRowCount estimates the row count where the column less than value. func (hg *Histogram) lessRowCount(value types.Datum) float64 { + // all the values are null + if hg.Bounds == nil { + return 0 + } index, match := hg.Bounds.LowerBound(0, &value) if index == hg.Bounds.NumRows() { return hg.totalRowCount() @@ -389,7 +393,7 @@ func (hg *Histogram) betweenRowCount(a, b types.Datum) float64 { lessCountB := hg.lessRowCount(b) // If lessCountA is not less than lessCountB, it may be that they fall to the same bucket and we cannot estimate // the fraction, so we use `totalCount / NDV` to estimate the row count, but the result should not greater than lessCountB. - if lessCountA >= lessCountB { + if lessCountA >= lessCountB && hg.NDV > 0 { return math.Min(lessCountB, hg.totalRowCount()/float64(hg.NDV)) } return lessCountB - lessCountA @@ -622,6 +626,10 @@ func (c *Column) equalRowCount(sc *stmtctx.StatementContext, val types.Datum) (f count, err := c.CMSketch.queryValue(sc, val) return float64(count), errors.Trace(err) } + // all the values are null + if c.Histogram.Bounds == nil { + return 0.0, nil + } return c.Histogram.equalRowCount(val), nil } diff --git a/statistics/table.go b/statistics/table.go index dca0458eb6d6a..9dccb6a9f287f 100644 --- a/statistics/table.go +++ b/statistics/table.go @@ -252,7 +252,7 @@ func (t *Table) ColumnIsInvalid(sc *stmtctx.StatementContext, colID int64) bool sc.SetHistogramsNotLoad() histogramNeededColumns.insert(tableColumnID{tableID: t.TableID, columnID: colID}) } - return !ok || (col.NDV > 0 && col.Len() == 0) + return !ok || col.totalRowCount() == 0 || (col.NDV > 0 && col.Len() == 0) } // ColumnGreaterRowCount estimates the row count where the column greater than value.