From c4dc5195723e5d8ddbf029f0a7eff216ba615258 Mon Sep 17 00:00:00 2001
From: Haibin Xie <xiehaibin@pingcap.com>
Date: Mon, 2 Apr 2018 17:18:50 +0800
Subject: [PATCH 1/2] stats: fix row count estimation for null

---
 executor/analyze.go     |  4 +---
 plan/cbo_test.go        | 26 ++++++++++++++++++++++++++
 statistics/boostrap.go  |  2 +-
 statistics/ddl_test.go  |  2 +-
 statistics/histogram.go |  9 ++++++---
 statistics/table.go     |  4 ++--
 6 files changed, 37 insertions(+), 10 deletions(-)
diff --git a/executor/analyze.go b/executor/analyze.go
index acfaba16b5e0a..cdfb05639164f 100644
--- a/executor/analyze.go
+++ b/executor/analyze.go
@@ -27,7 +27,6 @@ import (
 	"github.com/pingcap/tidb/sessionctx/variable"
 	"github.com/pingcap/tidb/statistics"
 	"github.com/pingcap/tidb/tablecodec"
-	"github.com/pingcap/tidb/types"
 	"github.com/pingcap/tidb/util/chunk"
 	"github.com/pingcap/tidb/util/ranger"
 	tipb "github.com/pingcap/tipb/go-tipb"
@@ -176,9 +175,8 @@ type AnalyzeIndexExec struct {
 }
 
 func (e *AnalyzeIndexExec) open() error {
-	idxRange := &ranger.NewRange{LowVal: []types.Datum{types.MinNotNullDatum()}, HighVal: []types.Datum{types.MaxValueDatum()}}
 	var builder distsql.RequestBuilder
-	kvReq, err := builder.SetIndexRanges(e.ctx.GetSessionVars().StmtCtx, e.tblInfo.ID, e.idxInfo.ID, []*ranger.NewRange{idxRange}).
+	kvReq, err := builder.SetIndexRanges(e.ctx.GetSessionVars().StmtCtx, e.tblInfo.ID, e.idxInfo.ID, ranger.FullNewRange()).
 		SetAnalyzeRequest(e.analyzePB).
 		SetKeepOrder(true).
 		SetPriority(e.priority).
diff --git a/plan/cbo_test.go b/plan/cbo_test.go
index 2e89071d9abbb..d52d9b2dbfbb8 100644
--- a/plan/cbo_test.go
+++ b/plan/cbo_test.go
@@ -541,6 +541,32 @@ func (s *testAnalyzeSuite) TestPreparedNullParam(c *C) {
 	cfg.PreparedPlanCache.Capacity = orgCapacity
 }
 
+func (s *testAnalyzeSuite) TestNullCount(c *C) {
+	defer testleak.AfterTest(c)()
+	store, dom, err := newStoreWithBootstrap()
+	c.Assert(err, IsNil)
+	testKit := testkit.NewTestKit(c, store)
+	defer func() {
+		dom.Close()
+		store.Close()
+	}()
+	testKit.MustExec("use test")
+	testKit.MustExec("drop table if exists t")
+	testKit.MustExec("create table t (a int, b int, index idx(a))")
+	testKit.MustExec("insert into t values (null, 1), (null, 1)")
+	testKit.MustExec("analyze table t")
+	testKit.MustQuery("explain select * from t where a is null").Check(testkit.Rows(
+		"TableScan_5 Selection_6  cop table:t, range:[-inf,+inf], keep order:false 2.00",
+		"Selection_6  TableScan_5 cop isnull(test.t.a) 2.00",
+		"TableReader_7   root data:Selection_6 2.00",
+	))
+	testKit.MustQuery("explain select * from t use index(idx) where a is null").Check(testkit.Rows(
+		"IndexScan_5   cop table:t, index:a, range:[<nil>,<nil>], keep order:false 2.00",
+		"TableScan_6   cop table:t, keep order:false 2.00",
+		"IndexLookUp_7   root index:IndexScan_5, table:TableScan_6 2.00",
+	))
+}
+
 func newStoreWithBootstrap() (kv.Storage, *domain.Domain, error) {
 	store, err := mockstore.NewMockTikvStore()
 	if err != nil {
diff --git a/statistics/boostrap.go b/statistics/boostrap.go
index 8df521565221e..457492ddb156a 100644
--- a/statistics/boostrap.go
+++ b/statistics/boostrap.go
@@ -111,7 +111,7 @@ func initStatsHistograms4Chunk(is infoschema.InfoSchema, tables statsCache, iter
 				continue
 			}
 			hist := NewHistogram(id, ndv, nullCount, version, &colInfo.FieldType, 0, totColSize)
-			table.Columns[hist.ID] = &Column{Histogram: *hist, Info: colInfo}
+			table.Columns[hist.ID] = &Column{Histogram: *hist, Info: colInfo, Count: nullCount}
 		}
 	}
 }
diff --git a/statistics/ddl_test.go b/statistics/ddl_test.go
index 46ee0f6ed12a1..50a17b55671c3 100644
--- a/statistics/ddl_test.go
+++ b/statistics/ddl_test.go
@@ -107,7 +107,7 @@ func (s *testStatsCacheSuite) TestDDLHistogram(c *C) {
 	statsTbl := do.StatsHandle().GetTableStats(tableInfo.ID)
 	c.Assert(statsTbl.Pseudo, IsFalse)
 	sc := new(stmtctx.StatementContext)
-	c.Assert(statsTbl.ColumnIsInvalid(sc, tableInfo.Columns[2].ID), IsTrue)
+	c.Check(statsTbl.Columns[tableInfo.Columns[2].ID].NullCount, Equals, int64(2))
 	c.Check(statsTbl.Columns[tableInfo.Columns[2].ID].NDV, Equals, int64(0))
 
 	testKit.MustExec("alter table t add column c3 int NOT NULL")
diff --git a/statistics/histogram.go b/statistics/histogram.go
index d2fe37bfc263b..23b2b86b02c22 100644
--- a/statistics/histogram.go
+++ b/statistics/histogram.go
@@ -397,9 +397,9 @@ func (hg *Histogram) betweenRowCount(a, b types.Datum) float64 {
 
 func (hg *Histogram) totalRowCount() float64 {
 	if hg.Len() == 0 {
-		return 0
+		return float64(hg.NullCount)
 	}
-	return float64(hg.Buckets[hg.Len()-1].Count)
+	return float64(hg.Buckets[hg.Len()-1].Count + hg.NullCount)
 }
 
 // mergeBuckets is used to merge every two neighbor buckets.
@@ -425,7 +425,7 @@ func (hg *Histogram) mergeBuckets(bucketIdx int) {
 
 // getIncreaseFactor will return a factor of data increasing after the last analysis.
 func (hg *Histogram) getIncreaseFactor(totalCount int64) float64 {
-	columnCount := int64(hg.totalRowCount()) + hg.NullCount
+	columnCount := int64(hg.totalRowCount())
 	if columnCount == 0 {
 		// avoid dividing by 0
 		return 1.0
@@ -615,6 +615,9 @@ func (c *Column) String() string {
 }
 
 func (c *Column) equalRowCount(sc *stmtctx.StatementContext, val types.Datum) (float64, error) {
+	if val.IsNull() {
+		return float64(c.NullCount), nil
+	}
 	if c.CMSketch != nil {
 		count, err := c.CMSketch.queryValue(sc, val)
 		return float64(count), errors.Trace(err)
diff --git a/statistics/table.go b/statistics/table.go
index 8e48d10b28ddf..9943681f53a63 100644
--- a/statistics/table.go
+++ b/statistics/table.go
@@ -131,7 +131,7 @@ func (h *Handle) columnStatsFromStorage(row types.Row, table *Table, tableInfo *
 			col = &Column{
 				Histogram: Histogram{ID: histID, NDV: distinct, NullCount: nullCount, tp: &colInfo.FieldType, LastUpdateVersion: histVer, TotColSize: totColSize},
 				Info:      colInfo,
-				Count:     count}
+				Count:     count + nullCount}
 			break
 		}
 		if col == nil || col.LastUpdateVersion < histVer || loadAll {
@@ -252,7 +252,7 @@ func (t *Table) ColumnIsInvalid(sc *stmtctx.StatementContext, colID int64) bool
 		sc.SetHistogramsNotLoad()
 		histogramNeededColumns.insert(tableColumnID{tableID: t.TableID, columnID: colID})
 	}
-	return !ok || col.Len() == 0
+	return !ok || (col.NDV > 0 && col.Len() == 0)
 }
 
 // ColumnGreaterRowCount estimates the row count where the column greater than value.

From a59a7369a3757fe202102a3150183609a3187b6d Mon Sep 17 00:00:00 2001
From: Haibin Xie <xiehaibin@pingcap.com>
Date: Tue, 3 Apr 2018 15:40:16 +0800
Subject: [PATCH 2/2] fix ci

---
 plan/cbo_test.go        | 17 ++++++++++++++++-
 statistics/histogram.go | 10 +++++++++-
 statistics/table.go     |  2 +-
 3 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/plan/cbo_test.go b/plan/cbo_test.go
index d52d9b2dbfbb8..63ccd92dbf5e0 100644
--- a/plan/cbo_test.go
+++ b/plan/cbo_test.go
@@ -553,7 +553,7 @@ func (s *testAnalyzeSuite) TestNullCount(c *C) {
 	testKit.MustExec("use test")
 	testKit.MustExec("drop table if exists t")
 	testKit.MustExec("create table t (a int, b int, index idx(a))")
-	testKit.MustExec("insert into t values (null, 1), (null, 1)")
+	testKit.MustExec("insert into t values (null, null), (null, null)")
 	testKit.MustExec("analyze table t")
 	testKit.MustQuery("explain select * from t where a is null").Check(testkit.Rows(
 		"TableScan_5 Selection_6  cop table:t, range:[-inf,+inf], keep order:false 2.00",
@@ -565,6 +565,21 @@ func (s *testAnalyzeSuite) TestNullCount(c *C) {
 		"TableScan_6   cop table:t, keep order:false 2.00",
 		"IndexLookUp_7   root index:IndexScan_5, table:TableScan_6 2.00",
 	))
+	h := dom.StatsHandle()
+	h.Clear()
+	h.Lease = 1
+	defer func() { h.Lease = 0 }()
+	c.Assert(h.Update(dom.InfoSchema()), IsNil)
+	testKit.MustQuery("explain select * from t where b = 1").Check(testkit.Rows(
+		"TableScan_5 Selection_6  cop table:t, range:[-inf,+inf], keep order:false 2.00",
+		"Selection_6  TableScan_5 cop eq(test.t.b, 1) 0.00",
+		"TableReader_7   root data:Selection_6 0.00",
+	))
+	testKit.MustQuery("explain select * from t where b < 1").Check(testkit.Rows(
+		"TableScan_5 Selection_6  cop table:t, range:[-inf,+inf], keep order:false 2.00",
+		"Selection_6  TableScan_5 cop lt(test.t.b, 1) 0.00",
+		"TableReader_7   root data:Selection_6 0.00",
+	))
 }
 
 func newStoreWithBootstrap() (kv.Storage, *domain.Domain, error) {
diff --git a/statistics/histogram.go b/statistics/histogram.go
index 23b2b86b02c22..7047f8eb2c9a1 100644
--- a/statistics/histogram.go
+++ b/statistics/histogram.go
@@ -358,6 +358,10 @@ func (hg *Histogram) greaterAndEqRowCount(value types.Datum) float64 {
 
 // lessRowCount estimates the row count where the column less than value.
 func (hg *Histogram) lessRowCount(value types.Datum) float64 {
+	// All the values are null.
+	if hg.Bounds == nil {
+		return 0
+	}
 	index, match := hg.Bounds.LowerBound(0, &value)
 	if index == hg.Bounds.NumRows() {
 		return hg.totalRowCount()
@@ -389,7 +393,7 @@ func (hg *Histogram) betweenRowCount(a, b types.Datum) float64 {
 	lessCountB := hg.lessRowCount(b)
 	// If lessCountA is not less than lessCountB, it may be that they fall to the same bucket and we cannot estimate
 	// the fraction, so we use `totalCount / NDV` to estimate the row count, but the result should not greater than lessCountB.
-	if lessCountA >= lessCountB {
+	if lessCountA >= lessCountB && hg.NDV > 0 {
 		return math.Min(lessCountB, hg.totalRowCount()/float64(hg.NDV))
 	}
 	return lessCountB - lessCountA
@@ -622,6 +626,10 @@ func (c *Column) equalRowCount(sc *stmtctx.StatementContext, val types.Datum) (f
 		count, err := c.CMSketch.queryValue(sc, val)
 		return float64(count), errors.Trace(err)
 	}
+	// All the values are null.
+	if c.Histogram.Bounds == nil {
+		return 0.0, nil
+	}
 	return c.Histogram.equalRowCount(val), nil
 }
 
diff --git a/statistics/table.go b/statistics/table.go
index dca0458eb6d6a..9dccb6a9f287f 100644
--- a/statistics/table.go
+++ b/statistics/table.go
@@ -252,7 +252,7 @@ func (t *Table) ColumnIsInvalid(sc *stmtctx.StatementContext, colID int64) bool
 		sc.SetHistogramsNotLoad()
 		histogramNeededColumns.insert(tableColumnID{tableID: t.TableID, columnID: colID})
 	}
-	return !ok || (col.NDV > 0 && col.Len() == 0)
+	return !ok || col.totalRowCount() == 0 || (col.NDV > 0 && col.Len() == 0)
 }
 
 // ColumnGreaterRowCount estimates the row count where the column greater than value.