stats: do not wait for data unchanged when auto analyze (pingcap#7022)

alivxxx · Jul 18, 2018 · 99e13fe · 99e13fe
1 parent 8438bad
commit 99e13fe
Show file tree

Hide file tree

Showing 2 changed files with 51 additions and 21 deletions.
diff --git a/statistics/update.go b/statistics/update.go
@@ -368,28 +368,37 @@ const (
 // AutoAnalyzeMinCnt means if the count of table is less than this value, we needn't do auto analyze.
 var AutoAnalyzeMinCnt int64 = 1000
 
-func needAnalyzeTable(tbl *Table, limit time.Duration, autoAnalyzeRatio float64) bool {
-	if tbl.ModifyCount == 0 || tbl.Count < AutoAnalyzeMinCnt {
-		return false
-	}
-	t := time.Unix(0, oracle.ExtractPhysical(tbl.Version)*int64(time.Millisecond))
-	if time.Since(t) < limit {
-		return false
-	}
-	if autoAnalyzeRatio > 0 && float64(tbl.ModifyCount)/float64(tbl.Count) > autoAnalyzeRatio {
-		return true
-	}
+// tableAnalyzed reports whether any column or index of tbl has a non-empty histogram.
+func tableAnalyzed(tbl *Table) bool {
 	for _, col := range tbl.Columns {
-		if col.Count > 0 {
-			return false
+		if col.Histogram.Len() > 0 {
+			return true
 		}
 	}
 	for _, idx := range tbl.Indices {
-		if idx.Len() > 0 {
-			return false
+		if idx.Histogram.Len() > 0 {
+			return true
 		}
 	}
-	return true
+	return false
+}
+
+// needAnalyzeTable reports whether the table should be auto analyzed:
+// 1. If the table has never been analyzed (see tableAnalyzed), analyze it once
+//    the timestamp extracted from tbl.Version is at least limit in the past.
+// 2. If the table had been analyzed before, analyze it only when
+//    "tbl.ModifyCount/tbl.Count > autoAnalyzeRatio".
+func needAnalyzeTable(tbl *Table, limit time.Duration, autoAnalyzeRatio float64) bool {
+	analyzed := tableAnalyzed(tbl)
+	if !analyzed {
+		t := time.Unix(0, oracle.ExtractPhysical(tbl.Version)*int64(time.Millisecond))
+		return time.Since(t) >= limit
+	}
+	// A zero ratio disables auto analyze for tables that were already analyzed.
+	if autoAnalyzeRatio == 0 {
+		return false
+	}
+	return float64(tbl.ModifyCount)/float64(tbl.Count) > autoAnalyzeRatio
 }
 
 const minAutoAnalyzeRatio = 0.3
@@ -422,7 +431,7 @@ func (h *Handle) HandleAutoAnalyze(is infoschema.InfoSchema) error {
 		for _, tbl := range tbls {
 			tblInfo := tbl.Meta()
 			statsTbl := h.GetTableStats(tblInfo)
-			if statsTbl.Pseudo || statsTbl.Count == 0 {
+			if statsTbl.Pseudo || statsTbl.Count < AutoAnalyzeMinCnt {
 				continue
 			}
 			tblName := "`" + db + "`.`" + tblInfo.Name.O + "`"

diff --git a/statistics/update_test.go b/statistics/update_test.go
@@ -18,11 +18,13 @@ import (
 	"strings"
 	"time"
 
+	"github.com/juju/errors"
 	. "github.com/pingcap/check"
 	"github.com/pingcap/tidb/domain"
 	"github.com/pingcap/tidb/kv"
 	"github.com/pingcap/tidb/model"
 	"github.com/pingcap/tidb/mysql"
+	"github.com/pingcap/tidb/sessionctx"
 	"github.com/pingcap/tidb/statistics"
 	"github.com/pingcap/tidb/types"
 	"github.com/pingcap/tidb/util/ranger"
@@ -270,6 +272,20 @@ func (s *testStatsUpdateSuite) TestTxnWithFailure(c *C) {
 	c.Assert(stats1.Count, Equals, int64(rowCount1+1))
 }
 
+// dumpAnalyzeResult drains h.AnalyzeResultCh() and saves every queued histogram to KV, returning the first save error.
+// Tests need it when the stats lease is temporarily made greater than 0, as the analyze executor then only sends to the channel.
+func dumpAnalyzeResult(sctx sessionctx.Context, h *statistics.Handle) error {
+	for len(h.AnalyzeResultCh()) > 0 {
+		t := <-h.AnalyzeResultCh()
+		for i, hg := range t.Hist {
+			if err := statistics.SaveStatsToStorage(sctx, t.TableID, t.Count, t.IsIndex, hg, t.Cms[i]); err != nil {
+				return errors.Trace(err)
+			}
+		}
+	}
+	return nil
+}
+
 func (s *testStatsUpdateSuite) TestAutoUpdate(c *C) {
 	defer cleanEnv(c, s.store, s.do)
 	testKit := testkit.NewTestKit(c, s.store)
@@ -311,12 +327,16 @@ func (s *testStatsUpdateSuite) TestAutoUpdate(c *C) {
 		break
 	}
 
+	// Test that even if the table is recently modified, we can still analyze the table.
+	h.Lease = time.Millisecond
+	defer func() { h.Lease = 0 }()
 	_, err = testKit.Exec("insert into t values ('fff')")
 	c.Assert(err, IsNil)
 	c.Assert(h.DumpStatsDeltaToKV(), IsNil)
 	c.Assert(h.Update(is), IsNil)
 	err = h.HandleAutoAnalyze(is)
 	c.Assert(err, IsNil)
+	c.Assert(dumpAnalyzeResult(testKit.Se, h), IsNil)
 	h.Update(is)
 	stats = h.GetTableStats(tableInfo)
 	c.Assert(stats.Count, Equals, int64(2))
@@ -328,6 +348,7 @@ func (s *testStatsUpdateSuite) TestAutoUpdate(c *C) {
 	c.Assert(h.Update(is), IsNil)
 	err = h.HandleAutoAnalyze(is)
 	c.Assert(err, IsNil)
+	c.Assert(dumpAnalyzeResult(testKit.Se, h), IsNil)
 	h.Update(is)
 	stats = h.GetTableStats(tableInfo)
 	c.Assert(stats.Count, Equals, int64(3))
@@ -336,13 +357,10 @@ func (s *testStatsUpdateSuite) TestAutoUpdate(c *C) {
 	_, err = testKit.Exec("insert into t values ('eee')")
 	c.Assert(err, IsNil)
 	h.DumpStatsDeltaToKV()
-	h.Clear()
-	// We set `Lease` here so that `Update` will use load by need strategy.
-	h.Lease = time.Second
 	h.Update(is)
-	h.Lease = 0
 	err = h.HandleAutoAnalyze(is)
 	c.Assert(err, IsNil)
+	c.Assert(dumpAnalyzeResult(testKit.Se, h), IsNil)
 	h.Update(is)
 	stats = h.GetTableStats(tableInfo)
 	c.Assert(stats.Count, Equals, int64(4))
@@ -354,13 +372,16 @@ func (s *testStatsUpdateSuite) TestAutoUpdate(c *C) {
 		break
 	}
 
+	testKit.MustExec("analyze table t")
+	c.Assert(dumpAnalyzeResult(testKit.Se, h), IsNil)
 	_, err = testKit.Exec("create index idx on t(a)")
 	c.Assert(err, IsNil)
 	is = do.InfoSchema()
 	tbl, err = is.TableByName(model.NewCIStr("test"), model.NewCIStr("t"))
 	c.Assert(err, IsNil)
 	tableInfo = tbl.Meta()
 	h.HandleAutoAnalyze(is)
+	c.Assert(dumpAnalyzeResult(testKit.Se, h), IsNil)
 	h.Update(is)
 	stats = h.GetTableStats(tableInfo)
 	c.Assert(stats.Count, Equals, int64(4))