From ced25893683b22498207d4f73fc61e006d320c20 Mon Sep 17 00:00:00 2001 From: Haibin Xie Date: Sun, 5 May 2019 15:08:54 +0800 Subject: [PATCH 1/4] executor, stats: support incremental analyze with feedback --- executor/analyze.go | 28 +++++++--------- executor/analyze_test.go | 36 +++++++++++++++++++++ executor/builder.go | 46 +++++++++++++++++++------- executor/executor_test.go | 1 + session/bootstrap.go | 10 ++++++ session/session.go | 2 +- statistics/cmsketch.go | 22 +++++++++++++ statistics/handle/bootstrap.go | 16 +++++---- statistics/handle/handle.go | 49 ++++++++++++++++++---------- statistics/handle/update.go | 2 ++ statistics/histogram.go | 59 +++++++++++++--------------------- 11 files changed, 182 insertions(+), 89 deletions(-) diff --git a/executor/analyze.go b/executor/analyze.go index 5c27ba9e17d39..d52a0afeafafe 100644 --- a/executor/analyze.go +++ b/executor/analyze.go @@ -1075,13 +1075,13 @@ func (e *AnalyzeTestFastExec) TestFastSample() error { type analyzeIndexIncrementalExec struct { AnalyzeIndexExec - index *statistics.Index + oldHist *statistics.Histogram + oldCMS *statistics.CMSketch } func analyzeIndexIncremental(idxExec *analyzeIndexIncrementalExec) analyzeResult { - idx := idxExec.index - highBound := idx.Histogram.GetUpper(idx.Len() - 1) - values, err := codec.Decode(highBound.GetBytes(), len(idxExec.idxInfo.Columns)) + startPos := idxExec.oldHist.GetUpper(idxExec.oldHist.Len() - 1) + values, err := codec.DecodeRange(startPos.GetBytes(), len(idxExec.idxInfo.Columns)) if err != nil { return analyzeResult{Err: err, job: idxExec.job} } @@ -1090,16 +1090,12 @@ func analyzeIndexIncremental(idxExec *analyzeIndexIncrementalExec) analyzeResult if err != nil { return analyzeResult{Err: err, job: idxExec.job} } - oldHist, oldCMS, err := idx.RemoveUpperBound(idxExec.ctx.GetSessionVars().StmtCtx, values) + hist, err = statistics.MergeHistograms(idxExec.ctx.GetSessionVars().StmtCtx, idxExec.oldHist, hist, int(idxExec.maxNumBuckets)) if err != nil { return analyzeResult{Err: err, job: idxExec.job} } - hist, err = statistics.MergeHistograms(idxExec.ctx.GetSessionVars().StmtCtx, oldHist, hist, int(idxExec.maxNumBuckets)) - if err != nil { - return analyzeResult{Err: err, job: idxExec.job} - } - if oldCMS != nil && cms != nil { - err = cms.MergeCMSketch(oldCMS) + if idxExec.oldCMS != nil && cms != nil { + err = cms.MergeCMSketch4IncrementalAnalyze(idxExec.oldCMS) if err != nil { return analyzeResult{Err: err, job: idxExec.job} } @@ -1120,26 +1116,24 @@ func analyzeIndexIncremental(idxExec *analyzeIndexIncrementalExec) analyzeResult type analyzePKIncrementalExec struct { AnalyzeColumnsExec - pkStats *statistics.Column + oldHist *statistics.Histogram } func analyzePKIncremental(colExec *analyzePKIncrementalExec) analyzeResult { - pkStats := colExec.pkStats - high := pkStats.GetUpper(pkStats.Len() - 1) var maxVal types.Datum if mysql.HasUnsignedFlag(colExec.pkInfo.Flag) { maxVal = types.NewUintDatum(math.MaxUint64) } else { maxVal = types.NewIntDatum(math.MaxInt64) } - ran := ranger.Range{LowVal: []types.Datum{*high}, LowExclude: true, HighVal: []types.Datum{maxVal}} + startPos := *colExec.oldHist.GetUpper(colExec.oldHist.Len() - 1) + ran := ranger.Range{LowVal: []types.Datum{startPos}, LowExclude: true, HighVal: []types.Datum{maxVal}} hists, _, err := colExec.buildStats([]*ranger.Range{&ran}) if err != nil { return analyzeResult{Err: err, job: colExec.job} } hist := hists[0] - oldHist := pkStats.Histogram.Copy() - hist, err = statistics.MergeHistograms(colExec.ctx.GetSessionVars().StmtCtx, oldHist, hist, int(colExec.maxNumBuckets)) + hist, err = statistics.MergeHistograms(colExec.ctx.GetSessionVars().StmtCtx, colExec.oldHist, hist, int(colExec.maxNumBuckets)) if err != nil { return analyzeResult{Err: err, job: colExec.job} } diff --git a/executor/analyze_test.go b/executor/analyze_test.go index c33880d4f4b90..e4d1225a098b4 100644 --- a/executor/analyze_test.go +++ b/executor/analyze_test.go @@ -24,9 +24,13 @@ import ( "github.com/pingcap/tidb/executor" "github.com/pingcap/tidb/session" "github.com/pingcap/tidb/sessionctx" + "github.com/pingcap/tidb/statistics" + "github.com/pingcap/tidb/statistics/handle" "github.com/pingcap/tidb/store/mockstore" "github.com/pingcap/tidb/store/mockstore/mocktikv" "github.com/pingcap/tidb/table" + "github.com/pingcap/tidb/types" + "github.com/pingcap/tidb/util/codec" "github.com/pingcap/tidb/util/testkit" ) @@ -303,4 +307,36 @@ func (s *testSuite1) TestAnalyzeIncremental(c *C) { tk.MustExec("analyze incremental table t index") // Result should not change. tk.MustQuery("show stats_buckets").Check(testkit.Rows("test t a 0 0 1 1 1 1", "test t a 0 1 2 1 2 2", "test t idx 1 0 1 1 1 1", "test t idx 1 1 2 1 2 2")) + + // Test analyze incremental with feedback. + tk.MustExec("insert into t values (3,3)") + oriProbability := statistics.FeedbackProbability.Load() + defer func() { + statistics.FeedbackProbability.Store(oriProbability) + }() + statistics.FeedbackProbability.Store(1) + is := s.dom.InfoSchema() + table, err := is.TableByName(model.NewCIStr("test"), model.NewCIStr("t")) + c.Assert(err, IsNil) + tblInfo := table.Meta() + tk.MustQuery("select * from t use index(idx) where b = 3") + tk.MustQuery("select * from t where a > 1") + h := s.dom.StatsHandle() + c.Assert(h.DumpStatsDeltaToKV(handle.DumpAll), IsNil) + c.Assert(h.DumpStatsFeedbackToKV(), IsNil) + c.Assert(h.HandleUpdateStats(is), IsNil) + h.Update(is) + tk.MustQuery("show stats_buckets").Check(testkit.Rows("test t a 0 0 1 1 1 1", "test t a 0 1 3 0 2 2147483647", "test t idx 1 0 1 1 1 1", "test t idx 1 1 2 1 2 2")) + tblStats := h.GetTableStats(tblInfo) + val, err := codec.EncodeKey(tk.Se.GetSessionVars().StmtCtx, nil, types.NewIntDatum(3)) + c.Assert(err, IsNil) + c.Assert(tblStats.Indices[tblInfo.Indices[0].ID].CMSketch.QueryBytes(val), Equals, uint64(1)) + c.Assert(statistics.IsAnalyzed(tblStats.Indices[tblInfo.Indices[0].ID].Flag), IsFalse) + c.Assert(statistics.IsAnalyzed(tblStats.Columns[tblInfo.Columns[0].ID].Flag), IsFalse) + + tk.MustExec("analyze incremental table t index") + tk.MustQuery("show stats_buckets").Check(testkit.Rows("test t a 0 0 1 1 1 1", "test t a 0 1 2 1 2 2", "test t a 0 2 3 1 3 3", + "test t idx 1 0 1 1 1 1", "test t idx 1 1 2 1 2 2", "test t idx 1 2 3 1 3 3")) + tblStats = h.GetTableStats(tblInfo) + c.Assert(tblStats.Indices[tblInfo.Indices[0].ID].CMSketch.QueryBytes(val), Equals, uint64(1)) } diff --git a/executor/builder.go b/executor/builder.go index 596b64c72b48c..10b819123fe2f 100644 --- a/executor/builder.go +++ b/executor/builder.go @@ -1385,18 +1385,28 @@ func (b *executorBuilder) buildAnalyzeIndexIncremental(task plannercore.AnalyzeI return analyzeTask } idx, ok := statsTbl.Indices[task.IndexInfo.ID] - // TODO: If the index contains feedback, we may use other strategy. - if !ok || idx.Len() == 0 || idx.ContainsFeedback() { + if !ok || idx.Len() == 0 || idx.LastAnalyzePos.IsNull() { return analyzeTask } - exec := analyzeTask.idxExec - if idx.CMSketch != nil { - width, depth := idx.CMSketch.GetWidthAndDepth() - exec.analyzePB.IdxReq.CmsketchWidth = &width - exec.analyzePB.IdxReq.CmsketchDepth = &depth + var oldHist *statistics.Histogram + if statistics.IsAnalyzed(idx.Flag) { + exec := analyzeTask.idxExec + if idx.CMSketch != nil { + width, depth := idx.CMSketch.GetWidthAndDepth() + exec.analyzePB.IdxReq.CmsketchWidth = &width + exec.analyzePB.IdxReq.CmsketchDepth = &depth + } + oldHist = idx.Histogram.Copy() + } else { + _, bktID := idx.LessRowCountWithBktIdx(idx.LastAnalyzePos) + if bktID == 0 { + return analyzeTask + } + oldHist = idx.TruncateHistogram(bktID) } + oldHist = idx.Histogram.RemoveUpperBound() analyzeTask.taskType = idxIncrementalTask - analyzeTask.idxIncrementalExec = &analyzeIndexIncrementalExec{AnalyzeIndexExec: *analyzeTask.idxExec, index: idx} + analyzeTask.idxIncrementalExec = &analyzeIndexIncrementalExec{AnalyzeIndexExec: *analyzeTask.idxExec, oldHist: oldHist, oldCMS: idx.CMSketch} analyzeTask.job = &statistics.AnalyzeJob{DBName: task.DBName, TableName: task.TableName, PartitionName: task.PartitionName, JobInfo: "analyze incremental index " + task.IndexInfo.Name.O} return analyzeTask } @@ -1445,13 +1455,27 @@ func (b *executorBuilder) buildAnalyzePKIncremental(task plannercore.AnalyzeColu return analyzeTask } col, ok := statsTbl.Columns[task.PKInfo.ID] - // TODO: If the primary key contains feedback, we may use other strategy. - if !ok || col.Len() == 0 || col.ContainsFeedback() { + if !ok || col.Len() == 0 || col.LastAnalyzePos.IsNull() { return analyzeTask } + var oldHist *statistics.Histogram + if statistics.IsAnalyzed(col.Flag) { + oldHist = col.Histogram.Copy() + } else { + d, err := col.LastAnalyzePos.ConvertTo(b.ctx.GetSessionVars().StmtCtx, col.Tp) + if err != nil { + b.err = err + return nil + } + _, bktID := col.LessRowCountWithBktIdx(d) + if bktID == 0 { + return analyzeTask + } + oldHist = col.TruncateHistogram(bktID) + } exec := analyzeTask.colExec analyzeTask.taskType = pkIncrementalTask - analyzeTask.colIncrementalExec = &analyzePKIncrementalExec{AnalyzeColumnsExec: *exec, pkStats: col} + analyzeTask.colIncrementalExec = &analyzePKIncrementalExec{AnalyzeColumnsExec: *exec, oldHist: oldHist} analyzeTask.job = &statistics.AnalyzeJob{DBName: task.DBName, TableName: task.TableName, PartitionName: task.PartitionName, JobInfo: "analyze incremental primary key"} return analyzeTask } diff --git a/executor/executor_test.go b/executor/executor_test.go index 9f24b23c0206a..e9b34d9b6a908 100644 --- a/executor/executor_test.go +++ b/executor/executor_test.go @@ -2507,6 +2507,7 @@ func (s *testSuite1) SetUpSuite(c *C) { c.Assert(err, IsNil) s.dom, err = session.BootstrapSession(s.store) c.Assert(err, IsNil) + s.dom.SetStatsUpdating(true) } func (s *testSuite1) TearDownSuite(c *C) { diff --git a/session/bootstrap.go b/session/bootstrap.go index 34a0503a051a7..94beedfc41ba4 100644 --- a/session/bootstrap.go +++ b/session/bootstrap.go @@ -171,6 +171,7 @@ const ( stats_ver bigint(64) NOT NULL DEFAULT 0, flag bigint(64) NOT NULL DEFAULT 0, correlation double NOT NULL DEFAULT 0, + last_analyze_pos blob DEFAULT NULL, unique index tbl(table_id, is_index, hist_id) );` @@ -328,6 +329,7 @@ const ( version28 = 28 version29 = 29 version30 = 30 + version31 = 31 ) func checkBootstrapped(s Session) (bool, error) { @@ -507,6 +509,10 @@ func upgrade(s Session) { upgradeToVer30(s) } + if ver < version31 { + upgradeToVer31(s) + } + updateBootstrapVer(s) _, err = s.Execute(context.Background(), "COMMIT") @@ -799,6 +805,10 @@ func upgradeToVer30(s Session) { mustExecute(s, CreateStatsTopNTable) } +func upgradeToVer31(s Session) { + doReentrantDDL(s, "ALTER TABLE mysql.stats_histograms ADD COLUMN `last_analyze_pos` blob default null", infoschema.ErrColumnExists) +} + // updateBootstrapVer updates bootstrap version variable in mysql.TiDB table. func updateBootstrapVer(s Session) { // Update bootstrap version. diff --git a/session/session.go b/session/session.go index 1f504b9dd4ddd..653340921600d 100644 --- a/session/session.go +++ b/session/session.go @@ -1558,7 +1558,7 @@ func createSessionWithDomain(store kv.Storage, dom *domain.Domain) (*session, er const ( notBootstrapped = 0 - currentBootstrapVersion = 30 + currentBootstrapVersion = 31 ) func getStoreBootstrapVersion(store kv.Storage) int64 { diff --git a/statistics/cmsketch.go b/statistics/cmsketch.go index 6c58fe47abab9..80510ded742f2 100644 --- a/statistics/cmsketch.go +++ b/statistics/cmsketch.go @@ -298,6 +298,28 @@ func (c *CMSketch) MergeCMSketch(rc *CMSketch) error { return nil } +// MergeCMSketch4IncrementalAnalyze merges two CM Sketch for incremental analyze. +// Since there is no value that appears partially in `c` and `rc`, we can just +// merge them using `max` operations. +func (c *CMSketch) MergeCMSketch4IncrementalAnalyze(rc *CMSketch) error { + if c.depth != rc.depth || c.width != rc.width { + return errors.New("Dimensions of Count-Min Sketch should be the same") + } + if c.topN != nil || rc.topN != nil { + return errors.New("CMSketch with Top-N does not support merge") + } + for i := range c.table { + for j := range c.table[i] { + c.table[i][j] = mathutil.MaxUint32(c.table[i][j], rc.table[i][j]) + } + } + c.count = 0 + for i := range c.table[0] { + c.count += uint64(c.table[0][i]) + } + return nil +} + // CMSketchToProto converts CMSketch to its protobuf representation. func CMSketchToProto(c *CMSketch) *tipb.CMSketch { protoSketch := &tipb.CMSketch{Rows: make([]*tipb.CMSketchRow, c.depth)} diff --git a/statistics/handle/bootstrap.go b/statistics/handle/bootstrap.go index a461bc6d6911d..3a9109454495b 100644 --- a/statistics/handle/bootstrap.go +++ b/statistics/handle/bootstrap.go @@ -109,7 +109,7 @@ func (h *Handle) initStatsHistograms4Chunk(is infoschema.InfoSchema, tables Stat terror.Log(errors.Trace(err)) } hist := statistics.NewHistogram(id, ndv, nullCount, version, types.NewFieldType(mysql.TypeBlob), chunk.InitialCapacity, 0) - table.Indices[hist.ID] = &statistics.Index{Histogram: *hist, CMSketch: cms, Info: idxInfo, StatsVer: row.GetInt64(8)} + table.Indices[hist.ID] = &statistics.Index{Histogram: *hist, CMSketch: cms, Info: idxInfo, StatsVer: row.GetInt64(8), Flag: row.GetInt64(10), LastAnalyzePos: row.GetDatum(11, types.NewFieldType(mysql.TypeBlob))} } else { var colInfo *model.ColumnInfo for _, col := range tbl.Meta().Columns { @@ -124,11 +124,13 @@ func (h *Handle) initStatsHistograms4Chunk(is infoschema.InfoSchema, tables Stat hist := statistics.NewHistogram(id, ndv, nullCount, version, &colInfo.FieldType, 0, totColSize) hist.Correlation = row.GetFloat64(9) table.Columns[hist.ID] = &statistics.Column{ - Histogram: *hist, - PhysicalID: table.PhysicalID, - Info: colInfo, - Count: nullCount, - IsHandle: tbl.Meta().PKIsHandle && mysql.HasPriKeyFlag(colInfo.Flag), + Histogram: *hist, + PhysicalID: table.PhysicalID, + Info: colInfo, + Count: nullCount, + IsHandle: tbl.Meta().PKIsHandle && mysql.HasPriKeyFlag(colInfo.Flag), + Flag: row.GetInt64(10), + LastAnalyzePos: row.GetDatum(11, types.NewFieldType(mysql.TypeBlob)), } } } @@ -137,7 +139,7 @@ func (h *Handle) initStatsHistograms4Chunk(is infoschema.InfoSchema, tables Stat func (h *Handle) initStatsHistograms(is infoschema.InfoSchema, tables StatsCache) error { h.mu.Lock() defer h.mu.Unlock() - sql := "select HIGH_PRIORITY table_id, is_index, hist_id, distinct_count, version, null_count, cm_sketch, tot_col_size, stats_ver, correlation from mysql.stats_histograms" + sql := "select HIGH_PRIORITY table_id, is_index, hist_id, distinct_count, version, null_count, cm_sketch, tot_col_size, stats_ver, correlation, flag, last_analyze_pos from mysql.stats_histograms" rc, err := h.mu.ctx.(sqlexec.SQLExecutor).Execute(context.TODO(), sql) if len(rc) > 0 { defer terror.Call(rc[0].Close) diff --git a/statistics/handle/handle.go b/statistics/handle/handle.go index 8432fdfba2535..91b82c9cb63c5 100644 --- a/statistics/handle/handle.go +++ b/statistics/handle/handle.go @@ -336,7 +336,8 @@ func (h *Handle) indexStatsFromStorage(row chunk.Row, table *statistics.Table, t nullCount := row.GetInt64(5) idx := table.Indices[histID] errorRate := statistics.ErrorRate{} - if statistics.IsAnalyzed(row.GetInt64(8)) { + flag := row.GetInt64(8) + if statistics.IsAnalyzed(flag) { h.mu.Lock() h.mu.rateMap.clear(table.PhysicalID, histID, true) h.mu.Unlock() @@ -356,7 +357,7 @@ func (h *Handle) indexStatsFromStorage(row chunk.Row, table *statistics.Table, t if err != nil { return errors.Trace(err) } - idx = &statistics.Index{Histogram: *hg, CMSketch: cms, Info: idxInfo, ErrorRate: errorRate, StatsVer: row.GetInt64(7)} + idx = &statistics.Index{Histogram: *hg, CMSketch: cms, Info: idxInfo, ErrorRate: errorRate, StatsVer: row.GetInt64(7), Flag: flag, LastAnalyzePos: row.GetDatum(10, types.NewFieldType(mysql.TypeBlob))} } break } @@ -377,7 +378,8 @@ func (h *Handle) columnStatsFromStorage(row chunk.Row, table *statistics.Table, correlation := row.GetFloat64(9) col := table.Columns[histID] errorRate := statistics.ErrorRate{} - if statistics.IsAnalyzed(row.GetInt64(8)) { + flag := row.GetInt64(8) + if statistics.IsAnalyzed(flag) { h.mu.Lock() h.mu.rateMap.clear(table.PhysicalID, histID, false) h.mu.Unlock() @@ -404,12 +406,14 @@ func (h *Handle) columnStatsFromStorage(row chunk.Row, table *statistics.Table, return errors.Trace(err) } col = &statistics.Column{ - PhysicalID: table.PhysicalID, - Histogram: *statistics.NewHistogram(histID, distinct, nullCount, histVer, &colInfo.FieldType, 0, totColSize), - Info: colInfo, - Count: count + nullCount, - ErrorRate: errorRate, - IsHandle: tableInfo.PKIsHandle && mysql.HasPriKeyFlag(colInfo.Flag), + PhysicalID: table.PhysicalID, + Histogram: *statistics.NewHistogram(histID, distinct, nullCount, histVer, &colInfo.FieldType, 0, totColSize), + Info: colInfo, + Count: count + nullCount, + ErrorRate: errorRate, + IsHandle: tableInfo.PKIsHandle && mysql.HasPriKeyFlag(colInfo.Flag), + Flag: flag, + LastAnalyzePos: row.GetDatum(10, types.NewFieldType(mysql.TypeBlob)), } col.Histogram.Correlation = correlation break @@ -424,13 +428,15 @@ func (h *Handle) columnStatsFromStorage(row chunk.Row, table *statistics.Table, return errors.Trace(err) } col = &statistics.Column{ - PhysicalID: table.PhysicalID, - Histogram: *hg, - Info: colInfo, - CMSketch: cms, - Count: int64(hg.TotalRowCount()), - ErrorRate: errorRate, - IsHandle: tableInfo.PKIsHandle && mysql.HasPriKeyFlag(colInfo.Flag), + PhysicalID: table.PhysicalID, + Histogram: *hg, + Info: colInfo, + CMSketch: cms, + Count: int64(hg.TotalRowCount()), + ErrorRate: errorRate, + IsHandle: tableInfo.PKIsHandle && mysql.HasPriKeyFlag(colInfo.Flag), + Flag: flag, + LastAnalyzePos: row.GetDatum(10, types.NewFieldType(mysql.TypeBlob)), } break } @@ -472,7 +478,7 @@ func (h *Handle) tableStatsFromStorage(tableInfo *model.TableInfo, physicalID in table = table.Copy() } table.Pseudo = false - selSQL := fmt.Sprintf("select table_id, is_index, hist_id, distinct_count, version, null_count, tot_col_size, stats_ver, flag, correlation from mysql.stats_histograms where table_id = %d", physicalID) + selSQL := fmt.Sprintf("select table_id, is_index, hist_id, distinct_count, version, null_count, tot_col_size, stats_ver, flag, correlation, last_analyze_pos from mysql.stats_histograms where table_id = %d", physicalID) rows, _, err := h.restrictedExec.ExecRestrictedSQL(nil, selSQL) if err != nil { return nil, errors.Trace(err) @@ -558,6 +564,7 @@ func (h *Handle) SaveStatsToStorage(tableID int64, count int64, isIndex int, hg return } sc := h.mu.ctx.GetSessionVars().StmtCtx + var lastAnalyzePos []byte for i := range hg.Buckets { count := hg.Buckets[i].Count if i > 0 { @@ -568,6 +575,7 @@ func (h *Handle) SaveStatsToStorage(tableID int64, count int64, isIndex int, hg if err != nil { return } + lastAnalyzePos = upperBound.GetBytes() var lowerBound types.Datum lowerBound, err = hg.GetLower(i).ConvertTo(sc, types.NewFieldType(mysql.TypeBlob)) if err != nil { @@ -579,6 +587,13 @@ func (h *Handle) SaveStatsToStorage(tableID int64, count int64, isIndex int, hg return } } + if isAnalyzed == 1 && len(lastAnalyzePos) > 0 { + sql = fmt.Sprintf("update mysql.stats_histograms set last_analyze_pos = X'%X' where table_id = %d and is_index = %d and hist_id = %d", lastAnalyzePos, tableID, isIndex, hg.ID) + _, err = exec.Execute(ctx, sql) + if err != nil { + return + } + } return } diff --git a/statistics/handle/update.go b/statistics/handle/update.go index 24ad2fe7e1c85..3d10bd036dc31 100644 --- a/statistics/handle/update.go +++ b/statistics/handle/update.go @@ -436,6 +436,7 @@ func (h *Handle) UpdateStatsByLocalFeedback(is infoschema.InfoSchema) { newIdx.CMSketch = statistics.UpdateCMSketch(idx.CMSketch, eqFB) newIdx.Histogram = *statistics.UpdateHistogram(&idx.Histogram, &statistics.QueryFeedback{Feedback: ranFB}) newIdx.Histogram.PreCalculateScalar() + newIdx.Flag = 0 newTblStats.Indices[fb.Hist.ID] = &newIdx } else { col, ok := tblStats.Columns[fb.Hist.ID] @@ -448,6 +449,7 @@ func (h *Handle) UpdateStatsByLocalFeedback(is infoschema.InfoSchema) { newFB := &statistics.QueryFeedback{Feedback: ranFB} newFB = newFB.DecodeIntValues() newCol.Histogram = *statistics.UpdateHistogram(&col.Histogram, newFB) + newCol.Flag = 0 newTblStats.Columns[fb.Hist.ID] = &newCol } h.UpdateTableStats([]*statistics.Table{newTblStats}, nil) diff --git a/statistics/histogram.go b/statistics/histogram.go index 31b70eabecd79..dc112894c916d 100644 --- a/statistics/histogram.go +++ b/statistics/histogram.go @@ -34,7 +34,6 @@ import ( "github.com/pingcap/tidb/util/logutil" "github.com/pingcap/tidb/util/ranger" "github.com/pingcap/tipb/go-tipb" - "github.com/spaolacci/murmur3" "go.uber.org/zap" ) @@ -566,17 +565,6 @@ func (hg *Histogram) outOfRange(val types.Datum) bool { chunk.Compare(hg.Bounds.GetRow(hg.Bounds.NumRows()-1), 0, &val) < 0 } -// ContainsFeedback checks if the histogram contains feedback updates. -// We can test it from the `repeat` field because only feedback will update it to 0. -func (hg *Histogram) ContainsFeedback() bool { - for _, bkt := range hg.Buckets { - if bkt.Repeat == 0 { - return true - } - } - return false -} - // Copy deep copies the histogram. func (hg *Histogram) Copy() *Histogram { newHist := *hg @@ -588,6 +576,23 @@ func (hg *Histogram) Copy() *Histogram { return &newHist } +// RemoveUpperBound removes the upper bound from histogram. +// It is used when merge stats for incremental analyze. +func (hg *Histogram) RemoveUpperBound() *Histogram { + hg.Buckets[hg.Len()-1].Count -= hg.Buckets[hg.Len()-1].Repeat + hg.Buckets[hg.Len()-1].Repeat = 0 + return hg +} + +// TruncateHistogram truncates the histogram to `numBkt` buckets. +func (hg *Histogram) TruncateHistogram(numBkt int) *Histogram { + hist := hg.Copy() + hist.Buckets = hist.Buckets[:numBkt] + hist.Bounds.TruncateTo(numBkt * 2) + hist.NDV = int64(float64(hg.NDV) * (hist.TotalRowCount() / hg.TotalRowCount())) + return hist +} + // ErrorRate is the error rate of estimate row count by bucket and cm sketch. type ErrorRate struct { ErrorTotal float64 @@ -629,6 +634,8 @@ type Column struct { Info *model.ColumnInfo IsHandle bool ErrorRate + Flag int64 + LastAnalyzePos types.Datum } func (c *Column) String() string { @@ -730,8 +737,10 @@ type Index struct { Histogram *CMSketch ErrorRate - StatsVer int64 // StatsVer is the version of the current stats, used to maintain compatibility - Info *model.IndexInfo + StatsVer int64 // StatsVer is the version of the current stats, used to maintain compatibility + Info *model.IndexInfo + Flag int64 + LastAnalyzePos types.Datum } func (idx *Index) String() string { @@ -990,28 +999,6 @@ func (idx *Index) outOfRange(val types.Datum) bool { return !withInLowBoundOrPrefixMatch || !withInHighBound } -// RemoveUpperBound removes the upper bound the index stats. -// It is used when merge stats for incremental analyze. -func (idx *Index) RemoveUpperBound(sc *stmtctx.StatementContext, values []types.Datum) (*Histogram, *CMSketch, error) { - hist, cms := idx.Histogram.Copy(), idx.CMSketch.Copy() - hist.Buckets[hist.Len()-1].Count -= hist.Buckets[hist.Len()-1].Repeat - hist.Buckets[hist.Len()-1].Repeat = 0 - if cms == nil { - return hist, nil, nil - } - var data []byte - var err error - for _, val := range values { - data, err = codec.EncodeKey(sc, data, val) - if err != nil { - return nil, nil, err - } - h1, h2 := murmur3.Sum128(data) - cms.setValue(h1, h2, 0) - } - return hist, cms, nil -} - // matchPrefix checks whether ad is the prefix of value func matchPrefix(row chunk.Row, colIdx int, ad *types.Datum) bool { switch ad.Kind() { From 5357e8d5397436d261e151b1822dcf062699e01e Mon Sep 17 00:00:00 2001 From: Haibin Xie Date: Sun, 5 May 2019 19:00:56 +0800 Subject: [PATCH 2/4] fix ci --- executor/builder.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/executor/builder.go b/executor/builder.go index 10b819123fe2f..c7ceb8787d7e5 100644 --- a/executor/builder.go +++ b/executor/builder.go @@ -1404,7 +1404,7 @@ func (b *executorBuilder) buildAnalyzeIndexIncremental(task plannercore.AnalyzeI } oldHist = idx.TruncateHistogram(bktID) } - oldHist = idx.Histogram.RemoveUpperBound() + oldHist = oldHist.RemoveUpperBound() analyzeTask.taskType = idxIncrementalTask analyzeTask.idxIncrementalExec = &analyzeIndexIncrementalExec{AnalyzeIndexExec: *analyzeTask.idxExec, oldHist: oldHist, oldCMS: idx.CMSketch} analyzeTask.job = &statistics.AnalyzeJob{DBName: task.DBName, TableName: task.TableName, PartitionName: task.PartitionName, JobInfo: "analyze incremental index " + task.IndexInfo.Name.O} From 517c0ea209131fa69360b4845f9b880c8ddf464f Mon Sep 17 00:00:00 2001 From: Haibin Xie Date: Tue, 7 May 2019 19:42:18 +0800 Subject: [PATCH 3/4] address comments --- statistics/cmsketch.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/statistics/cmsketch.go b/statistics/cmsketch.go index 55687064e03ae..747337c1d50ee 100644 --- a/statistics/cmsketch.go +++ b/statistics/cmsketch.go @@ -304,7 +304,7 @@ func (c *CMSketch) MergeCMSketch(rc *CMSketch) error { // (1): For values that only appears in `c, using `max` to merge them affects the `min` query result less than using `sum`; // (2): For values that only appears in `rc`, it is the same as condition (1); // (3): For values that appears both in `c` and `rc`, if they do not appear partially in `c` and `rc`, for example, -// if `v` appears 5 times in the table, it can appears 3 times in `c` and 5 times in `rc`, then `max` also gives the correct answer. +// if `v` appears 5 times in the table, it can appears 5 times in `c` and 3 times in `rc`, then `max` also gives the correct answer. // So in fact, if we can know the number of appearances of each value in the first place, it is better to use `max` to construct the CM sketch rather than `sum`. func (c *CMSketch) MergeCMSketch4IncrementalAnalyze(rc *CMSketch) error { if c.depth != rc.depth || c.width != rc.width { From e8dc21e244aa70fe5bfd1916e98f453ce2b000e4 Mon Sep 17 00:00:00 2001 From: Haibin Xie Date: Wed, 8 May 2019 11:58:13 +0800 Subject: [PATCH 4/4] address comments --- executor/analyze_test.go | 2 +- statistics/histogram.go | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/executor/analyze_test.go b/executor/analyze_test.go index e4d1225a098b4..d2e579286dd3d 100644 --- a/executor/analyze_test.go +++ b/executor/analyze_test.go @@ -325,7 +325,7 @@ func (s *testSuite1) TestAnalyzeIncremental(c *C) { c.Assert(h.DumpStatsDeltaToKV(handle.DumpAll), IsNil) c.Assert(h.DumpStatsFeedbackToKV(), IsNil) c.Assert(h.HandleUpdateStats(is), IsNil) - h.Update(is) + c.Assert(h.Update(is), IsNil) tk.MustQuery("show stats_buckets").Check(testkit.Rows("test t a 0 0 1 1 1 1", "test t a 0 1 3 0 2 2147483647", "test t idx 1 0 1 1 1 1", "test t idx 1 1 2 1 2 2")) tblStats := h.GetTableStats(tblInfo) val, err := codec.EncodeKey(tk.Se.GetSessionVars().StmtCtx, nil, types.NewIntDatum(3)) diff --git a/statistics/histogram.go b/statistics/histogram.go index ca5846f2f88af..951070a45fe20 100644 --- a/statistics/histogram.go +++ b/statistics/histogram.go @@ -196,7 +196,7 @@ const ( Version1 = 1 ) -// AnalyzeFlag is one for column flag. We can use IsAnalyzed to check whether this column is analyzed or not. +// AnalyzeFlag is set when the statistics comes from analyze and has not been modified by feedback. const AnalyzeFlag = 1 // IsAnalyzed checks whether this flag contains AnalyzeFlag. @@ -204,7 +204,7 @@ func IsAnalyzed(flag int64) bool { return (flag & AnalyzeFlag) > 0 } -// ResetAnalyzeFlag resets the AnalyzeFlag. +// ResetAnalyzeFlag resets the AnalyzeFlag because it has been modified by feedback. func ResetAnalyzeFlag(flag int64) int64 { return flag &^ AnalyzeFlag }