planner: reduce topn count to exclude non-skewed values (pingcap#53035)
terry1purcell authored Jun 19, 2024
1 parent e41a47c commit 8ee4897
Showing 9 changed files with 69 additions and 26 deletions.
2 changes: 1 addition & 1 deletion pkg/executor/show_stats_test.go
@@ -407,7 +407,7 @@ func TestShowAnalyzeStatus(t *testing.T) {
require.Equal(t, "test", rows[0][0])
require.Equal(t, "t", rows[0][1])
require.Equal(t, "", rows[0][2])
require.Equal(t, "analyze table all columns with 256 buckets, 500 topn, 1 samplerate", rows[0][3])
require.Equal(t, "analyze table all columns with 256 buckets, 100 topn, 1 samplerate", rows[0][3])
require.Equal(t, "2", rows[0][4])
checkTime := func(val any) {
str, ok := val.(string)
8 changes: 4 additions & 4 deletions pkg/executor/test/analyzetest/analyze_test.go
@@ -1916,7 +1916,7 @@ func testKillAutoAnalyze(t *testing.T, ver int) {
if ver == 1 {
jobInfo += "columns"
} else {
jobInfo += "table all columns with 256 buckets, 500 topn, 1 samplerate"
jobInfo += "table all columns with 256 buckets, 100 topn, 1 samplerate"
}
// kill auto analyze when it is pending/running/finished
for _, status := range []string{
@@ -2041,7 +2041,7 @@ func TestAnalyzeJob(t *testing.T) {
DBName: "test",
TableName: "t",
PartitionName: "",
- JobInfo: "table all columns with 256 buckets, 500 topn, 1 samplerate",
+ JobInfo: "table all columns with 256 buckets, 100 topn, 1 samplerate",
}
executor.AddNewAnalyzeJob(se, job)
require.NotNil(t, job.ID)
@@ -2133,7 +2133,7 @@ func TestInsertAnalyzeJobWithLongInstance(t *testing.T) {
DBName: "test",
TableName: "t",
PartitionName: "",
- JobInfo: "table all columns with 256 buckets, 500 topn, 1 samplerate",
+ JobInfo: "table all columns with 256 buckets, 100 topn, 1 samplerate",
}
h := dom.StatsHandle()
instance := "xxxtidb-tidb-0.xxxtidb-tidb-peer.xxxx-xx-1234-xxx-123456-1-321.xyz:4000"
@@ -2785,7 +2785,7 @@ func TestAnalyzeColumnsSkipMVIndexJsonCol(t *testing.T) {
tk.MustQuery("select job_info from mysql.analyze_jobs where table_schema = 'test' and table_name = 't'").Sort().Check(
testkit.Rows(
"analyze index idx_c",
"analyze table columns a, b with 256 buckets, 500 topn, 1 samplerate",
"analyze table columns a, b with 256 buckets, 100 topn, 1 samplerate",
))

is := dom.InfoSchema()
6 changes: 3 additions & 3 deletions pkg/planner/cardinality/testdata/cardinality_suite_out.json
@@ -24,7 +24,7 @@
{
"Start": 800,
"End": 900,
"Count": 771.504166655054
"Count": 755.754166655054
},
{
"Start": 900,
@@ -79,7 +79,7 @@
{
"Start": 800,
"End": 1000,
"Count": 1229.696869573942
"Count": 1213.946869573942
},
{
"Start": 900,
@@ -104,7 +104,7 @@
{
"Start": 200,
"End": 400,
"Count": 1226.2788209899081
"Count": 1215.0288209899081
},
{
"Start": 200,
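The expected counts above shift down only slightly: with fewer values tracked exactly in the TopN, more of the table is estimated through the histogram, which spreads the non-TopN rows evenly across the remaining distinct values. A back-of-the-envelope sketch of that even-spread assumption, with made-up numbers (this is not TiDB's estimator code):

    package main

    import "fmt"

    func main() {
    	// Hypothetical figures, chosen only to illustrate the assumption that
    	// pruneTopNItem's doc comment calls 1/remained_ndv.
    	totalRows := 10000.0 // rows in the table
    	topNRows := 3000.0   // rows covered exactly by the TopN
    	ndv := 100.0         // distinct values in the column
    	topNItems := 10.0    // distinct values kept in the TopN

    	// A value outside the TopN is assumed to take an even share of the
    	// rows the TopN does not cover.
    	perValue := (totalRows - topNRows) / (ndv - topNItems)
    	fmt.Printf("estimated rows for a non-TopN value: %.1f\n", perValue) // 77.8
    }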
4 changes: 3 additions & 1 deletion pkg/planner/core/planbuilder.go
@@ -2696,9 +2696,11 @@ var analyzeOptionDefault = map[ast.AnalyzeOptionType]uint64{
ast.AnalyzeOptSampleRate: math.Float64bits(0),
}

+ // TopN reduced from 500 to 100 due to concerns over the large number of TopN values collected for customers with many tables.
+ // 100 is more in line with other databases; 100-256 is also a common NumBuckets range in other databases.
var analyzeOptionDefaultV2 = map[ast.AnalyzeOptionType]uint64{
ast.AnalyzeOptNumBuckets: 256,
- ast.AnalyzeOptNumTopN: 500,
+ ast.AnalyzeOptNumTopN: 100,
ast.AnalyzeOptCMSketchWidth: 2048,
ast.AnalyzeOptCMSketchDepth: 5,
ast.AnalyzeOptNumSamples: 0,
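Only the default moves here; an explicit TOPN option is still passed through unchanged, and, as the builder.go hunk below shows, any value other than 100 also bypasses the new pruning. A minimal sketch of requesting the old behavior, modeled on this repo's analyze tests (the test name and table are illustrative, not part of the commit):

    package analyzetest

    import (
    	"testing"

    	"github.com/pingcap/tidb/pkg/testkit"
    )

    // Sketch only: an explicit TopN option keeps its exact value, so the
    // pre-change default remains reachable per statement.
    func TestExplicitTopNOverride(t *testing.T) {
    	store := testkit.CreateMockStore(t)
    	tk := testkit.NewTestKit(t, store)
    	tk.MustExec("use test")
    	tk.MustExec("create table t (a int)")
    	// Honored as-is; pruning is also skipped because numTopN != 100.
    	tk.MustExec("analyze table t with 500 topn")
    	// Default path: 100 topn, with non-skewed values pruned.
    	tk.MustExec("analyze table t")
    }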
32 changes: 22 additions & 10 deletions pkg/statistics/builder.go
@@ -296,6 +296,11 @@ func BuildHistAndTopN(
sampleNum := int64(len(samples))
// As we use samples to build the histogram, the bucket number and repeat should multiply a factor.
sampleFactor := float64(count) / float64(len(samples))
+ // If a numTopN value other than 100 is passed in, we assume it's a value that the user wants us to honor.
+ allowPruning := true
+ if numTopN != 100 {
+ allowPruning = false
+ }

// Step1: collect topn from samples

@@ -326,18 +331,23 @@ func BuildHistAndTopN(
continue
}
// case 2, meet a different value: counting for the "current" is complete
- // case 2-1, now topn is empty: append the "current" count directly
+ // case 2-1, do not add a count of 1 if we're sampling
+ if curCnt == 1 && sampleFactor > 1 && allowPruning {
+ cur, curCnt = sampleBytes, 1
+ continue
+ }
+ // case 2-2, now topn is empty: append the "current" count directly
if len(topNList) == 0 {
topNList = append(topNList, TopNMeta{Encoded: cur, Count: uint64(curCnt)})
cur, curCnt = sampleBytes, 1
continue
}
- // case 2-2, now topn is full, and the "current" count is less than the least count in the topn: no need to insert the "current"
+ // case 2-3, now topn is full, and the "current" count is less than the least count in the topn: no need to insert the "current"
if len(topNList) >= numTopN && uint64(curCnt) <= topNList[len(topNList)-1].Count {
cur, curCnt = sampleBytes, 1
continue
}
- // case 2-3, now topn is not full, or the "current" count is larger than the least count in the topn: need to find a slot to insert the "current"
+ // case 2-4, now topn is not full, or the "current" count is larger than the least count in the topn: need to find a slot to insert the "current"
j := len(topNList)
for ; j > 0; j-- {
if uint64(curCnt) < topNList[j-1].Count {
@@ -358,9 +368,10 @@ func BuildHistAndTopN(
hg.Correlation = calcCorrelation(sampleNum, corrXYSum)
}

- // Handle the counting for the last value. Basically equal to the case 2 above.
- // now topn is empty: append the "current" count directly
- if numTopN != 0 {
+ // Handle the counting for the last value. Basically equal to the case 2 above - including
+ // limiting addition of a value with a count of 1 (since it will be pruned anyway).
+ if numTopN != 0 && (!allowPruning || (allowPruning && (sampleFactor <= 1 || curCnt > 1))) {
+ // now topn is empty: append the "current" count directly
if len(topNList) == 0 {
topNList = append(topNList, TopNMeta{Encoded: cur, Count: uint64(curCnt)})
} else if len(topNList) < numTopN || uint64(curCnt) > topNList[len(topNList)-1].Count {
@@ -380,7 +391,9 @@
}
}

- topNList = pruneTopNItem(topNList, ndv, nullCount, sampleNum, count)
+ if allowPruning {
+ topNList = pruneTopNItem(topNList, ndv, nullCount, sampleNum, count)
+ }

// Step2: exclude topn from samples
if numTopN != 0 {
@@ -435,7 +448,7 @@ func BuildHistAndTopN(
topn.Scale(sampleFactor)

if uint64(count) <= topn.TotalCount() || int(hg.NDV) <= len(topn.TopN) {
- // TopN includes all sample data
+ // If we've collected everything - don't create any buckets
return hg, topn, nil
}

@@ -454,8 +467,7 @@
//
// We assume that the ones not in the top-n list's selectivity is 1/remained_ndv which is the internal implementation of EqualRowCount
func pruneTopNItem(topns []TopNMeta, ndv, nullCount, sampleRows, totalRows int64) []TopNMeta {
- // If the sampleRows holds all rows, or NDV of samples equals to actual NDV, we just return the TopN directly.
- if sampleRows == totalRows || totalRows <= 1 || int64(len(topns)) >= ndv || len(topns) == 0 {
+ if totalRows <= 1 || int64(len(topns)) >= ndv || len(topns) <= 1 {
return topns
}
// Sum the occurrence except the least common one from the top-n list. To check whether the least common one is worth
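Condensed, the collection step now works as sketched below: walk the sorted samples, count each run of equal values, and, when pruning is allowed and the data was sampled (sampleFactor > 1), never admit a value seen exactly once. This is a simplified standalone rendering of the loop in BuildHistAndTopN, not the production code; histogram building, value decoding, and memory tracking are omitted, and it assumes non-empty encoded values:

    package main

    import (
    	"bytes"
    	"fmt"
    	"sort"
    )

    // TopNMeta mirrors the shape used in pkg/statistics.
    type TopNMeta struct {
    	Encoded []byte
    	Count   uint64
    }

    func collectTopN(sortedSamples [][]byte, numTopN int, sampleFactor float64) []TopNMeta {
    	// Only the default TopN size enables pruning; anything else is treated
    	// as an explicit user request and honored as-is.
    	allowPruning := numTopN == 100

    	var items []TopNMeta
    	var cur []byte
    	var curCnt uint64
    	flush := func() {
    		if cur == nil {
    			return
    		}
    		// The new rule: under sampling, a value seen once is almost
    		// certainly not skewed, so it never enters the candidate list.
    		if allowPruning && sampleFactor > 1 && curCnt == 1 {
    			return
    		}
    		items = append(items, TopNMeta{Encoded: cur, Count: curCnt})
    	}
    	for _, s := range sortedSamples {
    		if bytes.Equal(s, cur) {
    			curCnt++
    			continue
    		}
    		flush()
    		cur, curCnt = s, 1
    	}
    	flush() // the last run gets the same treatment as the loop body

    	// Keep the numTopN most frequent values.
    	sort.Slice(items, func(i, j int) bool { return items[i].Count > items[j].Count })
    	if len(items) > numTopN {
    		items = items[:numTopN]
    	}
    	return items
    }

    func main() {
    	samples := [][]byte{[]byte("a"), []byte("a"), []byte("a"), []byte("b"), []byte("b"), []byte("c")}
    	// With sampleFactor > 1, "c" (seen once) is excluded up front.
    	fmt.Println(collectTopN(samples, 100, 2.0))
    }

pruneTopNItem then makes a second pass over whatever survives, and since the early return on sampleRows == totalRows is gone, that pass now runs for fully-sampled tables too.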
2 changes: 1 addition & 1 deletion pkg/statistics/handle/autoanalyze/autoanalyze_test.go
@@ -315,7 +315,7 @@ func TestAutoAnalyzeSkipColumnTypes(t *testing.T) {
exec.AutoAnalyzeMinCnt = originalVal
}()
require.True(t, h.HandleAutoAnalyze())
tk.MustQuery("select job_info from mysql.analyze_jobs where job_info like '%auto analyze table%'").Check(testkit.Rows("auto analyze table columns a, b, d with 256 buckets, 500 topn, 1 samplerate"))
tk.MustQuery("select job_info from mysql.analyze_jobs where job_info like '%auto analyze table%'").Check(testkit.Rows("auto analyze table columns a, b, d with 256 buckets, 100 topn, 1 samplerate"))
}

func TestAutoAnalyzeOnEmptyTable(t *testing.T) {
@@ -351,6 +351,7 @@ func testIssues24349(testKit *testkit.TestKit) {
testKit.MustExec("create table t (a int, b int) partition by hash(a) partitions 3")
testKit.MustExec("insert into t values (0, 3), (0, 3), (0, 3), (0, 2), (1, 1), (1, 2), (1, 2), (1, 2), (1, 3), (1, 4), (2, 1), (2, 1)")
testKit.MustExec("analyze table t with 1 topn, 3 buckets")
testKit.MustExec("explain select * from t where a > 0 and b > 0")
testKit.MustQuery("show stats_buckets where partition_name='global'").Check(testkit.Rows(
"test t global a 0 0 2 2 0 2 0",
"test t global b 0 0 3 1 1 2 0",
36 changes: 32 additions & 4 deletions pkg/statistics/statistics_test.go
@@ -501,7 +501,7 @@ func SubTestBuild() func(*testing.T) {
return func(t *testing.T) {
s := createTestStatisticsSamples(t)
bucketCount := int64(256)
- topNCount := 20
+ topNCount := 100
ctx := mock.NewContext()
sc := ctx.GetSessionVars().StmtCtx
sketch, _, err := buildFMSketch(sc, s.rc.(*recordSet).data, 1000)
@@ -650,7 +650,7 @@ func TestPruneTopN(t *testing.T) {
var totalNDV, nullCnt, sampleRows, totalRows int64

// case 1
- topnIn = []TopNMeta{{[]byte{1}, 100_000}, {[]byte{2}, 10}}
+ topnIn = []TopNMeta{{[]byte{1}, 100_000}}
totalNDV = 2
nullCnt = 0
sampleRows = 100_010
@@ -674,13 +674,41 @@

// case 3
topnIn = nil
for i := 0; i < 100; i++ {
topnIn = append(topnIn, TopNMeta{[]byte{byte(i)}, 1_000})
for i := 0; i < 10; i++ {
topnIn = append(topnIn, TopNMeta{[]byte{byte(i)}, 10_000})
}
totalNDV = 100
nullCnt = 0
sampleRows = 100_000
totalRows = 10_000_000
topnOut = pruneTopNItem(topnIn, totalNDV, nullCnt, sampleRows, totalRows)
require.Equal(t, topnIn, topnOut)

+ // case 4 - test TopN pruning for small table
+ topnIn = []TopNMeta{
+ {[]byte{1}, 3_000},
+ {[]byte{2}, 3_000},
+ }
+ totalNDV = 4002
+ nullCnt = 0
+ sampleRows = 10_000
+ totalRows = 10_000
+ topnOut = pruneTopNItem(topnIn, totalNDV, nullCnt, sampleRows, totalRows)
+ require.Equal(t, topnIn, topnOut)
+
+ // case 5 - test pruning of value=1
+ topnIn = nil
+ for i := 0; i < 10; i++ {
+ topnIn = append(topnIn, TopNMeta{[]byte{byte(i)}, 90})
+ }
+ topnPruned := topnIn
+ for i := 90; i < 150; i++ {
+ topnIn = append(topnIn, TopNMeta{[]byte{byte(i)}, 1})
+ }
+ totalNDV = 150
+ nullCnt = 0
+ sampleRows = 1500
+ totalRows = 1500
+ topnOut = pruneTopNItem(topnIn, totalNDV, nullCnt, sampleRows, totalRows)
+ require.Equal(t, topnPruned, topnOut)
}
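Case 5 is the one that pins down the new behavior: 1,500 rows, 150 distinct values, and a candidate TopN of ten values with 90 rows each plus sixty values seen exactly once; only the ten survive. Back-of-the-envelope arithmetic for why (illustrative, not the exact pruneTopNItem computation):

    package main

    import "fmt"

    func main() {
    	// Figures from case 5 of TestPruneTopN above.
    	totalRows := 1500.0
    	totalNDV := 150.0
    	keptRows := 10.0 * 90.0  // ten genuinely skewed values
    	singleRows := 60.0 * 1.0 // sixty values seen exactly once

    	// If all 70 values stayed in the TopN, a value outside it would be
    	// estimated at an even share of the leftover rows:
    	perValue := (totalRows - keptRows - singleRows) / (totalNDV - 70.0)
    	fmt.Printf("even-spread estimate per non-TopN value: %.2f rows\n", perValue) // 6.75
    	// A TopN entry with an exact count of 1 carries less information than
    	// that default estimate, so all sixty singletons are pruned.
    }

Case 4 shows the flip side for a fully-sampled small table: its two entries cover 3,000 rows each against an even-spread estimate of roughly one row per remaining value, so they are kept even though sampleRows == totalRows no longer short-circuits the pruning.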
4 changes: 2 additions & 2 deletions tests/integrationtest/r/executor/analyze.result
@@ -824,12 +824,12 @@ delete from mysql.analyze_jobs;
analyze table t;
select job_info from mysql.analyze_jobs where job_info like '%analyze table%';
job_info
- analyze table columns a, b, d with 256 buckets, 500 topn, 1 samplerate
+ analyze table columns a, b, d with 256 buckets, 100 topn, 1 samplerate
delete from mysql.analyze_jobs;
analyze table t columns a, e;
select job_info from mysql.analyze_jobs where job_info like '%analyze table%';
job_info
- analyze table columns a, d with 256 buckets, 500 topn, 1 samplerate
+ analyze table columns a, d with 256 buckets, 100 topn, 1 samplerate
set @@session.tidb_analyze_skip_column_types = default;
DROP TABLE IF EXISTS Issue34228;
CREATE TABLE Issue34228 (id bigint NOT NULL, dt datetime NOT NULL) PARTITION BY RANGE COLUMNS(dt) (PARTITION p202201 VALUES LESS THAN ("2022-02-01"), PARTITION p202202 VALUES LESS THAN ("2022-03-01"));
