Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

*: add cardinality estimation trace for Selectivity #29883

Merged
merged 10 commits into from
Nov 29, 2021
1 change: 1 addition & 0 deletions executor/executor.go
Original file line number Diff line number Diff line change
Expand Up @@ -1684,6 +1684,7 @@ func ResetContextOfStmt(ctx sessionctx.Context, s ast.StmtNode) (err error) {
sc.LockTableIDs = make(map[int64]struct{})
sc.EnableOptimizeTrace = false
sc.LogicalOptimizeTrace = nil
sc.OptimizerCETrace = nil

sc.InitMemTracker(memory.LabelForSQLText, vars.MemQuotaQuery)
sc.InitDiskTracker(memory.LabelForSQLText, -1)
Expand Down
4 changes: 4 additions & 0 deletions sessionctx/stmtctx/stmtctx.go
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,10 @@ type StatementContext struct {
EnableOptimizeTrace bool
// LogicalOptimizeTrace indicates the trace for optimize
LogicalOptimizeTrace *tracing.LogicalOptimizeTracer
// EnableOptimizerCETrace indicates whether the internal process of cardinality estimation needs to be traced.
// CE Trace is currently a submodule of the optimizer trace and is controlled by a separate option.
EnableOptimizerCETrace bool
OptimizerCETrace []*tracing.CETraceRecord
}

// StmtHints are SessionVars related sql hints.
Expand Down
7 changes: 6 additions & 1 deletion statistics/main_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ import (
"go.uber.org/goleak"
)

var testDataMap = make(testdata.BookKeeper, 2)
var testDataMap = make(testdata.BookKeeper, 3)

func TestMain(m *testing.M) {
testbridge.WorkaroundGoCheckFlags()
Expand All @@ -45,6 +45,7 @@ func TestMain(m *testing.M) {

testDataMap.LoadTestSuiteData("testdata", "integration_suite")
testDataMap.LoadTestSuiteData("testdata", "stats_suite")
testDataMap.LoadTestSuiteData("testdata", "trace_suite")

opts := []goleak.Option{
goleak.IgnoreTopFunction("go.etcd.io/etcd/pkg/logutil.(*MergeLogger).outputLoop"),
Expand All @@ -66,6 +67,10 @@ func GetStatsSuiteData() testdata.TestData {
return testDataMap["stats_suite"]
}

// GetTraceSuiteData returns the test data registered in testDataMap under the
// "trace_suite" key.
func GetTraceSuiteData() testdata.TestData {
	const suiteName = "trace_suite"
	return testDataMap[suiteName]
}

// TestStatistics batches tests sharing a test suite to reduce the setups
// overheads.
func TestStatistics(t *testing.T) {
Expand Down
123 changes: 120 additions & 3 deletions statistics/selectivity.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,19 +15,25 @@
package statistics

import (
"bytes"
"math"
"math/bits"
"sort"

"github.com/pingcap/errors"
"github.com/pingcap/tidb/expression"
"github.com/pingcap/tidb/parser/ast"
"github.com/pingcap/tidb/parser/format"
"github.com/pingcap/tidb/parser/mysql"
planutil "github.com/pingcap/tidb/planner/util"
"github.com/pingcap/tidb/sessionctx"
"github.com/pingcap/tidb/sessionctx/stmtctx"
"github.com/pingcap/tidb/types"
driver "github.com/pingcap/tidb/types/parser_driver"
"github.com/pingcap/tidb/util/chunk"
"github.com/pingcap/tidb/util/logutil"
"github.com/pingcap/tidb/util/ranger"
"github.com/pingcap/tidb/util/tracing"
"go.uber.org/zap"
)

Expand Down Expand Up @@ -179,14 +185,20 @@ func (coll *HistColl) Selectivity(ctx sessionctx.Context, exprs []expression.Exp
if coll.Count == 0 || len(exprs) == 0 {
return 1, nil, nil
}
ret := 1.0
sc := ctx.GetSessionVars().StmtCtx
tableID := coll.PhysicalID
// TODO: If len(exprs) is bigger than 63, we could use bitset structure to replace the int64.
// This will simplify some code and speed up if we use this rather than a boolean slice.
if len(exprs) > 63 || (len(coll.Columns) == 0 && len(coll.Indices) == 0) {
return pseudoSelectivity(coll, exprs), nil, nil
ret = pseudoSelectivity(coll, exprs)
if sc.EnableOptimizerCETrace {
CETraceExpr(sc, tableID, "Table Stats-Pseudo-Expression", expression.ComposeCNFCondition(ctx, exprs...), ret*float64(coll.Count))
}
return ret, nil, nil
}
ret := 1.0

var nodes []*StatsNode
sc := ctx.GetSessionVars().StmtCtx

remainedExprs := make([]expression.Expression, 0, len(exprs))

Expand Down Expand Up @@ -281,6 +293,9 @@ func (coll *HistColl) Selectivity(ctx sessionctx.Context, exprs []expression.Exp
usedSets := GetUsableSetsByGreedy(nodes)
// Initialize the mask with the full set.
mask := (int64(1) << uint(len(remainedExprs))) - 1
// curExpr records covered expressions by now. It's for cardinality estimation tracing.
var curExpr []expression.Expression

for _, set := range usedSets {
mask &^= set.mask
ret *= set.Selectivity
Expand All @@ -291,6 +306,16 @@ func (coll *HistColl) Selectivity(ctx sessionctx.Context, exprs []expression.Exp
if set.partCover {
ret *= selectionFactor
}
if sc.EnableOptimizerCETrace {
// Tracing for the expression estimation results after applying this StatsNode.
for i := range remainedExprs {
if set.mask&(1<<uint64(i)) > 0 {
curExpr = append(curExpr, remainedExprs[i])
}
}
expr := expression.ComposeCNFCondition(ctx, curExpr...)
CETraceExpr(sc, tableID, "Table Stats-Expression-CNF", expr, ret*float64(coll.Count))
}
}

// Now we try to cover those still not covered DNF conditions using independence assumption,
Expand Down Expand Up @@ -345,19 +370,34 @@ func (coll *HistColl) Selectivity(ctx sessionctx.Context, exprs []expression.Exp
}

selectivity = selectivity + curSelectivity - selectivity*curSelectivity
if sc.EnableOptimizerCETrace {
// Tracing for the expression estimation results of this DNF.
CETraceExpr(sc, tableID, "Table Stats-Expression-DNF", scalarCond, selectivity*float64(coll.Count))
}
}

if selectivity != 0 {
ret *= selectivity
mask &^= 1 << uint64(i)
}
if sc.EnableOptimizerCETrace {
// Tracing for the expression estimation results after applying the DNF estimation result.
curExpr = append(curExpr, remainedExprs[i])
expr := expression.ComposeCNFCondition(ctx, curExpr...)
CETraceExpr(sc, tableID, "Table Stats-Expression-CNF", expr, ret*float64(coll.Count))
}
}
}

// If there are still conditions which cannot be calculated, we will multiply by selectionFactor.
if mask > 0 {
ret *= selectionFactor
}
if sc.EnableOptimizerCETrace {
// Tracing for the expression estimation results after applying the default selectivity.
totalExpr := expression.ComposeCNFCondition(ctx, remainedExprs...)
CETraceExpr(sc, tableID, "Table Stats-Expression-CNF", totalExpr, ret*float64(coll.Count))
}
return ret, nodes, nil
}

Expand Down Expand Up @@ -479,3 +519,80 @@ func FindPrefixOfIndexByCol(cols []*expression.Column, idxColIDs []int64, cached
}
return expression.FindPrefixOfIndex(cols, idxColIDs)
}

// CETraceExpr appends an expression and its related information to the CE trace
// stored in sc.OptimizerCETrace.
//
// The expression is rendered with ExprToString. If rendering fails, nothing is
// appended; a debug log carrying both the expression and the rendering error is
// emitted instead, so tracing never interrupts the caller.
//
// Parameters:
//   - sc: the statement context whose OptimizerCETrace slice receives the record.
//   - tableID: stored in the record's TableID field.
//   - tp: a label describing what kind of estimation produced this record.
//   - expr: the expression whose estimation result is being traced.
//   - rowCount: the estimated row count; it is converted to uint64 before being stored.
func CETraceExpr(sc *stmtctx.StatementContext, tableID int64, tp string, expr expression.Expression, rowCount float64) {
	exprStr, err := ExprToString(expr)
	if err != nil {
		// Include the error itself so the reason for the failure is visible in the log.
		logutil.BgLogger().Debug("[OptimizerTrace] Failed to trace CE of an expression",
			zap.Any("expression", expr),
			zap.Error(err))
		return
	}
	rec := &tracing.CETraceRecord{
		TableID:  tableID,
		Type:     tp,
		Expr:     exprStr,
		RowCount: uint64(rowCount),
	}
	sc.OptimizerCETrace = append(sc.OptimizerCETrace, rec)
}

// ExprToString renders an Expression as a string that can appear in a SQL statement.
//
// The rendering relies on a trick: TiDB accepts internal function names in SQL,
// so `eq`(a, 1) means the same as a = 1. A cleaner implementation would first
// convert the expression into an AST and then call astNode.Restore(), which is
// what the Constant case already does; the trick is used for convenience for now.
//
// This helper might fit better in the expression package, but it is currently
// only used for CE trace and may not be general enough to handle every possible
// expression, so it lives here for now.
//
// An error is returned for correlated columns, for expression types not handled
// below, and whenever evaluating or restoring a constant fails.
func ExprToString(e expression.Expression) (string, error) {
	switch x := e.(type) {
	case *expression.Column:
		return x.String(), nil
	case *expression.CorrelatedColumn:
		return "", errors.New("tracing for correlated columns not supported now")
	case *expression.Constant:
		// Evaluate the constant, then let the parser driver print it as a SQL literal.
		datum, err := x.Eval(chunk.Row{})
		if err != nil {
			return "", err
		}
		var out bytes.Buffer
		literal := driver.ValueExpr{Datum: datum}
		if err = literal.Restore(format.NewRestoreCtx(format.DefaultRestoreFlags, &out)); err != nil {
			return "", err
		}
		return out.String(), nil
	case *expression.ScalarFunction:
		var out bytes.Buffer
		out.WriteString("`" + x.FuncName.L + "`(")
		args := x.GetArgs()
		isCast := x.FuncName.L == ast.Cast
		for idx, arg := range args {
			rendered, err := ExprToString(arg)
			if err != nil {
				return "", err
			}
			out.WriteString(rendered)
			if isCast {
				// For cast, each argument is followed by the function's return type.
				out.WriteString(", ")
				out.WriteString(x.RetType.String())
			} else if idx < len(args)-1 {
				// Other functions get a plain comma-separated argument list.
				out.WriteString(", ")
			}
		}
		out.WriteString(")")
		return out.String(), nil
	}
	return "", errors.New("unexpected type of Expression")
}
11 changes: 11 additions & 0 deletions statistics/testdata/trace_suite_in.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
[
{
"name": "TestTraceCE",
"cases": [
"a > 0 and a < 2",
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We only add some basic test cases in this PR. More detailed tests will be added afterward.

"a >= 1 and a < 10",
"a < 3 or b < 4",
"a = 1 and b = 2"
]
}
]
Loading