Merge branch 'master' into exchange-partition-GA

pingcap · Jul 20, 2022 · 12fce72 · 12fce72
2 parents 2271f67 + f0717df
commit 12fce72
Show file tree

Hide file tree

Showing 9 changed files with 1,329 additions and 73 deletions.
diff --git a/expression/util.go b/expression/util.go
@@ -688,6 +688,21 @@ func pushNotAcrossExpr(ctx sessionctx.Context, expr Expression, not bool) (_ Exp
 	return expr, not
 }
 
+// GetExprInsideIsTruth returns the expression wrapped inside `istrue_with_null` / `istrue`, or expr itself if it is not one.
+// This is useful when handling expressions from "not" or "!", because we might wrap `istrue_with_null` or `istrue`
+// when handling them. See pushNotAcrossExpr() and wrapWithIsTrue() for details.
+func GetExprInsideIsTruth(expr Expression) Expression {
+	if f, ok := expr.(*ScalarFunction); ok {
+		switch f.FuncName.L {
+		case ast.IsTruthWithNull, ast.IsTruthWithoutNull:
+			return GetExprInsideIsTruth(f.GetArgs()[0]) // recurse so that nested wrappers are all stripped
+		default:
+			return expr
+		}
+	}
+	return expr
+}
+
 // PushDownNot pushes the `not` function down to the expression's arguments.
 func PushDownNot(ctx sessionctx.Context, expr Expression) Expression {
 	newExpr, _ := pushNotAcrossExpr(ctx, expr, false)

diff --git a/sessionctx/variable/session.go b/sessionctx/variable/session.go
@@ -1182,6 +1182,11 @@ type SessionVars struct {
 	// When it is false, ANALYZE reads the latest data.
 	// When it is true, ANALYZE reads data on the snapshot at the beginning of ANALYZE.
 	EnableAnalyzeSnapshot bool
+
+	// DefaultStrMatchSelectivity adjusts the estimation strategy for string matching expressions that can't be estimated by building into range.
+	// When > 0: it's the default selectivity for the expression.
+	// When = 0: try to use TopN to evaluate the like/regexp expression to estimate the selectivity.
+	DefaultStrMatchSelectivity float64
 }
 
 // InitStatementContext initializes a StatementContext, the object is reused to reduce allocation.
@@ -2724,3 +2729,30 @@ func (s *SessionVars) GetSeekFactor(tbl *model.TableInfo) float64 {
 	}
 	return s.seekFactor
 }
+
+// EnableEvalTopNEstimationForStrMatch reports whether we need to evaluate expressions with TopN to improve estimation.
+// It is true only when DefaultStrMatchSelectivity is 0. Currently, it's only for string matching functions (like and regexp).
+func (s *SessionVars) EnableEvalTopNEstimationForStrMatch() bool {
+	return s.DefaultStrMatchSelectivity == 0
+}
+
+// GetStrMatchDefaultSelectivity returns the default selectivity for like and regexp.
+// Note: 0 is a special value, which means the default selectivity is 0.1; any other value is returned unchanged.
+func (s *SessionVars) GetStrMatchDefaultSelectivity() float64 {
+	if s.DefaultStrMatchSelectivity == 0 {
+		return 0.1
+	}
+	return s.DefaultStrMatchSelectivity
+}
+
+// GetNegateStrMatchDefaultSelectivity returns the default selectivity for not like and not regexp.
+// Note: for most values the result is 1 - GetStrMatchDefaultSelectivity(), with these special cases:
+//   - 0 is a special value, which means the default selectivity is 0.9 and TopN assisted estimation is enabled.
+//   - DefTiDBDefaultStrMatchSelectivity (the default value) is also special. For backward compatibility, when the
+//     variable is set to it, the default selectivity of like/regexp and not like/regexp are both that same value.
+func (s *SessionVars) GetNegateStrMatchDefaultSelectivity() float64 {
+	if s.DefaultStrMatchSelectivity == DefTiDBDefaultStrMatchSelectivity {
+		return DefTiDBDefaultStrMatchSelectivity
+	}
+	return 1 - s.GetStrMatchDefaultSelectivity()
+}
diff --git a/sessionctx/variable/sysvar.go b/sessionctx/variable/sysvar.go
@@ -1714,6 +1714,11 @@ var defaultSysVars = []*SysVar{
 		GenerateBinaryPlan.Store(TiDBOptOn(val))
 		return nil
 	}},
+	{Scope: ScopeGlobal | ScopeSession, Name: TiDBDefaultStrMatchSelectivity, Value: strconv.FormatFloat(DefTiDBDefaultStrMatchSelectivity, 'f', -1, 64), Type: TypeFloat, MinValue: 0, MaxValue: 1,
+		SetSession: func(s *SessionVars, val string) error {
+			s.DefaultStrMatchSelectivity = tidbOptFloat64(val, DefTiDBDefaultStrMatchSelectivity)
+			return nil
+		}},
 }
 
 // FeedbackProbability points to the FeedbackProbability in statistics package.

diff --git a/sessionctx/variable/tidb_vars.go b/sessionctx/variable/tidb_vars.go
@@ -710,6 +710,13 @@ const (
 	// When set to false, ANALYZE reads the latest data.
 	// When set to true, ANALYZE reads data on the snapshot at the beginning of ANALYZE.
 	TiDBEnableAnalyzeSnapshot = "tidb_enable_analyze_snapshot"
+
+	// TiDBDefaultStrMatchSelectivity controls the cardinality estimation strategy for string matching functions (like and regexp).
+	// When set to 0, Selectivity() will try to evaluate those functions with TopN and NULL in the stats to estimate,
+	// and the default selectivity and the selectivity for the histogram part will be 0.1.
+	// When set to (0, 1], Selectivity() will use the value of this variable as the default selectivity of those
+	// functions instead of the selectionFactor (0.8).
+	TiDBDefaultStrMatchSelectivity = "tidb_default_string_match_selectivity"
 )
 
 // TiDB vars that have only global scope
@@ -988,6 +995,7 @@ const (
 	DefTiDBEnableAnalyzeSnapshot                   = false
 	DefTiDBGenerateBinaryPlan                      = true
 	DefEnableTiDBGCAwareMemoryTrack                = true
+	DefTiDBDefaultStrMatchSelectivity              = 0.8
 )
 
 // Process global variables.

diff --git a/statistics/selectivity.go b/statistics/selectivity.go
@@ -317,109 +317,175 @@ func (coll *HistColl) Selectivity(ctx sessionctx.Context, exprs []expression.Exp
 		}
 	}
 
-	// Try to cover Constants
+	notCoveredConstants := make(map[int]*expression.Constant)
+	notCoveredDNF := make(map[int]*expression.ScalarFunction)
+	notCoveredStrMatch := make(map[int]*expression.ScalarFunction)
+	notCoveredNegateStrMatch := make(map[int]*expression.ScalarFunction)
+	notCoveredOtherExpr := make(map[int]expression.Expression)
 	if mask > 0 {
 		for i, expr := range remainedExprs {
 			if mask&(1<<uint64(i)) == 0 {
 				continue
 			}
-			if c, ok := expr.(*expression.Constant); ok {
-				if expression.MaybeOverOptimized4PlanCache(ctx, []expression.Expression{c}) {
+			switch x := expr.(type) {
+			case *expression.Constant:
+				notCoveredConstants[i] = x
+				continue
+			case *expression.ScalarFunction:
+				switch x.FuncName.L {
+				case ast.LogicOr:
+					notCoveredDNF[i] = x
 					continue
-				}
-				if c.Value.IsNull() {
-					// c is null
-					ret *= 0
-					mask &^= 1 << uint64(i)
-				} else if isTrue, err := c.Value.ToBool(sc); err == nil {
-					if isTrue == 0 {
-						// c is false
-						ret *= 0
+				case ast.Like, ast.Regexp:
+					notCoveredStrMatch[i] = x
+					continue
+				case ast.UnaryNot:
+					inner := expression.GetExprInsideIsTruth(x.GetArgs()[0])
+					innerSF, ok := inner.(*expression.ScalarFunction)
+					if ok {
+						switch innerSF.FuncName.L {
+						case ast.Like, ast.Regexp:
+							notCoveredNegateStrMatch[i] = x
+							continue
+						}
 					}
-					// c is true, no need to change ret
-					mask &^= 1 << uint64(i)
 				}
-				// Not expected to come here:
-				// err != nil, no need to do anything.
 			}
+			notCoveredOtherExpr[i] = expr
 		}
 	}
 
-	// Now we try to cover those still not covered DNF conditions using independence assumption,
+	// Try to cover remaining Constants
+	for i, c := range notCoveredConstants {
+		if expression.MaybeOverOptimized4PlanCache(ctx, []expression.Expression{c}) {
+			continue
+		}
+		if c.Value.IsNull() {
+			// c is null
+			ret *= 0
+			mask &^= 1 << uint64(i)
+			delete(notCoveredConstants, i)
+		} else if isTrue, err := c.Value.ToBool(sc); err == nil {
+			if isTrue == 0 {
+				// c is false
+				ret *= 0
+			}
+			// c is true, no need to change ret
+			mask &^= 1 << uint64(i)
+			delete(notCoveredConstants, i)
+		}
+		// Not expected to come here:
+		// err != nil, no need to do anything.
+	}
+
+	// Try to cover remaining DNF conditions using independence assumption,
 	// i.e., sel(condA or condB) = sel(condA) + sel(condB) - sel(condA) * sel(condB)
-	if mask > 0 {
-	OUTER:
-		for i, expr := range remainedExprs {
-			if mask&(1<<uint64(i)) == 0 {
-				continue
+OUTER:
+	for i, scalarCond := range notCoveredDNF {
+		// If there are columns not in stats, we won't handle them. This case might happen after DDL operations.
+		cols := expression.ExtractColumns(scalarCond)
+		for i := range cols {
+			if _, ok := coll.Columns[cols[i].UniqueID]; !ok {
+				continue OUTER
 			}
-			scalarCond, ok := expr.(*expression.ScalarFunction)
-			// Make sure we only handle DNF condition.
-			if !ok || scalarCond.FuncName.L != ast.LogicOr {
+		}
+
+		dnfItems := expression.FlattenDNFConditions(scalarCond)
+		dnfItems = ranger.MergeDNFItems4Col(ctx, dnfItems)
+		// If the conditions only contain a single column, we won't handle them.
+		if len(dnfItems) <= 1 {
+			continue
+		}
+
+		selectivity := 0.0
+		for _, cond := range dnfItems {
+			// In selectivity calculation, we don't handle CorrelatedColumn, so we directly skip over it.
+			// Other kinds of `Expression`, i.e., Constant, Column and ScalarFunction all can possibly be built into
+			// ranges and used to calculate selectivity, so we accept them all.
+			_, ok := cond.(*expression.CorrelatedColumn)
+			if ok {
 				continue
 			}
-			// If there're columns not in stats, we won't handle them. This case might happen after DDL operations.
-			cols := expression.ExtractColumns(scalarCond)
-			for i := range cols {
-				if _, ok := coll.Columns[cols[i].UniqueID]; !ok {
-					continue OUTER
-				}
-			}
 
-			dnfItems := expression.FlattenDNFConditions(scalarCond)
-			dnfItems = ranger.MergeDNFItems4Col(ctx, dnfItems)
-			// If the conditions only contain a single column, we won't handle them.
-			if len(dnfItems) <= 1 {
-				continue
+			var cnfItems []expression.Expression
+			if scalar, ok := cond.(*expression.ScalarFunction); ok && scalar.FuncName.L == ast.LogicAnd {
+				cnfItems = expression.FlattenCNFConditions(scalar)
+			} else {
+				cnfItems = append(cnfItems, cond)
 			}
 
-			selectivity := 0.0
-			for _, cond := range dnfItems {
-				// In selectivity calculation, we don't handle CorrelatedColumn, so we directly skip over it.
-				// Other kinds of `Expression`, i.e., Constant, Column and ScalarFunction all can possibly be built into
-				// ranges and used to calculation selectivity, so we accept them all.
-				_, ok := cond.(*expression.CorrelatedColumn)
-				if ok {
-					continue
-				}
+			curSelectivity, _, err := coll.Selectivity(ctx, cnfItems, nil)
+			if err != nil {
+				logutil.BgLogger().Debug("something wrong happened, use the default selectivity", zap.Error(err))
+				curSelectivity = selectionFactor
+			}
 
-				var cnfItems []expression.Expression
-				if scalar, ok := cond.(*expression.ScalarFunction); ok && scalar.FuncName.L == ast.LogicAnd {
-					cnfItems = expression.FlattenCNFConditions(scalar)
-				} else {
-					cnfItems = append(cnfItems, cond)
-				}
+			selectivity = selectivity + curSelectivity - selectivity*curSelectivity
+			if sc.EnableOptimizerCETrace {
+				// Tracing for the expression estimation results of this DNF.
+				CETraceExpr(ctx, tableID, "Table Stats-Expression-DNF", scalarCond, selectivity*float64(coll.Count))
+			}
+		}
 
-				curSelectivity, _, err := coll.Selectivity(ctx, cnfItems, nil)
-				if err != nil {
-					logutil.BgLogger().Debug("something wrong happened, use the default selectivity", zap.Error(err))
-					curSelectivity = selectionFactor
-				}
+		if selectivity != 0 {
+			ret *= selectivity
+			mask &^= 1 << uint64(i)
+			delete(notCoveredDNF, i)
+		}
+		if sc.EnableOptimizerCETrace {
+			// Tracing for the expression estimation results after applying the DNF estimation result.
+			curExpr = append(curExpr, remainedExprs[i])
+			expr := expression.ComposeCNFCondition(ctx, curExpr...)
+			CETraceExpr(ctx, tableID, "Table Stats-Expression-CNF", expr, ret*float64(coll.Count))
+		}
+	}
 
-				selectivity = selectivity + curSelectivity - selectivity*curSelectivity
-				if sc.EnableOptimizerCETrace {
-					// Tracing for the expression estimation results of this DNF.
-					CETraceExpr(ctx, tableID, "Table Stats-Expression-DNF", scalarCond, selectivity*float64(coll.Count))
-				}
+	// Try to cover remaining string matching functions by evaluating the expressions with TopN to estimate.
+	if ctx.GetSessionVars().EnableEvalTopNEstimationForStrMatch() {
+		for i, scalarCond := range notCoveredStrMatch {
+			ok, sel, err := coll.GetSelectivityByFilter(ctx, ctx.GetSessionVars().GetStrMatchDefaultSelectivity(), []expression.Expression{scalarCond})
+			if err != nil {
+				sc.AppendWarning(errors.New("Error when using TopN-assisted estimation: " + err.Error()))
 			}
-
-			if selectivity != 0 {
-				ret *= selectivity
-				mask &^= 1 << uint64(i)
+			if !ok {
+				continue
 			}
-			if sc.EnableOptimizerCETrace {
-				// Tracing for the expression estimation results after applying the DNF estimation result.
-				curExpr = append(curExpr, remainedExprs[i])
-				expr := expression.ComposeCNFCondition(ctx, curExpr...)
-				CETraceExpr(ctx, tableID, "Table Stats-Expression-CNF", expr, ret*float64(coll.Count))
+			ret *= sel
+			mask &^= 1 << uint64(i)
+			delete(notCoveredStrMatch, i)
+		}
+		for i, scalarCond := range notCoveredNegateStrMatch {
+			ok, sel, err := coll.GetSelectivityByFilter(ctx, ctx.GetSessionVars().GetNegateStrMatchDefaultSelectivity(), []expression.Expression{scalarCond})
+			if err != nil {
+				sc.AppendWarning(errors.New("Error when using TopN-assisted estimation: " + err.Error()))
 			}
+			if !ok {
+				continue
+			}
+			ret *= sel
+			mask &^= 1 << uint64(i)
+			delete(notCoveredNegateStrMatch, i)
 		}
 	}
 
-	// If there's still conditions which cannot be calculated, we will multiply a selectionFactor.
+	// Finally, if there are still conditions which cannot be estimated, we multiply the selectivity by
+	// the minimum default selectivity of the remaining conditions.
+	// Currently, only string matching functions (like and regexp) may have a different default selectivity,
+	// other expressions' default selectivity is selectionFactor.
 	if mask > 0 {
-		ret *= selectionFactor
+		minSelectivity := 1.0
+		if len(notCoveredConstants) > 0 || len(notCoveredDNF) > 0 || len(notCoveredOtherExpr) > 0 {
+			minSelectivity = math.Min(minSelectivity, selectionFactor)
+		}
+		if len(notCoveredStrMatch) > 0 {
+			minSelectivity = math.Min(minSelectivity, ctx.GetSessionVars().GetStrMatchDefaultSelectivity())
+		}
+		if len(notCoveredNegateStrMatch) > 0 {
+			minSelectivity = math.Min(minSelectivity, ctx.GetSessionVars().GetNegateStrMatchDefaultSelectivity())
+		}
+		ret *= minSelectivity
 	}
+
 	if sc.EnableOptimizerCETrace {
 		// Tracing for the expression estimation results after applying the default selectivity.
 		totalExpr := expression.ComposeCNFCondition(ctx, remainedExprs...)