plan: handle DNF expressions in Selectivity (#9282)

pingcap · Feb 13, 2019 · 9bdba3a · 9bdba3a
1 parent 2f9ab60
commit 9bdba3a
Show file tree

Hide file tree

Showing 9 changed files with 109 additions and 48 deletions.
diff --git a/cmd/explaintest/r/explain_easy_stats.result b/cmd/explaintest/r/explain_easy_stats.result
@@ -195,3 +195,10 @@ id	count	task	operator info
 Point_Get_1	1.00	root	table:index_prune, index:a b
 drop table if exists t1, t2, t3, index_prune;
 set @@session.tidb_opt_insubq_to_join_and_agg=1;
+drop table if exists tbl;
+create table tbl(column1 int, column2 int, index idx(column1, column2));
+load stats 's/explain_easy_stats_tbl_dnf.json';
+explain select * from tbl where (column1=0 and column2=1) or (column1=1 and column2=3) or (column1=2 and column2=5);
+id	count	task	operator info
+IndexReader_9	3.00	root	index:IndexScan_8
+└─IndexScan_8	3.00	cop	table:tbl, index:column1, column2, range:[0 1,0 1], [1 3,1 3], [2 5,2 5], keep order:false
diff --git a/cmd/explaintest/s/explain_easy_stats_tbl_dnf.json b/cmd/explaintest/s/explain_easy_stats_tbl_dnf.json
diff --git a/cmd/explaintest/t/explain_easy_stats.test b/cmd/explaintest/t/explain_easy_stats.test
@@ -58,3 +58,8 @@ explain select * from index_prune WHERE a = 1010010404050976781 AND b = 26467085
 
 drop table if exists t1, t2, t3, index_prune;
 set @@session.tidb_opt_insubq_to_join_and_agg=1;
+
+drop table if exists tbl;
+create table tbl(column1 int, column2 int, index idx(column1, column2));
+load stats 's/explain_easy_stats_tbl_dnf.json';
+explain select * from tbl where (column1=0 and column2=1) or (column1=1 and column2=3) or (column1=2 and column2=5);
diff --git a/planner/core/common_plans.go b/planner/core/common_plans.go
@@ -291,15 +291,14 @@ func (e *Execute) rebuildRange(p Plan) error {
 
 func (e *Execute) buildRangeForIndexScan(sctx sessionctx.Context, is *PhysicalIndexScan) ([]*ranger.Range, error) {
 	idxCols, colLengths := expression.IndexInfo2Cols(is.schema.Columns, is.Index)
-	ranges := ranger.FullRange()
-	if len(idxCols) > 0 {
-		var err error
-		ranges, _, _, _, err = ranger.DetachCondAndBuildRangeForIndex(sctx, is.AccessCondition, idxCols, colLengths)
-		if err != nil {
-			return nil, errors.Trace(err)
-		}
+	if len(idxCols) == 0 {
+		return ranger.FullRange(), nil
+	}
+	res, err := ranger.DetachCondAndBuildRangeForIndex(sctx, is.AccessCondition, idxCols, colLengths)
+	if err != nil {
+		return nil, err
 	}
-	return ranges, nil
+	return res.Ranges, nil
 }
 
 // Deallocate represents deallocate plan.

diff --git a/planner/core/exhaust_physical_plans.go b/planner/core/exhaust_physical_plans.go
@@ -16,7 +16,6 @@ package core
 import (
 	"math"
 
-	"github.com/pingcap/errors"
 	"github.com/pingcap/parser/ast"
 	"github.com/pingcap/parser/model"
 	"github.com/pingcap/parser/mysql"
@@ -529,20 +528,20 @@ func (p *LogicalJoin) buildRangeForIndexJoin(indexInfo *model.IndexInfo, innerPl
 	// In `buildFakeEqCondsForIndexJoin`, we construct the equal conditions for join keys and remove filters that contain the join keys' column.
 	// When t1.a = t2.a and t1.a > 1, we can also guarantee that t1.a > 1 won't be chosen as the access condition.
 	// So the equal conditions we built can be successfully used to build a range if they can be used. They won't be affected by the existing filters.
-	ranges, accesses, moreRemained, _, err := ranger.DetachCondAndBuildRangeForIndex(p.ctx, access, idxCols, colLengths)
+	res, err := ranger.DetachCondAndBuildRangeForIndex(p.ctx, access, idxCols, colLengths)
 	if err != nil {
-		terror.Log(errors.Trace(err))
+		terror.Log(err)
 		return nil, nil, nil
 	}
 
 	// We should guarantee that all the join's equal condition is used.
 	for _, eqCond := range eqConds {
-		if !expression.Contains(accesses, eqCond) {
+		if !expression.Contains(res.AccessConds, eqCond) {
 			return nil, nil, nil
 		}
 	}
 
-	return ranges, append(remained, moreRemained...), keyOff2IdxOff
+	return res.Ranges, append(remained, res.RemainedConds...), keyOff2IdxOff
 }
 
 func (p *LogicalJoin) buildFakeEqCondsForIndexJoin(keys, idxCols []*expression.Column, colLengths []int,

diff --git a/planner/core/logical_plans.go b/planner/core/logical_plans.go
@@ -448,19 +448,22 @@ func (ds *DataSource) deriveTablePathStats(path *accessPath) (bool, error) {
 // And it will check whether this index is full matched by point query. We will use this check to
 // determine whether we remove other paths or not.
 func (ds *DataSource) deriveIndexPathStats(path *accessPath) (bool, error) {
-	var err error
 	sc := ds.ctx.GetSessionVars().StmtCtx
 	path.ranges = ranger.FullRange()
 	path.countAfterAccess = float64(ds.statisticTable.Count)
 	path.idxCols, path.idxColLens = expression.IndexInfo2Cols(ds.schema.Columns, path.index)
 	if len(path.idxCols) != 0 {
-		path.ranges, path.accessConds, path.tableFilters, path.eqCondCount, err = ranger.DetachCondAndBuildRangeForIndex(ds.ctx, ds.pushedDownConds, path.idxCols, path.idxColLens)
+		res, err := ranger.DetachCondAndBuildRangeForIndex(ds.ctx, ds.pushedDownConds, path.idxCols, path.idxColLens)
 		if err != nil {
-			return false, errors.Trace(err)
+			return false, err
 		}
+		path.ranges = res.Ranges
+		path.accessConds = res.AccessConds
+		path.tableFilters = res.RemainedConds
+		path.eqCondCount = res.EqCondCount
 		path.countAfterAccess, err = ds.stats.HistColl.GetRowCountByIndexRanges(sc, path.index.ID, path.ranges)
 		if err != nil {
-			return false, errors.Trace(err)
+			return false, err
 		}
 	} else {
 		path.tableFilters = ds.pushedDownConds

diff --git a/statistics/selectivity.go b/statistics/selectivity.go
@@ -39,6 +39,9 @@ type StatsNode struct {
 	Selectivity float64
 	// numCols is the number of columns contained in the index or column(which is always 1).
 	numCols int
+	// partCover indicates whether the bit in the mask is for a full cover or partial cover. It is only true
+	// when the condition is a DNF expression on index, and the expression is not totally extracted as access condition.
+	partCover bool
 }
 
 // The type of the StatsNode.
@@ -142,7 +145,6 @@ func isColEqCorCol(filter expression.Expression) *expression.Column {
 // Selectivity is a function calculate the selectivity of the expressions.
 // The definition of selectivity is (row count after filter / row count before filter).
 // And exprs must be CNF now, in other words, `exprs[0] and exprs[1] and ... and exprs[len - 1]` should be held when you call this.
-// TODO: support expressions that the top layer is a DNF.
 // Currently the time complexity is o(n^2).
 func (coll *HistColl) Selectivity(ctx sessionctx.Context, exprs []expression.Expression) (float64, []*StatsNode, error) {
 	// If table's count is zero or conditions are empty, we should return 100% selectivity.
@@ -186,7 +188,7 @@ func (coll *HistColl) Selectivity(ctx sessionctx.Context, exprs []expression.Exp
 	for id, colInfo := range coll.Columns {
 		col := expression.ColInfo2Col(extractedCols, colInfo.Info)
 		if col != nil {
-			maskCovered, ranges, err := getMaskAndRanges(ctx, remainedExprs, ranger.ColumnRangeType, nil, col)
+			maskCovered, ranges, _, err := getMaskAndRanges(ctx, remainedExprs, ranger.ColumnRangeType, nil, col)
 			if err != nil {
 				return 0, nil, errors.Trace(err)
 			}
@@ -215,7 +217,7 @@ func (coll *HistColl) Selectivity(ctx sessionctx.Context, exprs []expression.Exp
 			for i := 0; i < len(idxCols); i++ {
 				lengths = append(lengths, idxInfo.Info.Columns[i].Length)
 			}
-			maskCovered, ranges, err := getMaskAndRanges(ctx, remainedExprs, ranger.IndexRangeType, lengths, idxCols...)
+			maskCovered, ranges, partCover, err := getMaskAndRanges(ctx, remainedExprs, ranger.IndexRangeType, lengths, idxCols...)
 			if err != nil {
 				return 0, nil, errors.Trace(err)
 			}
@@ -231,6 +233,7 @@ func (coll *HistColl) Selectivity(ctx sessionctx.Context, exprs []expression.Exp
 				Ranges:      ranges,
 				numCols:     len(idxInfo.Info.Columns),
 				Selectivity: selectivity,
+				partCover:   partCover,
 			})
 		}
 	}
@@ -240,6 +243,13 @@ func (coll *HistColl) Selectivity(ctx sessionctx.Context, exprs []expression.Exp
 	for _, set := range usedSets {
 		mask &^= set.mask
 		ret *= set.Selectivity
+		// If `partCover` is true, it means that the conditions are in DNF form, and only part
+		// of the DNF expressions are extracted as access conditions, so in addition to the selectivity
+		// of the extracted access conditions, we multiply by another selectionFactor for the residual
+		// conditions.
+		if set.partCover {
+			ret *= selectionFactor
+		}
 	}
 	// If there's still conditions which cannot be calculated, we will multiply a selectionFactor.
 	if mask > 0 {
@@ -249,20 +259,27 @@ func (coll *HistColl) Selectivity(ctx sessionctx.Context, exprs []expression.Exp
 }
 
 func getMaskAndRanges(ctx sessionctx.Context, exprs []expression.Expression, rangeType ranger.RangeType,
-	lengths []int, cols ...*expression.Column) (mask int64, ranges []*ranger.Range, err error) {
+	lengths []int, cols ...*expression.Column) (mask int64, ranges []*ranger.Range, partCover bool, err error) {
 	sc := ctx.GetSessionVars().StmtCtx
-	var accessConds []expression.Expression
+	isDNF := false
+	var accessConds, remainedConds []expression.Expression
 	switch rangeType {
 	case ranger.ColumnRangeType:
 		accessConds = ranger.ExtractAccessConditionsForColumn(exprs, cols[0].UniqueID)
 		ranges, err = ranger.BuildColumnRange(accessConds, sc, cols[0].RetType)
 	case ranger.IndexRangeType:
-		ranges, accessConds, err = ranger.DetachSimpleCondAndBuildRangeForIndex(ctx, exprs, cols, lengths)
+		var res *ranger.DetachRangeResult
+		res, err = ranger.DetachCondAndBuildRangeForIndex(ctx, exprs, cols, lengths)
+		ranges, accessConds, remainedConds, isDNF = res.Ranges, res.AccessConds, res.RemainedConds, res.IsDNFCond
 	default:
 		panic("should never be here")
 	}
 	if err != nil {
-		return 0, nil, errors.Trace(err)
+		return 0, nil, false, err
+	}
+	if isDNF && len(accessConds) > 0 {
+		mask |= 1
+		return mask, ranges, len(remainedConds) > 0, nil
 	}
 	for i := range exprs {
 		for j := range accessConds {
@@ -272,7 +289,7 @@ func getMaskAndRanges(ctx sessionctx.Context, exprs []expression.Expression, ran
 			}
 		}
 	}
-	return mask, ranges, nil
+	return mask, ranges, false, nil
 }
 
 // getUsableSetsByGreedy will select the indices and pk used for calculate selectivity by greedy algorithm.

diff --git a/util/ranger/detacher.go b/util/ranger/detacher.go
@@ -138,16 +138,17 @@ func getEqOrInColOffset(expr expression.Expression, cols []*expression.Column) i
 // It will first find the point query column and then extract the range query column.
 // considerDNF is true means it will try to extract access conditions from the DNF expressions.
 func detachCNFCondAndBuildRangeForIndex(sctx sessionctx.Context, conditions []expression.Expression, cols []*expression.Column,
-	tpSlice []*types.FieldType, lengths []int, considerDNF bool) ([]*Range, []expression.Expression, []expression.Expression, int, error) {
+	tpSlice []*types.FieldType, lengths []int, considerDNF bool) (*DetachRangeResult, error) {
 	var (
 		eqCount int
 		ranges  []*Range
 		err     error
 	)
+	res := &DetachRangeResult{}
 
 	accessConds, filterConds, newConditions, emptyRange := extractEqAndInCondition(sctx, conditions, cols, lengths)
 	if emptyRange {
-		return ranges, nil, nil, 0, nil
+		return res, nil
 	}
 
 	for ; eqCount < len(accessConds); eqCount++ {
@@ -163,9 +164,13 @@ func detachCNFCondAndBuildRangeForIndex(sctx sessionctx.Context, conditions []ex
 		filterConds = append(filterConds, newConditions...)
 		ranges, err = buildCNFIndexRange(sctx.GetSessionVars().StmtCtx, cols, tpSlice, lengths, eqOrInCount, accessConds)
 		if err != nil {
-			return nil, nil, nil, 0, errors.Trace(err)
+			return res, err
 		}
-		return ranges, accessConds, filterConds, eqCount, nil
+		res.Ranges = ranges
+		res.AccessConds = accessConds
+		res.RemainedConds = filterConds
+		res.EqCondCount = eqCount
+		return res, nil
 	}
 	checker := &conditionChecker{
 		colUniqueID:   cols[eqOrInCount].UniqueID,
@@ -186,7 +191,11 @@ func detachCNFCondAndBuildRangeForIndex(sctx sessionctx.Context, conditions []ex
 		}
 	}
 	ranges, err = buildCNFIndexRange(sctx.GetSessionVars().StmtCtx, cols, tpSlice, lengths, eqOrInCount, accessConds)
-	return ranges, accessConds, filterConds, eqCount, errors.Trace(err)
+	res.Ranges = ranges
+	res.AccessConds = accessConds
+	res.RemainedConds = filterConds
+	res.EqCondCount = eqCount
+	return res, err
 }
 
 func extractEqAndInCondition(sctx sessionctx.Context, conditions []expression.Expression,
@@ -260,10 +269,13 @@ func detachDNFCondAndBuildRangeForIndex(sctx sessionctx.Context, condition *expr
 		if sf, ok := item.(*expression.ScalarFunction); ok && sf.FuncName.L == ast.LogicAnd {
 			cnfItems := expression.FlattenCNFConditions(sf)
 			var accesses, filters []expression.Expression
-			ranges, accesses, filters, _, err := detachCNFCondAndBuildRangeForIndex(sctx, cnfItems, cols, newTpSlice, lengths, true)
+			res, err := detachCNFCondAndBuildRangeForIndex(sctx, cnfItems, cols, newTpSlice, lengths, true)
 			if err != nil {
 				return nil, nil, false, nil
 			}
+			ranges := res.Ranges
+			accesses = res.AccessConds
+			filters = res.RemainedConds
 			if len(accesses) == 0 {
 				return FullRange(), nil, true, nil
 			}
@@ -297,11 +309,25 @@ func detachDNFCondAndBuildRangeForIndex(sctx sessionctx.Context, condition *expr
 	return totalRanges, []expression.Expression{expression.ComposeDNFCondition(sctx, newAccessItems...)}, hasResidual, nil
 }
 
+// DetachRangeResult wraps up results when detaching conditions and building ranges.
+type DetachRangeResult struct {
+	// Ranges is the ranges extracted and built from conditions.
+	Ranges []*Range
+	// AccessConds is the extracted conditions for access.
+	AccessConds []expression.Expression
+	// RemainedConds is the filter conditions which should be kept after access.
+	RemainedConds []expression.Expression
+	// EqCondCount is the number of equal conditions extracted.
+	EqCondCount int
+	// IsDNFCond indicates if the top layer of conditions is in DNF.
+	IsDNFCond bool
+}
+
 // DetachCondAndBuildRangeForIndex will detach the index filters from table filters.
-// If the top layer is DNF, we return a int slice which is eqAndInCount of every DNF item.
-// Otherwise just one number is returned.
+// The returned values are encapsulated into a struct DetachRangeResult, see its comments for explanation.
 func DetachCondAndBuildRangeForIndex(sctx sessionctx.Context, conditions []expression.Expression, cols []*expression.Column,
-	lengths []int) ([]*Range, []expression.Expression, []expression.Expression, int, error) {
+	lengths []int) (*DetachRangeResult, error) {
+	res := &DetachRangeResult{}
 	newTpSlice := make([]*types.FieldType, 0, len(cols))
 	for _, col := range cols {
 		newTpSlice = append(newTpSlice, newFieldType(col.RetType))
@@ -310,13 +336,17 @@ func DetachCondAndBuildRangeForIndex(sctx sessionctx.Context, conditions []expre
 		if sf, ok := conditions[0].(*expression.ScalarFunction); ok && sf.FuncName.L == ast.LogicOr {
 			ranges, accesses, hasResidual, err := detachDNFCondAndBuildRangeForIndex(sctx, sf, cols, newTpSlice, lengths)
 			if err != nil {
-				return nil, nil, nil, 0, errors.Trace(err)
+				return res, errors.Trace(err)
 			}
+			res.Ranges = ranges
+			res.AccessConds = accesses
+			res.IsDNFCond = true
 			// If this DNF have something cannot be to calculate range, then all this DNF should be pushed as filter condition.
 			if hasResidual {
-				return ranges, accesses, conditions, 0, nil
+				res.RemainedConds = conditions
+				return res, nil
 			}
-			return ranges, accesses, nil, 0, nil
+			return res, nil
 		}
 	}
 	return detachCNFCondAndBuildRangeForIndex(sctx, conditions, cols, newTpSlice, lengths, true)
@@ -325,13 +355,13 @@ func DetachCondAndBuildRangeForIndex(sctx sessionctx.Context, conditions []expre
 // DetachSimpleCondAndBuildRangeForIndex will detach the index filters from table filters.
 // It will find the point query column firstly and then extract the range query column.
 func DetachSimpleCondAndBuildRangeForIndex(sctx sessionctx.Context, conditions []expression.Expression,
-	cols []*expression.Column, lengths []int) (ranges []*Range, accessConds []expression.Expression, err error) {
+	cols []*expression.Column, lengths []int) ([]*Range, []expression.Expression, error) {
 	newTpSlice := make([]*types.FieldType, 0, len(cols))
 	for _, col := range cols {
 		newTpSlice = append(newTpSlice, newFieldType(col.RetType))
 	}
-	ranges, accessConds, _, _, err = detachCNFCondAndBuildRangeForIndex(sctx, conditions, cols, newTpSlice, lengths, false)
-	return ranges, accessConds, errors.Trace(err)
+	res, err := detachCNFCondAndBuildRangeForIndex(sctx, conditions, cols, newTpSlice, lengths, false)
+	return res.Ranges, res.AccessConds, err
 }
 
 func removeAccessConditions(conditions, accessConds []expression.Expression) []expression.Expression {

diff --git a/util/ranger/ranger_test.go b/util/ranger/ranger_test.go
@@ -594,11 +594,11 @@ func (s *testRangerSuite) TestIndexRange(c *C) {
 		}
 		cols, lengths := expression.IndexInfo2Cols(selection.Schema().Columns, tbl.Indices[tt.indexPos])
 		c.Assert(cols, NotNil)
-		ranges, conds, filter, _, err := ranger.DetachCondAndBuildRangeForIndex(ctx, conds, cols, lengths)
+		res, err := ranger.DetachCondAndBuildRangeForIndex(ctx, conds, cols, lengths)
 		c.Assert(err, IsNil)
-		c.Assert(fmt.Sprintf("%s", conds), Equals, tt.accessConds, Commentf("wrong access conditions for expr: %s", tt.exprStr))
-		c.Assert(fmt.Sprintf("%s", filter), Equals, tt.filterConds, Commentf("wrong filter conditions for expr: %s", tt.exprStr))
-		got := fmt.Sprintf("%v", ranges)
+		c.Assert(fmt.Sprintf("%s", res.AccessConds), Equals, tt.accessConds, Commentf("wrong access conditions for expr: %s", tt.exprStr))
+		c.Assert(fmt.Sprintf("%s", res.RemainedConds), Equals, tt.filterConds, Commentf("wrong filter conditions for expr: %s", tt.exprStr))
+		got := fmt.Sprintf("%v", res.Ranges)
 		c.Assert(got, Equals, tt.resultStr, Commentf("different for expr %s", tt.exprStr))
 	}
 }
@@ -681,11 +681,11 @@ func (s *testRangerSuite) TestIndexRangeForUnsignedInt(c *C) {
 		}
 		cols, lengths := expression.IndexInfo2Cols(selection.Schema().Columns, tbl.Indices[tt.indexPos])
 		c.Assert(cols, NotNil)
-		ranges, conds, filter, _, err := ranger.DetachCondAndBuildRangeForIndex(ctx, conds, cols, lengths)
+		res, err := ranger.DetachCondAndBuildRangeForIndex(ctx, conds, cols, lengths)
 		c.Assert(err, IsNil)
-		c.Assert(fmt.Sprintf("%s", conds), Equals, tt.accessConds, Commentf("wrong access conditions for expr: %s", tt.exprStr))
-		c.Assert(fmt.Sprintf("%s", filter), Equals, tt.filterConds, Commentf("wrong filter conditions for expr: %s", tt.exprStr))
-		got := fmt.Sprintf("%v", ranges)
+		c.Assert(fmt.Sprintf("%s", res.AccessConds), Equals, tt.accessConds, Commentf("wrong access conditions for expr: %s", tt.exprStr))
+		c.Assert(fmt.Sprintf("%s", res.RemainedConds), Equals, tt.filterConds, Commentf("wrong filter conditions for expr: %s", tt.exprStr))
+		got := fmt.Sprintf("%v", res.Ranges)
 		c.Assert(got, Equals, tt.resultStr, Commentf("different for expr %s", tt.exprStr))
 	}
 }