Skip to content

Commit

Permalink
planner: unify the terms NDV and cardinality in the optimizer (#26345) (
Browse files Browse the repository at this point in the history
  • Loading branch information
ti-srebot authored Aug 30, 2021
1 parent 2f400f0 commit bbb1dbc
Show file tree
Hide file tree
Showing 8 changed files with 90 additions and 90 deletions.
6 changes: 3 additions & 3 deletions planner/core/exhaust_physical_plans.go
Original file line number Diff line number Diff line change
Expand Up @@ -961,7 +961,7 @@ func (p *LogicalJoin) constructInnerTableScanTask(
// TableScan as inner child of IndexJoin can return at most 1 tuple for each outer row.
RowCount: math.Min(1.0, countAfterAccess),
StatsVersion: ds.stats.StatsVersion,
// Cardinality would not be used in cost computation of IndexJoin, so leave it as default nil.
// NDV would not be used in cost computation of IndexJoin, so leave it as default nil.
}
rowSize := ds.TblColHists.GetTableAvgRowSize(p.ctx, ds.TblCols, ts.StoreType, true)
sessVars := ds.ctx.GetSessionVars()
Expand Down Expand Up @@ -1431,7 +1431,7 @@ func (ijHelper *indexJoinBuildHelper) updateBestChoice(ranges []*ranger.Range, p
}
var innerNDV float64
if stats := ijHelper.innerPlan.statsInfo(); stats != nil && stats.StatsVersion != statistics.PseudoVersion {
innerNDV = getCardinality(path.IdxCols[:usedColsLen], ijHelper.innerPlan.Schema(), stats)
innerNDV = getColsNDV(path.IdxCols[:usedColsLen], ijHelper.innerPlan.Schema(), stats)
}
// We choose the index by the NDV of the used columns, the larger the better.
// If NDVs are same, we choose index which uses more columns.
Expand Down Expand Up @@ -2139,7 +2139,7 @@ func (la *LogicalApply) exhaustPhysicalPlans(prop *property.PhysicalProperty) ([
}
cacheHitRatio := 0.0
if la.stats.RowCount != 0 {
ndv := getCardinality(columns, la.schema, la.stats)
ndv := getColsNDV(columns, la.schema, la.stats)
// for example, if there are 100 rows and the number of distinct values of these correlated columns
// is 70, then we can assume 30 rows can hit the cache so the cache hit ratio is 1 - (70/100) = 0.3
cacheHitRatio = 1 - (ndv / la.stats.RowCount)
Expand Down
4 changes: 2 additions & 2 deletions planner/core/plan.go
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ func optimizeByShuffle4Window(pp *PhysicalWindow, ctx sessionctx.Context) *Physi
for _, item := range pp.PartitionBy {
partitionBy = append(partitionBy, item.Col)
}
NDV := int(getCardinality(partitionBy, dataSource.Schema(), dataSource.statsInfo()))
NDV := int(getColsNDV(partitionBy, dataSource.Schema(), dataSource.statsInfo()))
if NDV <= 1 {
return nil
}
Expand Down Expand Up @@ -167,7 +167,7 @@ func optimizeByShuffle4StreamAgg(pp *PhysicalStreamAgg, ctx sessionctx.Context)
partitionBy = append(partitionBy, col)
}
}
NDV := int(getCardinality(partitionBy, dataSource.Schema(), dataSource.statsInfo()))
NDV := int(getColsNDV(partitionBy, dataSource.Schema(), dataSource.statsInfo()))
if NDV <= 1 {
return nil
}
Expand Down
144 changes: 72 additions & 72 deletions planner/core/stats.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,11 @@ func (p *LogicalTableDual) DeriveStats(childStats []*property.StatsInfo, selfSch
return p.stats, nil
}
profile := &property.StatsInfo{
RowCount: float64(p.RowCount),
Cardinality: make(map[int64]float64, selfSchema.Len()),
RowCount: float64(p.RowCount),
ColNDVs: make(map[int64]float64, selfSchema.Len()),
}
for _, col := range selfSchema.Columns {
profile.Cardinality[col.UniqueID] = float64(p.RowCount)
profile.ColNDVs[col.UniqueID] = float64(p.RowCount)
}
p.stats = profile
return p.stats, nil
Expand All @@ -58,12 +58,12 @@ func (p *LogicalMemTable) DeriveStats(childStats []*property.StatsInfo, selfSche
statsTable := statistics.PseudoTable(p.TableInfo)
stats := &property.StatsInfo{
RowCount: float64(statsTable.Count),
Cardinality: make(map[int64]float64, len(p.TableInfo.Columns)),
ColNDVs: make(map[int64]float64, len(p.TableInfo.Columns)),
HistColl: statsTable.GenerateHistCollFromColumnInfo(p.TableInfo.Columns, p.schema.Columns),
StatsVersion: statistics.PseudoVersion,
}
for _, col := range selfSchema.Columns {
stats.Cardinality[col.UniqueID] = float64(statsTable.Count)
stats.ColNDVs[col.UniqueID] = float64(statsTable.Count)
}
p.stats = stats
return p.stats, nil
Expand All @@ -81,11 +81,11 @@ func (p *LogicalShow) DeriveStats(childStats []*property.StatsInfo, selfSchema *

func getFakeStats(schema *expression.Schema) *property.StatsInfo {
profile := &property.StatsInfo{
RowCount: 1,
Cardinality: make(map[int64]float64, schema.Len()),
RowCount: 1,
ColNDVs: make(map[int64]float64, schema.Len()),
}
for _, col := range schema.Columns {
profile.Cardinality[col.UniqueID] = 1
profile.ColNDVs[col.UniqueID] = 1
}
return profile
}
Expand Down Expand Up @@ -144,11 +144,11 @@ func (p *baseLogicalPlan) DeriveStats(childStats []*property.StatsInfo, selfSche
return p.stats, nil
}
profile := &property.StatsInfo{
RowCount: float64(1),
Cardinality: make(map[int64]float64, selfSchema.Len()),
RowCount: float64(1),
ColNDVs: make(map[int64]float64, selfSchema.Len()),
}
for _, col := range selfSchema.Columns {
profile.Cardinality[col.UniqueID] = 1
profile.ColNDVs[col.UniqueID] = 1
}
p.stats = profile
return profile, nil
Expand Down Expand Up @@ -221,15 +221,15 @@ func (ds *DataSource) initStats(colGroups [][]*expression.Column) {
}
tableStats := &property.StatsInfo{
RowCount: float64(ds.statisticTable.Count),
Cardinality: make(map[int64]float64, ds.schema.Len()),
ColNDVs: make(map[int64]float64, ds.schema.Len()),
HistColl: ds.statisticTable.GenerateHistCollFromColumnInfo(ds.Columns, ds.schema.Columns),
StatsVersion: ds.statisticTable.Version,
}
if ds.statisticTable.Pseudo {
tableStats.StatsVersion = statistics.PseudoVersion
}
for _, col := range ds.schema.Columns {
tableStats.Cardinality[col.UniqueID] = ds.getColumnNDV(col.ID)
tableStats.ColNDVs[col.UniqueID] = ds.getColumnNDV(col.ID)
}
ds.tableStats = tableStats
ds.tableStats.GroupNDVs = ds.getGroupNDVs(colGroups)
Expand Down Expand Up @@ -608,24 +608,24 @@ func (p *LogicalUnionAll) DeriveStats(childStats []*property.StatsInfo, selfSche
return p.stats, nil
}
p.stats = &property.StatsInfo{
Cardinality: make(map[int64]float64, selfSchema.Len()),
ColNDVs: make(map[int64]float64, selfSchema.Len()),
}
for _, childProfile := range childStats {
p.stats.RowCount += childProfile.RowCount
for _, col := range selfSchema.Columns {
p.stats.Cardinality[col.UniqueID] += childProfile.Cardinality[col.UniqueID]
p.stats.ColNDVs[col.UniqueID] += childProfile.ColNDVs[col.UniqueID]
}
}
return p.stats, nil
}

func deriveLimitStats(childProfile *property.StatsInfo, limitCount float64) *property.StatsInfo {
stats := &property.StatsInfo{
RowCount: math.Min(limitCount, childProfile.RowCount),
Cardinality: make(map[int64]float64, len(childProfile.Cardinality)),
RowCount: math.Min(limitCount, childProfile.RowCount),
ColNDVs: make(map[int64]float64, len(childProfile.ColNDVs)),
}
for id, c := range childProfile.Cardinality {
stats.Cardinality[id] = math.Min(c, stats.RowCount)
for id, c := range childProfile.ColNDVs {
stats.ColNDVs[id] = math.Min(c, stats.RowCount)
}
return stats
}
Expand Down Expand Up @@ -671,25 +671,25 @@ func getGroupNDV4Cols(cols []*expression.Column, stats *property.StatsInfo) *pro
return nil
}

// getCardinality returns the Cardinality of a couple of columns.
// If the columns match any GroupNDV maintained by child operator, we can get an accurate cardinality.
// Otherwise, we simply return the max cardinality among the columns, which is a lower bound.
func getCardinality(cols []*expression.Column, schema *expression.Schema, profile *property.StatsInfo) float64 {
cardinality := 1.0
// getColsNDV returns the NDV of a couple of columns.
// If the columns match any GroupNDV maintained by child operator, we can get an accurate NDV.
// Otherwise, we simply return the max NDV among the columns, which is a lower bound.
func getColsNDV(cols []*expression.Column, schema *expression.Schema, profile *property.StatsInfo) float64 {
NDV := 1.0
if groupNDV := getGroupNDV4Cols(cols, profile); groupNDV != nil {
return math.Max(groupNDV.NDV, cardinality)
return math.Max(groupNDV.NDV, NDV)
}
indices := schema.ColumnsIndices(cols)
if indices == nil {
logutil.BgLogger().Error("column not found in schema", zap.Any("columns", cols), zap.String("schema", schema.String()))
return cardinality
return NDV
}
for _, idx := range indices {
// It is a very naive estimation.
col := schema.Columns[idx]
cardinality = math.Max(cardinality, profile.Cardinality[col.UniqueID])
NDV = math.Max(NDV, profile.ColNDVs[col.UniqueID])
}
return cardinality
return NDV
}

func (p *LogicalProjection) getGroupNDVs(colGroups [][]*expression.Column, childProfile *property.StatsInfo, selfSchema *expression.Schema) []property.GroupNDV {
Expand Down Expand Up @@ -739,12 +739,12 @@ func (p *LogicalProjection) DeriveStats(childStats []*property.StatsInfo, selfSc
return p.stats, nil
}
p.stats = &property.StatsInfo{
RowCount: childProfile.RowCount,
Cardinality: make(map[int64]float64, len(p.Exprs)),
RowCount: childProfile.RowCount,
ColNDVs: make(map[int64]float64, len(p.Exprs)),
}
for i, expr := range p.Exprs {
cols := expression.ExtractColumns(expr)
p.stats.Cardinality[selfSchema.Columns[i].UniqueID] = getCardinality(cols, childSchema[0], childProfile)
p.stats.ColNDVs[selfSchema.Columns[i].UniqueID] = getColsNDV(cols, childSchema[0], childProfile)
}
p.stats.GroupNDVs = p.getGroupNDVs(colGroups, childProfile, selfSchema)
return p.stats, nil
Expand Down Expand Up @@ -785,7 +785,7 @@ func (la *LogicalAggregation) getGroupNDVs(colGroups [][]*expression.Column, chi
}
// Check if the child profile provides GroupNDV for the GROUP BY columns.
// Note that gbyCols may not be the exact GROUP BY columns, e.g, GROUP BY a+b,
// but we have no other approaches for the cardinality estimation of these cases
// but we have no other approaches for the NDV estimation of these cases
// except for using the independent assumption, unless we can use stats of expression index.
groupNDV := getGroupNDV4Cols(gbyCols, childProfile)
if groupNDV == nil {
Expand All @@ -807,14 +807,14 @@ func (la *LogicalAggregation) DeriveStats(childStats []*property.StatsInfo, self
la.stats.GroupNDVs = la.getGroupNDVs(colGroups, childProfile, gbyCols)
return la.stats, nil
}
cardinality := getCardinality(gbyCols, childSchema[0], childProfile)
NDV := getColsNDV(gbyCols, childSchema[0], childProfile)
la.stats = &property.StatsInfo{
RowCount: cardinality,
Cardinality: make(map[int64]float64, selfSchema.Len()),
RowCount: NDV,
ColNDVs: make(map[int64]float64, selfSchema.Len()),
}
// We cannot estimate the Cardinality for every output, so we use a conservative strategy.
// We cannot estimate the ColNDVs for every output, so we use a conservative strategy.
for _, col := range selfSchema.Columns {
la.stats.Cardinality[col.UniqueID] = cardinality
la.stats.ColNDVs[col.UniqueID] = NDV
}
la.inputCount = childProfile.RowCount
la.stats.GroupNDVs = la.getGroupNDVs(colGroups, childProfile, gbyCols)
Expand All @@ -826,7 +826,7 @@ func (la *LogicalAggregation) ExtractColGroups(_ [][]*expression.Column) [][]*ex
// Parent colGroups would be discarded, because aggregation would make NDV of colGroups
// which does not match GroupByItems invalid.
// Note that gbyCols may not be the exact GROUP BY columns, e.g, GROUP BY a+b,
// but we have no other approaches for the cardinality estimation of these cases
// but we have no other approaches for the NDV estimation of these cases
// except for using the independent assumption, unless we can use stats of expression index.
gbyCols := make([]*expression.Column, 0, len(la.GroupByItems))
for _, gbyExpr := range la.GroupByItems {
Expand Down Expand Up @@ -854,9 +854,9 @@ func (p *LogicalJoin) getGroupNDVs(colGroups [][]*expression.Column, childStats

// DeriveStats implements the LogicalPlan DeriveStats interface.
// If the type of join is SemiJoin, the selectivity of it will be same as selection's.
// If the type of join is LeftOuterSemiJoin, it will not add or remove any row. The last column is a boolean value, whose Cardinality should be two.
// If the type of join is LeftOuterSemiJoin, it will not add or remove any row. The last column is a boolean value, whose NDV should be two.
// If the type of join is inner/outer join, the output of join(s, t) should be N(s) * N(t) / (V(s.key) * V(t.key)) * Min(V(s.key), V(t.key)).
// N(s) stands for the number of rows in relation s. V(s.key) means the Cardinality of join key in s.
// N(s) stands for the number of rows in relation s. V(s.key) means the NDV of join key in s.
// This is a quite simple strategy: We assume every bucket of relation which will participate join has the same number of rows, and apply cross join for
// every matched bucket.
func (p *LogicalJoin) DeriveStats(childStats []*property.StatsInfo, selfSchema *expression.Schema, childSchema []*expression.Schema, colGroups [][]*expression.Column) (*property.StatsInfo, error) {
Expand All @@ -879,23 +879,23 @@ func (p *LogicalJoin) DeriveStats(childStats []*property.StatsInfo, selfSchema *
p.equalCondOutCnt = helper.estimate()
if p.JoinType == SemiJoin || p.JoinType == AntiSemiJoin {
p.stats = &property.StatsInfo{
RowCount: leftProfile.RowCount * SelectionFactor,
Cardinality: make(map[int64]float64, len(leftProfile.Cardinality)),
RowCount: leftProfile.RowCount * SelectionFactor,
ColNDVs: make(map[int64]float64, len(leftProfile.ColNDVs)),
}
for id, c := range leftProfile.Cardinality {
p.stats.Cardinality[id] = c * SelectionFactor
for id, c := range leftProfile.ColNDVs {
p.stats.ColNDVs[id] = c * SelectionFactor
}
return p.stats, nil
}
if p.JoinType == LeftOuterSemiJoin || p.JoinType == AntiLeftOuterSemiJoin {
p.stats = &property.StatsInfo{
RowCount: leftProfile.RowCount,
Cardinality: make(map[int64]float64, selfSchema.Len()),
RowCount: leftProfile.RowCount,
ColNDVs: make(map[int64]float64, selfSchema.Len()),
}
for id, c := range leftProfile.Cardinality {
p.stats.Cardinality[id] = c
for id, c := range leftProfile.ColNDVs {
p.stats.ColNDVs[id] = c
}
p.stats.Cardinality[selfSchema.Columns[selfSchema.Len()-1].UniqueID] = 2.0
p.stats.ColNDVs[selfSchema.Columns[selfSchema.Len()-1].UniqueID] = 2.0
p.stats.GroupNDVs = p.getGroupNDVs(colGroups, childStats)
return p.stats, nil
}
Expand All @@ -905,16 +905,16 @@ func (p *LogicalJoin) DeriveStats(childStats []*property.StatsInfo, selfSchema *
} else if p.JoinType == RightOuterJoin {
count = math.Max(count, rightProfile.RowCount)
}
cardinality := make(map[int64]float64, selfSchema.Len())
for id, c := range leftProfile.Cardinality {
cardinality[id] = math.Min(c, count)
colNDVs := make(map[int64]float64, selfSchema.Len())
for id, c := range leftProfile.ColNDVs {
colNDVs[id] = math.Min(c, count)
}
for id, c := range rightProfile.Cardinality {
cardinality[id] = math.Min(c, count)
for id, c := range rightProfile.ColNDVs {
colNDVs[id] = math.Min(c, count)
}
p.stats = &property.StatsInfo{
RowCount: count,
Cardinality: cardinality,
RowCount: count,
ColNDVs: colNDVs,
}
p.stats.GroupNDVs = p.getGroupNDVs(colGroups, childStats)
return p.stats, nil
Expand Down Expand Up @@ -960,9 +960,9 @@ func (h *fullJoinRowCountHelper) estimate() float64 {
if h.cartesian {
return h.leftProfile.RowCount * h.rightProfile.RowCount
}
leftKeyCardinality := getCardinality(h.leftJoinKeys, h.leftSchema, h.leftProfile)
rightKeyCardinality := getCardinality(h.rightJoinKeys, h.rightSchema, h.rightProfile)
count := h.leftProfile.RowCount * h.rightProfile.RowCount / math.Max(leftKeyCardinality, rightKeyCardinality)
leftKeyNDV := getColsNDV(h.leftJoinKeys, h.leftSchema, h.leftProfile)
rightKeyNDV := getColsNDV(h.rightJoinKeys, h.rightSchema, h.rightProfile)
count := h.leftProfile.RowCount * h.rightProfile.RowCount / math.Max(leftKeyNDV, rightKeyNDV)
return count
}

Expand All @@ -982,17 +982,17 @@ func (la *LogicalApply) DeriveStats(childStats []*property.StatsInfo, selfSchema
}
leftProfile := childStats[0]
la.stats = &property.StatsInfo{
RowCount: leftProfile.RowCount,
Cardinality: make(map[int64]float64, selfSchema.Len()),
RowCount: leftProfile.RowCount,
ColNDVs: make(map[int64]float64, selfSchema.Len()),
}
for id, c := range leftProfile.Cardinality {
la.stats.Cardinality[id] = c
for id, c := range leftProfile.ColNDVs {
la.stats.ColNDVs[id] = c
}
if la.JoinType == LeftOuterSemiJoin || la.JoinType == AntiLeftOuterSemiJoin {
la.stats.Cardinality[selfSchema.Columns[selfSchema.Len()-1].UniqueID] = 2.0
la.stats.ColNDVs[selfSchema.Columns[selfSchema.Len()-1].UniqueID] = 2.0
} else {
for i := childSchema[0].Len(); i < selfSchema.Len(); i++ {
la.stats.Cardinality[selfSchema.Columns[i].UniqueID] = leftProfile.RowCount
la.stats.ColNDVs[selfSchema.Columns[i].UniqueID] = leftProfile.RowCount
}
}
la.stats.GroupNDVs = la.getGroupNDVs(colGroups, childStats)
Expand Down Expand Up @@ -1023,11 +1023,11 @@ func (la *LogicalApply) ExtractColGroups(colGroups [][]*expression.Column) [][]*
// Exists and MaxOneRow produce at most one row, so we set the RowCount of stats one.
func getSingletonStats(schema *expression.Schema) *property.StatsInfo {
ret := &property.StatsInfo{
RowCount: 1.0,
Cardinality: make(map[int64]float64, schema.Len()),
RowCount: 1.0,
ColNDVs: make(map[int64]float64, schema.Len()),
}
for _, col := range schema.Columns {
ret.Cardinality[col.UniqueID] = 1
ret.ColNDVs[col.UniqueID] = 1
}
return ret
}
Expand Down Expand Up @@ -1057,16 +1057,16 @@ func (p *LogicalWindow) DeriveStats(childStats []*property.StatsInfo, selfSchema
}
childProfile := childStats[0]
p.stats = &property.StatsInfo{
RowCount: childProfile.RowCount,
Cardinality: make(map[int64]float64, selfSchema.Len()),
RowCount: childProfile.RowCount,
ColNDVs: make(map[int64]float64, selfSchema.Len()),
}
childLen := selfSchema.Len() - len(p.WindowFuncDescs)
for i := 0; i < childLen; i++ {
id := selfSchema.Columns[i].UniqueID
p.stats.Cardinality[id] = childProfile.Cardinality[id]
p.stats.ColNDVs[id] = childProfile.ColNDVs[id]
}
for i := childLen; i < selfSchema.Len(); i++ {
p.stats.Cardinality[selfSchema.Columns[i].UniqueID] = childProfile.RowCount
p.stats.ColNDVs[selfSchema.Columns[i].UniqueID] = childProfile.RowCount
}
p.stats.GroupNDVs = p.getGroupNDVs(colGroups, childStats)
return p.stats, nil
Expand Down
2 changes: 1 addition & 1 deletion planner/core/stats_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ func (s *testStatsSuite) TestGroupNDVs(c *C) {
}
}

func (s *testStatsSuite) TestCardinalityGroupCols(c *C) {
func (s *testStatsSuite) TestNDVGroupCols(c *C) {
store, dom, err := newStoreWithBootstrap()
c.Assert(err, IsNil)
defer func() {
Expand Down
Loading

0 comments on commit bbb1dbc

Please sign in to comment.