diff --git a/pkg/sql/opt/norm/general_funcs.go b/pkg/sql/opt/norm/general_funcs.go index 6fb72af85e36..905ce48fc5d6 100644 --- a/pkg/sql/opt/norm/general_funcs.go +++ b/pkg/sql/opt/norm/general_funcs.go @@ -320,11 +320,11 @@ func (c *CustomFuncs) IsBoundBy(src opt.Expr, cols opt.ColSet) bool { return c.OuterCols(src).SubsetOf(cols) } -// IsDeterminedBy returns true if all outer references in the source expression -// are bound by the closure of the given columns according to the functional -// dependencies of the input expression. -func (c *CustomFuncs) IsDeterminedBy(src opt.Expr, cols opt.ColSet, input memo.RelExpr) bool { - return input.Relational().FuncDeps.InClosureOf(c.OuterCols(src), cols) +// ColsAreDeterminedBy returns true if the given columns are functionally +// determined by the "in" ColSet according to the functional dependencies of the +// input expression. +func (c *CustomFuncs) ColsAreDeterminedBy(cols, in opt.ColSet, input memo.RelExpr) bool { + return input.Relational().FuncDeps.InClosureOf(cols, in) } // AreProjectionsCorrelated returns true if any element in the projections diff --git a/pkg/sql/opt/norm/groupby_funcs.go b/pkg/sql/opt/norm/groupby_funcs.go index 3b9d9ce28738..982a5bf5724c 100644 --- a/pkg/sql/opt/norm/groupby_funcs.go +++ b/pkg/sql/opt/norm/groupby_funcs.go @@ -308,3 +308,74 @@ func (c *CustomFuncs) areRowsDistinct( return true } + +// CanMergeAggs returns true if the given inner and outer AggregationsExprs can +// be replaced with a single equivalent AggregationsExpr. +func (c *CustomFuncs) CanMergeAggs(innerAggs, outerAggs memo.AggregationsExpr) bool { + // Create a mapping from the output ColumnID of each inner aggregate to its + // operator type. + innerColsToAggOps := map[opt.ColumnID]opt.Operator{} + for i := range innerAggs { + innerAgg := innerAggs[i].Agg + if !opt.IsAggregateOp(innerAgg) { + // Aggregate can't be an AggFilter or AggDistinct. 
+ return false + } + innerColsToAggOps[innerAggs[i].Col] = innerAgg.Op() + } + + for i := range outerAggs { + outerAgg := outerAggs[i].Agg + if !opt.IsAggregateOp(outerAgg) { + // Aggregate can't be an AggFilter or AggDistinct. + return false + } + if outerAgg.ChildCount() != 1 { + // There are no valid inner-outer aggregate pairs for which the ChildCount + // of the outer is not equal to one. + return false + } + input, ok := outerAgg.Child(0).(*memo.VariableExpr) + if !ok { + // The outer aggregate does not directly aggregate on a column. + return false + } + innerOp, ok := innerColsToAggOps[input.Col] + if !ok { + // This outer aggregate does not reference an inner aggregate. + return false + } + if !opt.AggregatesCanMerge(innerOp, outerAgg.Op()) { + // There is no single aggregate that can replace this pair. + return false + } + } + return true +} + +// MergeAggs returns an AggregationsExpr that is equivalent to the two given +// AggregationsExprs. MergeAggs will panic if CanMergeAggs is false. +func (c *CustomFuncs) MergeAggs(innerAggs, outerAggs memo.AggregationsExpr) memo.AggregationsExpr { + // Create a mapping from the output ColumnIDs of the inner aggregates to their + // indices in innerAggs. + innerColsToAggs := map[opt.ColumnID]int{} + for i := range innerAggs { + innerColsToAggs[innerAggs[i].Col] = i + } + + newAggs := make(memo.AggregationsExpr, len(outerAggs)) + for i := range outerAggs { + // For each outer aggregate, construct a new aggregate that takes the Agg + // field of the referenced inner aggregate and the Col field of the outer + // aggregate. This works because CanMergeAggs has already verified that + // every inner-outer aggregate pair forms a valid decomposition for the + // inner aggregate. In most cases, the inner and outer aggregates are the + // same, but in the count and count-rows cases the inner aggregate must + // be used (see opt.AggregatesCanMerge for details). 
The column from the + // outer aggregate has to be used to preserve logical equivalency. + inputCol := outerAggs[i].Agg.Child(0).(*memo.VariableExpr).Col + innerAgg := innerAggs[innerColsToAggs[inputCol]].Agg + newAggs[i] = c.f.ConstructAggregationsItem(innerAgg, outerAggs[i].Col) + } + return newAggs +} diff --git a/pkg/sql/opt/norm/rules/groupby.opt b/pkg/sql/opt/norm/rules/groupby.opt index bfb21db7555e..9640416c7297 100644 --- a/pkg/sql/opt/norm/rules/groupby.opt +++ b/pkg/sql/opt/norm/rules/groupby.opt @@ -304,3 +304,91 @@ (ReplaceAggregationsItem $aggregations $item (CountRows)) $groupingPrivate ) + +# FoldGroupingOperators folds two grouping operators into one equivalent +# operator. As an example, the following pairs of queries are equivalent: +# +# SELECT sum(t) FROM (SELECT sum(b) FROM ab GROUP BY a) AS g(t); +# SELECT sum(b) FROM ab; +# +# SELECT max(t) FROM (SELECT max(b) FROM ab GROUP BY a) AS g(t); +# SELECT max(b) FROM ab; +# +# SELECT sum_int(t) FROM (SELECT count(b) FROM ab GROUP BY a) AS g(t); +# SELECT count(b) FROM ab; +# +# This transformation is possible when the following conditions are met: +# +# 1. All of the outer aggregates are aggregating on the output columns of the +# inner aggregates. +# 2. All of the inner-outer aggregate pairs can be replaced with an equivalent +# single aggregate. (See the AggregatesCanMerge comment in operator.go). +# 3. The grouping columns of the inner operator functionally determine the +# grouping columns of the outer operator according to the functional +# dependencies of the input of the inner operator. +# 4. Both grouping operators are unordered. +# +# Why is it sufficient for the inner grouping columns to functionally determine +# the outer grouping columns? +# * Duplicate values in the determinant ("from" side) imply duplicate values in +# the dependent ("to" side). +# * Grouping on the determinant will not remove unique values from the +# determinant. 
Therefore, the grouping will not remove unique values from the +# dependent, by the properties of functional dependencies. +# * Grouping on the dependent will simply reduce the dependent to its unique +# values. +# * Therefore, grouping on the dependent produces the same final groups as +# grouping on the dependent after grouping on the determinant. +# * Conditions #2 and #4 guarantee that the aggregates produce the same result +# regardless of how the grouping is accomplished, as long as the same groups +# result in the end. +# +# Take the following table as an example: +# +# r a b +# ----- +# 1 4 3 +# 2 4 3 +# 3 2 3 +# 4 2 3 +# 5 6 5 +# 6 6 5 +# +# Its functional dependencies: key(r), r-->(a, b), a-->(b) +# +# Here are some examples of possible groupings taking the sum over the "r" +# column: +# +# Grouping by a: SUM(1, 2), SUM(3, 4), SUM(5, 6) +# Grouping by b: SUM(1, 2, 3, 4), SUM(5, 6) +# Grouping by a then b: SUM(SUM(1, 2), SUM(3, 4)), SUM(SUM(5, 6)) +# +# Rows can always be grouped together by subsequent groupings, but they can +# never be "ungrouped". Grouping on a does not group any rows together that +# would not also be grouped by b. +# +# This situation is rare in direct SQL queries, but can arise when composing +# views and queries. 
+[FoldGroupingOperators, Normalize] +(GroupBy | ScalarGroupBy + (GroupBy + $innerInput:* + $innerAggs:* + $innerGrouping:* & (IsUnorderedGrouping $innerGrouping) + ) + $outerAggs:* + $outerGrouping:* & + (IsUnorderedGrouping $outerGrouping) & + (ColsAreDeterminedBy + $outerGroupingCols:(GroupingCols $outerGrouping) + (GroupingCols $innerGrouping) + $innerInput + ) & + (CanMergeAggs $innerAggs $outerAggs) +) +=> +((OpName) + $innerInput + (MergeAggs $innerAggs $outerAggs) + (MakeGrouping $outerGroupingCols (EmptyOrdering)) +) diff --git a/pkg/sql/opt/norm/rules/window.opt b/pkg/sql/opt/norm/rules/window.opt index 4317231bef02..f222d7f319cb 100644 --- a/pkg/sql/opt/norm/rules/window.opt +++ b/pkg/sql/opt/norm/rules/window.opt @@ -58,8 +58,8 @@ $input $filters:[ ... $item:* & - (IsDeterminedBy - $item + (ColsAreDeterminedBy + (OuterCols $item) $partitionCols:(WindowPartition $private) $input ) diff --git a/pkg/sql/opt/norm/testdata/rules/groupby b/pkg/sql/opt/norm/testdata/rules/groupby index 6932edd38433..bb1bfd5766a8 100644 --- a/pkg/sql/opt/norm/testdata/rules/groupby +++ b/pkg/sql/opt/norm/testdata/rules/groupby @@ -2917,3 +2917,255 @@ project └── agg-distinct [as=count:6, outer=(2)] └── count └── y:2 + +# -------------------------------------------------- +# FoldGroupingOperators +# -------------------------------------------------- + +# Case with sum aggregate. +norm expect=FoldGroupingOperators +SELECT sum(s) FROM (SELECT sum(x) FROM xy GROUP BY y) AS f(s) +---- +scalar-group-by + ├── columns: sum:4 + ├── cardinality: [1 - 1] + ├── key: () + ├── fd: ()-->(4) + ├── scan xy + │ ├── columns: x:1!null + │ └── key: (1) + └── aggregations + └── sum [as=sum:4, outer=(1)] + └── x:1 + +# Case with count-rows aggregate. 
+norm expect=FoldGroupingOperators +SELECT sum_int(c) FROM (SELECT count(x) FROM xy GROUP BY y) AS f(c) +---- +scalar-group-by + ├── columns: sum_int:4!null + ├── cardinality: [1 - 1] + ├── key: () + ├── fd: ()-->(4) + ├── scan xy + └── aggregations + └── count-rows [as=sum_int:4] + +# Case with a count aggregate. +norm expect=FoldGroupingOperators +SELECT sum_int(cnt) FROM (SELECT count(c2) FROM nullablecols GROUP BY c1) AS f(cnt) +---- +scalar-group-by + ├── columns: sum_int:6!null + ├── cardinality: [1 - 1] + ├── key: () + ├── fd: ()-->(6) + ├── scan nullablecols + │ └── columns: c2:2 + └── aggregations + └── count [as=sum_int:6, outer=(2)] + └── c2:2 + +# Case with max aggregate. +norm expect=FoldGroupingOperators +SELECT max(m) FROM (SELECT max(x) FROM xy GROUP BY y) AS f(m) +---- +scalar-group-by + ├── columns: max:4 + ├── cardinality: [1 - 1] + ├── key: () + ├── fd: ()-->(4) + ├── scan xy + │ ├── columns: x:1!null + │ └── key: (1) + └── aggregations + └── max [as=max:4, outer=(1)] + └── x:1 + +# Case with bit_and aggregate. +norm expect=FoldGroupingOperators +SELECT bit_and(b) FROM (SELECT bit_and(x) FROM xy GROUP BY y) AS f(b) +---- +scalar-group-by + ├── columns: bit_and:4 + ├── cardinality: [1 - 1] + ├── key: () + ├── fd: ()-->(4) + ├── scan xy + │ ├── columns: x:1!null + │ └── key: (1) + └── aggregations + └── bit-and-agg [as=bit_and:4, outer=(1)] + └── x:1 + +# Case with multiple aggregates. 
+norm expect=FoldGroupingOperators +SELECT max(m), sum(s), sum_int(c) +FROM (SELECT sum(b), count(c), max(b) FROM abc GROUP BY a) +AS f(s, c, m) +---- +scalar-group-by + ├── columns: max:7 sum:8 sum_int:9!null + ├── cardinality: [1 - 1] + ├── key: () + ├── fd: ()-->(7-9) + ├── scan abc + │ └── columns: b:2!null + └── aggregations + ├── max [as=max:7, outer=(2)] + │ └── b:2 + ├── sum [as=sum:8, outer=(2)] + │ └── b:2 + └── count-rows [as=sum_int:9] + +# GroupBy on GroupBy case where the inner grouping columns determine the outer +# grouping columns, but they do not intersect. +norm expect=FoldGroupingOperators +SELECT sum(s) FROM (SELECT y, sum(x) AS s FROM xy GROUP BY x) GROUP BY y +---- +project + ├── columns: sum:4!null + └── group-by + ├── columns: y:2 sum:4!null + ├── grouping columns: y:2 + ├── key: (2) + ├── fd: (2)-->(4) + ├── scan xy + │ ├── columns: x:1!null y:2 + │ ├── key: (1) + │ └── fd: (1)-->(2) + └── aggregations + └── sum [as=sum:4, outer=(1)] + └── x:1 + +# GroupBy on GroupBy case with multiple-column grouping. +norm expect=FoldGroupingOperators +SELECT sum(s) FROM (SELECT a, sum(c) AS s FROM abc GROUP BY a, b) GROUP BY a +---- +project + ├── columns: sum:5!null + └── group-by + ├── columns: a:1!null sum:5!null + ├── grouping columns: a:1!null + ├── key: (1) + ├── fd: (1)-->(5) + ├── scan abc + │ └── columns: a:1!null c:3!null + └── aggregations + └── sum [as=sum:5, outer=(3)] + └── c:3 + +# No-op case with an AvgOp. Note: this query actually could be folded if the +# groups were known to be of the same size. 
+norm expect-not=FoldGroupingOperators +SELECT sum(a) FROM (SELECT avg(x) FROM xy GROUP BY y) AS f(a) +---- +scalar-group-by + ├── columns: sum:4 + ├── cardinality: [1 - 1] + ├── key: () + ├── fd: ()-->(4) + ├── group-by + │ ├── columns: y:2 avg:3!null + │ ├── grouping columns: y:2 + │ ├── key: (2) + │ ├── fd: (2)-->(3) + │ ├── scan xy + │ │ ├── columns: x:1!null y:2 + │ │ ├── key: (1) + │ │ └── fd: (1)-->(2) + │ └── aggregations + │ └── avg [as=avg:3, outer=(1)] + │ └── x:1 + └── aggregations + └── sum [as=sum:4, outer=(3)] + └── avg:3 + +# No-op case with several valid aggregate pairs and one invalid pair. +norm expect-not=FoldGroupingOperators +SELECT sum(c), sum(s), max(s) FROM (SELECT sum(x), count(x) FROM xy GROUP BY y) AS f(s, c) +---- +scalar-group-by + ├── columns: sum:5 sum:6 max:7 + ├── cardinality: [1 - 1] + ├── key: () + ├── fd: ()-->(5-7) + ├── group-by + │ ├── columns: y:2 sum:3!null count:4!null + │ ├── grouping columns: y:2 + │ ├── key: (2) + │ ├── fd: (2)-->(3,4) + │ ├── scan xy + │ │ ├── columns: x:1!null y:2 + │ │ ├── key: (1) + │ │ └── fd: (1)-->(2) + │ └── aggregations + │ ├── sum [as=sum:3, outer=(1)] + │ │ └── x:1 + │ └── count-rows [as=count:4] + └── aggregations + ├── sum [as=sum:5, outer=(4)] + │ └── count:4 + ├── sum [as=sum:6, outer=(3)] + │ └── sum:3 + └── max [as=max:7, outer=(3)] + └── sum:3 + +# No-op case because the outer grouping columns are not functionally determined +# by the inner grouping columns in the functional dependencies of the input of +# the inner grouping operator. 
+norm expect-not=FoldGroupingOperators +SELECT max(m) FROM (SELECT max(x) AS m, sum(x) AS s FROM xy GROUP BY y) GROUP BY s +---- +project + ├── columns: max:5!null + └── group-by + ├── columns: sum:4!null max:5!null + ├── grouping columns: sum:4!null + ├── key: (4) + ├── fd: (4)-->(5) + ├── group-by + │ ├── columns: y:2 max:3!null sum:4!null + │ ├── grouping columns: y:2 + │ ├── key: (2) + │ ├── fd: (2)-->(3,4) + │ ├── scan xy + │ │ ├── columns: x:1!null y:2 + │ │ ├── key: (1) + │ │ └── fd: (1)-->(2) + │ └── aggregations + │ ├── max [as=max:3, outer=(1)] + │ │ └── x:1 + │ └── sum [as=sum:4, outer=(1)] + │ └── x:1 + └── aggregations + └── max [as=max:5, outer=(3)] + └── max:3 + +# No-op case because one of the grouping operators has an internal ordering. The +# array_agg ensures that the GroupBy has an internal ordering. +norm expect-not=FoldGroupingOperators +SELECT sum(s) FROM (SELECT sum(z) AS s, array_agg(z) FROM (SELECT * FROM uvwz ORDER BY w DESC) GROUP BY u) +---- +scalar-group-by + ├── columns: sum:8 + ├── cardinality: [1 - 1] + ├── key: () + ├── fd: ()-->(8) + ├── group-by + │ ├── columns: u:1!null sum:6!null + │ ├── grouping columns: u:1!null + │ ├── internal-ordering: -3 opt(1) + │ ├── key: (1) + │ ├── fd: (1)-->(6) + │ ├── sort + │ │ ├── columns: u:1!null w:3!null z:4!null + │ │ ├── ordering: -3 opt(1) [actual: -3] + │ │ └── scan uvwz + │ │ └── columns: u:1!null w:3!null z:4!null + │ └── aggregations + │ └── sum [as=sum:6, outer=(4)] + │ └── z:4 + └── aggregations + └── sum [as=sum:8, outer=(6)] + └── sum:6 diff --git a/pkg/sql/opt/norm/window_funcs.go b/pkg/sql/opt/norm/window_funcs.go index 4cbfe9c32fc5..bd89f92ee47b 100644 --- a/pkg/sql/opt/norm/window_funcs.go +++ b/pkg/sql/opt/norm/window_funcs.go @@ -118,7 +118,7 @@ func (c *CustomFuncs) ExtractDeterminedConditions( ) memo.FiltersExpr { newFilters := make(memo.FiltersExpr, 0, len(filters)) for i := range filters { - if c.IsDeterminedBy(&filters[i], cols, input) { + if 
c.ColsAreDeterminedBy(filters[i].ScalarProps().OuterCols, cols, input) { newFilters = append(newFilters, filters[i]) } } @@ -132,7 +132,7 @@ func (c *CustomFuncs) ExtractUndeterminedConditions( ) memo.FiltersExpr { newFilters := make(memo.FiltersExpr, 0, len(filters)) for i := range filters { - if !c.IsDeterminedBy(&filters[i], cols, input) { + if !c.ColsAreDeterminedBy(filters[i].ScalarProps().OuterCols, cols, input) { newFilters = append(newFilters, filters[i]) } } diff --git a/pkg/sql/opt/operator.go b/pkg/sql/opt/operator.go index a9a73de427df..3de3d2a8cc0b 100644 --- a/pkg/sql/opt/operator.go +++ b/pkg/sql/opt/operator.go @@ -348,6 +348,43 @@ func AggregateIsNeverNull(op Operator) bool { return false } +// AggregatesCanMerge returns true if the given inner and outer operators can be +// replaced with a single equivalent operator, assuming the outer operator is +// aggregating on the inner. In other words, the inner-outer aggregate pair +// forms a valid "decomposition" of a single aggregate. For example, the +// following pairs of queries are equivalent: +// +// SELECT sum(s) FROM (SELECT sum(y) FROM xy GROUP BY x) AS f(s); +// SELECT sum(y) FROM xy; +// +// SELECT sum_int(c) FROM (SELECT count(y) FROM xy GROUP BY x) AS f(c); +// SELECT count(y) FROM xy; +// +// Note: some aggregates like StringAggOp are decomposable in theory, but in +// practice can not be easily merged as in the examples above. +func AggregatesCanMerge(inner, outer Operator) bool { + switch inner { + + case AnyNotNullAggOp, BitAndAggOp, BitOrAggOp, BoolAndOp, + BoolOrOp, ConstAggOp, ConstNotNullAggOp, FirstAggOp, + MaxOp, MinOp, SumOp, SumIntOp, XorAggOp: + return inner == outer + + case CountOp, CountRowsOp: + // Only SumIntOp can be used here because SumOp outputs a decimal value, + // while CountOp and CountRowsOp both output int values. 
+ return outer == SumIntOp + + case ArrayAggOp, AvgOp, ConcatAggOp, CorrOp, JsonAggOp, + JsonbAggOp, PercentileContOp, PercentileDiscOp, SqrDiffOp, + StdDevOp, StringAggOp, VarianceOp: + return false + + default: + panic(errors.AssertionFailedf("unhandled ops: %s, %s", log.Safe(inner), log.Safe(outer))) + } +} + // OpaqueMetadata is an object stored in OpaqueRelExpr and passed // through to the exec factory. type OpaqueMetadata interface { diff --git a/pkg/sql/opt/xform/testdata/external/trading b/pkg/sql/opt/xform/testdata/external/trading index 59e21f3dac6a..82dd2c1bb9cd 100644 --- a/pkg/sql/opt/xform/testdata/external/trading +++ b/pkg/sql/opt/xform/testdata/external/trading @@ -584,9 +584,7 @@ project # could eliminate the join to Cards (because of FK). # 2. InnerJoin can be pushed below GroupBy, which would put the GroupBy as the # input of the ScalarGroupBy. -# 3. ScalarGroupBy Max of a GroupBy Max is just ScalarGroupBy Max. Those two -# would then be collapsed into one. -# 4. Furthermore, the join with the second Cards table could be eliminated, +# 3. Furthermore, the join with the second Cards table could be eliminated, # just as with #1. # opt format=show-stats diff --git a/pkg/sql/opt/xform/testdata/external/trading-mutation b/pkg/sql/opt/xform/testdata/external/trading-mutation index 1483de86905c..778b74f109ee 100644 --- a/pkg/sql/opt/xform/testdata/external/trading-mutation +++ b/pkg/sql/opt/xform/testdata/external/trading-mutation @@ -590,9 +590,7 @@ project # could eliminate the join to Cards (because of FK). # 2. InnerJoin can be pushed below GroupBy, which would put the GroupBy as the # input of the ScalarGroupBy. -# 3. ScalarGroupBy Max of a GroupBy Max is just ScalarGroupBy Max. Those two -# would then be collapsed into one. -# 4. Furthermore, the join with the second Cards table could be eliminated, +# 3. Furthermore, the join with the second Cards table could be eliminated, # just as with #1. # opt format=show-stats