Skip to content

Commit

Permalink
Merge #26597
Browse files Browse the repository at this point in the history
26597: opt: push limit through index join r=RaduBerinde a=RaduBerinde

Push limits through index joins. This is important for cases like
```
SELECT * FROM t ORDER BY w LIMIT 5
```
where it's much better to index-join 5 rows than to read the entire
table and sort.

Unfortunately this requires more acrobatics during exec build (we have
to move the limit above the indexJoinNode). This is the reason why I
haven't yet implemented the same rule for Offset; I'm hoping that case
is less important and can wait until we have a better lookup-join
execution path.

Release note: None

@rytaft - I realize this conflicts heavily with your outstanding PRs; I plan to wait for them to merge and then rebase my changes.

Co-authored-by: Radu Berinde <radu@cockroachlabs.com>
  • Loading branch information
craig[bot] and RaduBerinde committed Jun 15, 2018
2 parents db47b3f + 336719c commit 718f6bd
Show file tree
Hide file tree
Showing 12 changed files with 427 additions and 178 deletions.
36 changes: 36 additions & 0 deletions pkg/sql/logictest/testdata/logic_test/select_index
Original file line number Diff line number Diff line change
Expand Up @@ -541,3 +541,39 @@ SELECT w FROM t3 WHERE v > 0 AND v < 100 ORDER BY v
3
2
1

statement ok
CREATE TABLE tab1 (
pk INTEGER NOT NULL,
col0 INTEGER NULL,
col1 FLOAT NULL,
col2 STRING NULL,
col3 INTEGER NULL,
col4 FLOAT NULL,
col5 STRING NULL,
CONSTRAINT "primary" PRIMARY KEY (pk ASC),
INDEX idx_tab1_0 (col0 ASC),
INDEX idx_tab1_1 (col1 ASC),
INDEX idx_tab1_3 (col3 ASC),
INDEX idx_tab1_4 (col4 ASC),
FAMILY "primary" (pk, col0, col1, col2, col3, col4, col5)
)

statement ok
INSERT INTO tab1(pk, col0, col3) VALUES
(1, 65, 65),
(2, 87, 87),
(3, 70, 70),
(4, 88, 88),
(5, 69, 69),
(6, 72, 72),
(7, 82, 82)

query II
SELECT pk, col0 FROM tab1 WHERE (col3 BETWEEN 66 AND 87) ORDER BY 1 DESC
----
7 82
6 72
5 69
3 70
2 87
30 changes: 24 additions & 6 deletions pkg/sql/opt/exec/execbuilder/relational_builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -546,12 +546,21 @@ func (b *Builder) buildRowNumber(ev memo.ExprView) (execPlan, error) {
}

func (b *Builder) buildLookupJoin(ev memo.ExprView) (execPlan, error) {
// If lookup join child is a sort operator, then flip the order so that the
// sort is on top of the lookup join.
var err error
// If lookup join child is a limit and/or sort operator then flip the order so
// that the sort/limit is on top of the lookup join.
// TODO(radu): Remove this code once we have support for a more general
// lookup join operator.
var limit tree.TypedExpr
var ordering props.Ordering
child := ev.Child(0)
if child.Operator() == opt.LimitOp {
limit, err = b.buildScalar(nil, child.Child(1))
if err != nil {
return execPlan{}, err
}
child = child.Child(0)
}
if child.Operator() == opt.SortOp {
ordering = child.Physical().Ordering
child = child.Child(0)
Expand All @@ -577,14 +586,23 @@ func (b *Builder) buildLookupJoin(ev memo.ExprView) (execPlan, error) {
reqOrder = b.makeSQLOrdering(res, ev)
}

node, err := b.factory.ConstructIndexJoin(input.root, md.Table(def.Table), needed, reqOrder)
res.root, err = b.factory.ConstructIndexJoin(input.root, md.Table(def.Table), needed, reqOrder)
if err != nil {
return execPlan{}, err
}

res.root = node
if ordering != nil {
return b.buildSortedInput(res, ordering)
res, err = b.buildSortedInput(res, ordering)
if err != nil {
return execPlan{}, err
}
}

if limit != nil {
var err error
res.root, err = b.factory.ConstructLimit(res.root, limit, nil)
if err != nil {
return execPlan{}, err
}
}
return res, nil
}
Expand Down
47 changes: 47 additions & 0 deletions pkg/sql/opt/exec/execbuilder/testdata/limit
Original file line number Diff line number Diff line change
Expand Up @@ -127,3 +127,50 @@ filter · · (k) ·
· table t@t_v_idx · ·
· spans ALL · ·
· limit 5 · ·

query TTTTT
EXPLAIN (VERBOSE) SELECT k, w FROM t WHERE v >= 1 AND v <= 100 LIMIT 10
----
render · · (k, w) ·
│ render 0 k · ·
│ render 1 w · ·
└── index-join · · (k, v, w) ·
├── scan · · (k, v) ·
│ table t@t_v_idx · ·
│ spans /1-/101 · ·
│ limit 10 · ·
└── scan · · (k, v, w) ·
· table t@primary · ·

query TTTTT
EXPLAIN (VERBOSE) SELECT k, w FROM t WHERE v >= 1 AND v <= 100 ORDER BY v LIMIT 10
----
render · · (k, w) ·
│ render 0 k · ·
│ render 1 w · ·
└── index-join · · (k, v, w) +v
├── scan · · (k, v) +v
│ table t@t_v_idx · ·
│ spans /1-/101 · ·
│ limit 10 · ·
└── scan · · (k, v, w) ·
· table t@primary · ·

query TTTTT
EXPLAIN (VERBOSE) SELECT k, w FROM (SELECT * FROM t WHERE v >= 1 AND v <= 100 ORDER BY k LIMIT 10) ORDER BY v
----
render · · (k, w) ·
│ render 0 k · ·
│ render 1 w · ·
└── sort · · (k, v, w) +v
│ order +v · ·
└── limit · · (k, v, w) +k
│ count 10 · ·
└── sort · · (k, v, w) +k
│ order +k · ·
└── index-join · · (k, v, w) ·
├── scan · · (k, v) ·
│ table t@t_v_idx · ·
│ spans /1-/101 · ·
└── scan · · (k, v, w) ·
· table t@primary · ·
64 changes: 48 additions & 16 deletions pkg/sql/opt/exec/execbuilder/testdata/select_index
Original file line number Diff line number Diff line change
Expand Up @@ -572,18 +572,20 @@ INSERT INTO test2(k)
# Plan check:
# The query is using an index-join and the limit is propagated to the scan.

# TODO(radu): need revscan, index-join, and limit pushdown
# TODO(radu): need revscan.
query TTTTT
EXPLAIN (VERBOSE) SELECT * FROM test2 WHERE k <= '100' ORDER BY k DESC LIMIT 20
----
limit · · (id, k, v) -k
│ count 20 · ·
└── sort · · (id, k, v) -k
│ order -k · ·
└── scan · · (id, k, v) ·
· table test2@primary · ·
· spans ALL · ·
· filter k <= '100' · ·
limit · · (id, k, v) -k
│ count 20 · ·
└── sort · · (id, k, v) -k
│ order -k · ·
└── index-join · · (id, k, v) ·
├── scan · · (id, k) ·
│ table test2@test2_k_key · ·
│ spans /!NULL-/"100"/PrefixEnd · ·
└── scan · · (id, k, v) ·
· table test2@primary · ·

# Result check: The following query must not issue more than the
# requested LIMIT K/V reads, even though an index join batches 100
Expand All @@ -596,6 +598,36 @@ SELECT regexp_replace(message, '\d\d\d\d\d+', '...PK...')
FROM [SHOW KV TRACE FOR SELECT * FROM test2 WHERE k <= '100' ORDER BY k DESC LIMIT 20]
WHERE message LIKE 'fetched:%'
----
fetched: /test2/test2_k_key/'001' -> /...PK...
fetched: /test2/test2_k_key/'002' -> /...PK...
fetched: /test2/test2_k_key/'003' -> /...PK...
fetched: /test2/test2_k_key/'004' -> /...PK...
fetched: /test2/test2_k_key/'005' -> /...PK...
fetched: /test2/test2_k_key/'006' -> /...PK...
fetched: /test2/test2_k_key/'007' -> /...PK...
fetched: /test2/test2_k_key/'008' -> /...PK...
fetched: /test2/test2_k_key/'009' -> /...PK...
fetched: /test2/test2_k_key/'010' -> /...PK...
fetched: /test2/test2_k_key/'011' -> /...PK...
fetched: /test2/test2_k_key/'012' -> /...PK...
fetched: /test2/test2_k_key/'013' -> /...PK...
fetched: /test2/test2_k_key/'014' -> /...PK...
fetched: /test2/test2_k_key/'015' -> /...PK...
fetched: /test2/test2_k_key/'016' -> /...PK...
fetched: /test2/test2_k_key/'017' -> /...PK...
fetched: /test2/test2_k_key/'018' -> /...PK...
fetched: /test2/test2_k_key/'019' -> /...PK...
fetched: /test2/test2_k_key/'020' -> /...PK...
fetched: /test2/test2_k_key/'021' -> /...PK...
fetched: /test2/test2_k_key/'022' -> /...PK...
fetched: /test2/test2_k_key/'023' -> /...PK...
fetched: /test2/test2_k_key/'024' -> /...PK...
fetched: /test2/test2_k_key/'025' -> /...PK...
fetched: /test2/test2_k_key/'026' -> /...PK...
fetched: /test2/test2_k_key/'027' -> /...PK...
fetched: /test2/test2_k_key/'028' -> /...PK...
fetched: /test2/test2_k_key/'029' -> /...PK...
fetched: /test2/test2_k_key/'030' -> /...PK...
fetched: /test2/primary/...PK.../k/v -> /'001'/42
fetched: /test2/primary/...PK.../k/v -> /'002'/42
fetched: /test2/primary/...PK.../k/v -> /'003'/42
Expand Down Expand Up @@ -992,16 +1024,16 @@ sort · ·
· table noncover@primary
· spans ALL

# TODO(radu): need to prefer the order-matching index
query TTT
EXPLAIN SELECT * FROM noncover ORDER BY c LIMIT 5
----
limit · ·
└── sort · ·
│ order +c
└── scan · ·
· table noncover@primary
· spans ALL
index-join · ·
├── scan · ·
│ table noncover@c
│ spans ALL
│ limit 5
└── scan · ·
· table noncover@primary

query TTT
SELECT "Tree", "Field", "Description" FROM [
Expand Down
7 changes: 6 additions & 1 deletion pkg/sql/opt/memo/cost.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,12 @@ var MaxCost = Cost(math.Inf(+1))

// Less returns true if this cost is lower than the given cost by more than a
// small tolerance. Costs that differ by less than the tolerance are treated as
// equal, so neither one is Less than the other.
func (c Cost) Less(other Cost) bool {
	// Two plans with the same cost can have slightly different floating point
	// results (e.g. same subcosts being added up in a different order). So we
	// treat plans with very similar cost as equal.
	const epsilon = 1e-5

	// The comparison is strict so that a cost is never Less than itself, even
	// when other-epsilon rounds back to other (very large or infinite costs,
	// such as MaxCost).
	return c < other-epsilon
}

// Sub subtracts the other cost from this cost and returns the result.
Expand Down
10 changes: 10 additions & 0 deletions pkg/sql/opt/memo/private_defs.go
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,16 @@ type LookupJoinDef struct {
LookupCols opt.ColSet
}

// IsIndexJoin reports whether the lookup join is an index join, i.e. whether
// the input columns come from the same table as the lookup columns. In that
// special case there is always a 1:1 relationship between input and output
// rows.
func (l *LookupJoinDef) IsIndexJoin(md *opt.Metadata) bool {
	// For an index join the input and index key column sets are the same;
	// otherwise they are always disjoint. Checking the first column of each is
	// therefore enough to tell the two cases apart.
	firstIndexCol := md.Table(l.Table).Index(l.Index).Column(0)
	firstIndexColID := md.TableColumn(l.Table, firstIndexCol.Ordinal)
	return firstIndexColID == l.KeyCols[0]
}

// ExplainOpDef defines the value of the Def private field of the Explain operator.
type ExplainOpDef struct {
Options tree.ExplainOptions
Expand Down
4 changes: 2 additions & 2 deletions pkg/sql/opt/rule_name_string.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

27 changes: 27 additions & 0 deletions pkg/sql/opt/xform/custom_funcs.go
Original file line number Diff line number Diff line change
Expand Up @@ -297,3 +297,30 @@ func (c *CustomFuncs) LimitScanDef(def, limit memo.PrivateID) memo.PrivateID {
defCopy.HardLimit = int64(*c.e.mem.LookupPrivate(limit).(*tree.DInt))
return c.e.mem.InternScanOpDef(&defCopy)
}

// HasOrderingCols reports whether every column referenced by the given
// ordering is among the output columns produced by the input group.
func (c *CustomFuncs) HasOrderingCols(input memo.GroupID, ordering memo.PrivateID) bool {
	available := c.OutputCols(input)
	for _, orderCol := range c.ExtractOrdering(ordering) {
		if colID := int(orderCol.ID()); !available.Contains(colID) {
			// At least one ordering column is not produced by the input.
			return false
		}
	}
	return true
}

// OneResultPerInput reports whether the LookupJoinOp described by the given
// private definition always produces exactly one row for each input row.
//
// Currently this holds only when the lookup join is an index join.
//
// TODO(radu): in general, we need to check if the keyCols are associated with
// a foreign key constraint.
//
// TODO(radu): we also know to have exactly one result per input if we have a
// left lookup join and the KeyCols form a key in the table.
func (c *CustomFuncs) OneResultPerInput(def memo.PrivateID) bool {
	joinDef := c.e.mem.LookupPrivate(def).(*memo.LookupJoinDef)
	md := c.e.mem.Metadata()
	return joinDef.IsIndexJoin(md)
}
19 changes: 19 additions & 0 deletions pkg/sql/opt/xform/rules/limit.opt
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,22 @@
(Scan
(LimitScanDef $def $limit)
)

# PushLimitIntoLookupJoin pushes a limit through a lookup join, if we know that
# the lookup join produces exactly one row per input row (most importantly,
# index join). The rule only matches when the join's input produces every
# column in the limit's ordering (HasOrderingCols), because the rewritten Limit
# is applied directly to that input.
# TODO(radu): we can similarly push Offset too.
[PushLimitIntoLookupJoin, Explore]
(Limit
(LookupJoin
$input:*
$def:* & (OneResultPerInput $def)
)
$limit:*
$ordering:* & (HasOrderingCols $input $ordering)
)
=>
(LookupJoin
(Limit $input $limit $ordering)
$def
)
Loading

0 comments on commit 718f6bd

Please sign in to comment.