Skip to content

Commit

Permalink
opt: make lookup join limit hint consistent with coster, add tests
Browse files Browse the repository at this point in the history
This commit adds a new helper function called lookupJoinInputLimitHint, which
is called by both the coster and the physical props code for limit hint
calculation. This helper function ensures that both places take into account
the batch size of 100, and require that the calculated limit hint is a
multiple of this batch size.

This commit also adds more tests related to costing of limit hints for
scans and lookup joins.

Release note: None
  • Loading branch information
rytaft committed Mar 3, 2020
1 parent 9ac5dbc commit 7e0ba7c
Show file tree
Hide file tree
Showing 6 changed files with 437 additions and 13 deletions.
6 changes: 3 additions & 3 deletions pkg/sql/opt/memo/testdata/memo
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ memo (optimized, ~18KB, required=[presentation: y:2,x:3,c:6] [ordering: +2])
├── G5: (const 10)
├── G6: (plus G11 G12)
├── G7: (project G13 G14 y)
│ ├── [ordering: +2] [limit hint: 10.00]
│ ├── [ordering: +2] [limit hint: 100.00]
│ │ ├── best: (sort G7)
│ │ └── cost: 1119.26
│ ├── [ordering: +5]
Expand All @@ -169,7 +169,7 @@ memo (optimized, ~18KB, required=[presentation: y:2,x:3,c:6] [ordering: +2])
├── G11: (variable y)
├── G12: (const 1)
├── G13: (select G16 G17)
│ ├── [ordering: +2] [limit hint: 10.00]
│ ├── [ordering: +2] [limit hint: 100.00]
│ │ ├── best: (sort G13)
│ │ └── cost: 1112.58
│ └── []
Expand All @@ -178,7 +178,7 @@ memo (optimized, ~18KB, required=[presentation: y:2,x:3,c:6] [ordering: +2])
├── G14: (projections G18)
├── G15: (eq G19 G20)
├── G16: (scan a)
│ ├── [ordering: +2] [limit hint: 30.00]
│ ├── [ordering: +2] [limit hint: 300.00]
│ │ ├── best: (sort G16)
│ │ └── cost: 1259.35
│ └── []
Expand Down
19 changes: 13 additions & 6 deletions pkg/sql/opt/xform/coster.go
Original file line number Diff line number Diff line change
Expand Up @@ -415,12 +415,8 @@ func (c *coster) computeLookupJoinCost(
// expensive lookup join might have a lower cost if its limit hint estimates
// that most rows will not be needed.
if required.LimitHint != 0 {
// Estimate the number of lookups needed to output LimitHint rows.
expectedLookupCount := required.LimitHint * lookupCount / join.Relational().Stats.RowCount

// Round up to the nearest multiple of a batch.
expectedLookupCount = math.Ceil(expectedLookupCount/joinReaderBatchSize) * joinReaderBatchSize
lookupCount = math.Min(lookupCount, expectedLookupCount)
outputRows := join.Relational().Stats.RowCount
lookupCount = lookupJoinInputLimitHint(lookupCount, outputRows, required.LimitHint)
}

// The rows in the (left) input are used to probe into the (right) table.
Expand Down Expand Up @@ -792,3 +788,14 @@ func localityMatchScore(zone cat.Zone, locality roachpb.Locality) float64 {
// Weight the constraintScore twice as much as the lease score.
return (constraintScore*2 + leaseScore) / 3
}

// lookupJoinInputLimitHint calculates an appropriate limit hint for the input
// to a lookup join.
func lookupJoinInputLimitHint(inputRowCount, outputRowCount, outputLimitHint float64) float64 {
// Estimate the number of lookups needed to output LimitHint rows.
expectedLookupCount := outputLimitHint * inputRowCount / outputRowCount

// Round up to the nearest multiple of a batch.
expectedLookupCount = math.Ceil(expectedLookupCount/joinReaderBatchSize) * joinReaderBatchSize
return math.Min(inputRowCount, expectedLookupCount)
}
13 changes: 9 additions & 4 deletions pkg/sql/opt/xform/physical_props.go
Original file line number Diff line number Diff line change
Expand Up @@ -123,10 +123,15 @@ func BuildChildPhysicalProps(
}
if input, ok := parent.Child(nth).(memo.RelExpr); ok {
inputRows := input.Relational().Stats.RowCount
// outputRows / inputRows is roughly the number of output rows produced
// for each input row. Reduce the number of required input rows so that
// the expected number of output rows is equal to the parent limit hint.
childProps.LimitHint = parentProps.LimitHint * inputRows / outputRows
switch parent.Op() {
case opt.SelectOp:
// outputRows / inputRows is roughly the number of output rows produced
// for each input row. Reduce the number of required input rows so that
// the expected number of output rows is equal to the parent limit hint.
childProps.LimitHint = parentProps.LimitHint * inputRows / outputRows
case opt.LookupJoinOp:
childProps.LimitHint = lookupJoinInputLimitHint(inputRows, outputRows, parentProps.LimitHint)
}
}

case opt.OrdinalityOp, opt.ProjectOp, opt.ProjectSetOp:
Expand Down
242 changes: 242 additions & 0 deletions pkg/sql/opt/xform/testdata/coster/join
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,118 @@ inner-join (hash)
└── filters
└── k:1 = x:5 [outer=(1,5), constraints=(/1: (/NULL - ]; /5: (/NULL - ]), fd=(1)==(5), (5)==(1)]

exec-ddl
ALTER TABLE a INJECT STATISTICS '[
{
"columns": ["k"],
"created_at": "2019-02-08 04:10:40.001179+00:00",
"row_count": 100000,
"distinct_count": 100000
}
]'
----

exec-ddl
ALTER TABLE b INJECT STATISTICS '[
{
"columns": ["x"],
"created_at": "2019-02-08 04:10:40.001179+00:00",
"row_count": 10000,
"distinct_count": 1000
}
]'
----

# Lookup join with no limit hint.
opt
SELECT * FROM a JOIN b ON k=z WHERE x > 0 AND x <= 5000
----
inner-join (lookup a)
├── columns: k:1!null i:2 s:3 d:4!null x:5!null z:6!null
├── key columns: [6] = [1]
├── lookup columns are key
├── stats: [rows=10000, distinct(1)=1000, null(1)=0, distinct(6)=1000, null(6)=0]
├── cost: 71400.04
├── fd: (1)-->(2-4), (1)==(6), (6)==(1)
├── select
│ ├── columns: x:5!null z:6!null
│ ├── stats: [rows=10000, distinct(5)=1000, null(5)=0, distinct(6)=1000, null(6)=0]
│ ├── cost: 10600.03
│ ├── scan b
│ │ ├── columns: x:5 z:6!null
│ │ ├── stats: [rows=10000, distinct(5)=1000, null(5)=0, distinct(6)=1000, null(6)=0]
│ │ └── cost: 10500.02
│ └── filters
│ └── (x:5 > 0) AND (x:5 <= 5000) [outer=(5), constraints=(/5: [/1 - /5000]; tight)]
└── filters (true)

# With the limit hint, the cost of the lookup join is reduced.
opt
SELECT * FROM a JOIN b ON k=z WHERE x > 0 AND x <= 5000 LIMIT 6000
----
limit
├── columns: k:1!null i:2 s:3 d:4!null x:5!null z:6!null
├── cardinality: [0 - 6000]
├── stats: [rows=6000]
├── cost: 55460.05
├── fd: (1)-->(2-4), (1)==(6), (6)==(1)
├── inner-join (lookup a)
│ ├── columns: k:1!null i:2 s:3 d:4!null x:5!null z:6!null
│ ├── key columns: [6] = [1]
│ ├── lookup columns are key
│ ├── stats: [rows=10000, distinct(1)=1000, null(1)=0, distinct(6)=1000, null(6)=0]
│ ├── cost: 55400.04
│ ├── fd: (1)-->(2-4), (1)==(6), (6)==(1)
│ ├── limit hint: 6000.00
│ ├── select
│ │ ├── columns: x:5!null z:6!null
│ │ ├── stats: [rows=10000, distinct(5)=1000, null(5)=0, distinct(6)=1000, null(6)=0]
│ │ ├── cost: 10600.03
│ │ ├── limit hint: 6000.00
│ │ ├── scan b
│ │ │ ├── columns: x:5 z:6!null
│ │ │ ├── stats: [rows=10000, distinct(5)=1000, null(5)=0, distinct(6)=1000, null(6)=0]
│ │ │ ├── cost: 10500.02
│ │ │ └── limit hint: 6000.00
│ │ └── filters
│ │ └── (x:5 > 0) AND (x:5 <= 5000) [outer=(5), constraints=(/5: [/1 - /5000]; tight)]
│ └── filters (true)
└── 6000

# The limit hint for the lookup join input will be rounded up to the nearest
# multiple of the batch size, so the cost of the lookup join here is the same
# as the test case above.
opt
SELECT * FROM a JOIN b ON k=z WHERE x > 0 AND x <= 5000 LIMIT 5950
----
limit
├── columns: k:1!null i:2 s:3 d:4!null x:5!null z:6!null
├── cardinality: [0 - 5950]
├── stats: [rows=5950]
├── cost: 55459.55
├── fd: (1)-->(2-4), (1)==(6), (6)==(1)
├── inner-join (lookup a)
│ ├── columns: k:1!null i:2 s:3 d:4!null x:5!null z:6!null
│ ├── key columns: [6] = [1]
│ ├── lookup columns are key
│ ├── stats: [rows=10000, distinct(1)=1000, null(1)=0, distinct(6)=1000, null(6)=0]
│ ├── cost: 55400.04
│ ├── fd: (1)-->(2-4), (1)==(6), (6)==(1)
│ ├── limit hint: 5950.00
│ ├── select
│ │ ├── columns: x:5!null z:6!null
│ │ ├── stats: [rows=10000, distinct(5)=1000, null(5)=0, distinct(6)=1000, null(6)=0]
│ │ ├── cost: 10600.03
│ │ ├── limit hint: 6000.00
│ │ ├── scan b
│ │ │ ├── columns: x:5 z:6!null
│ │ │ ├── stats: [rows=10000, distinct(5)=1000, null(5)=0, distinct(6)=1000, null(6)=0]
│ │ │ ├── cost: 10500.02
│ │ │ └── limit hint: 6000.00
│ │ └── filters
│ │ └── (x:5 > 0) AND (x:5 <= 5000) [outer=(5), constraints=(/5: [/1 - /5000]; tight)]
│ └── filters (true)
└── 5950

exec-ddl
CREATE TABLE abc (a INT PRIMARY KEY, b INT, c INT, INDEX c_idx (c))
Expand Down Expand Up @@ -365,3 +477,133 @@ project
└── filters
├── a:12 = 'foo' [outer=(12), constraints=(/12: [/'foo' - /'foo']; tight), fd=()-->(12)]
└── b:13 = '2ab23800-06b1-4e19-a3bb-df3768b808d2' [outer=(13), constraints=(/13: [/'2ab23800-06b1-4e19-a3bb-df3768b808d2' - /'2ab23800-06b1-4e19-a3bb-df3768b808d2']; tight), fd=()-->(13)]

exec-ddl
DROP TABLE abcde
----

exec-ddl
DROP TABLE wxyz
----

exec-ddl
CREATE TABLE abcde (
a TEXT NOT NULL,
b UUID NOT NULL,
c UUID NOT NULL,
d VARCHAR(255) NOT NULL,
e TEXT NOT NULL,
CONSTRAINT "primary" PRIMARY KEY (a, b, c),
UNIQUE INDEX idx_abd (a, b, d),
UNIQUE INDEX idx_abcd (a, b, c, d)
)
----

exec-ddl
ALTER TABLE abcde INJECT STATISTICS '[
{
"columns": ["a"],
"created_at": "2019-02-08 04:10:40.001179+00:00",
"row_count": 250000,
"distinct_count": 1
},
{
"columns": ["b"],
"created_at": "2019-02-08 04:10:40.119954+00:00",
"row_count": 250000,
"distinct_count": 2
},
{
"columns": ["d"],
"created_at": "2019-02-08 04:10:40.119954+00:00",
"row_count": 250000,
"distinct_count": 125000
}
]'
----

exec-ddl
CREATE TABLE wxyz (
w TEXT NOT NULL,
x UUID NOT NULL,
y UUID NOT NULL,
z TEXT NOT NULL,
CONSTRAINT "primary" PRIMARY KEY (w, x, y),
CONSTRAINT "foreign" FOREIGN KEY (w, x, y) REFERENCES abcde (a, b, c)
)
----

exec-ddl
ALTER TABLE wxyz INJECT STATISTICS '[
{
"columns": ["w"],
"created_at": "2019-02-08 04:10:40.001179+00:00",
"row_count": 10000,
"distinct_count": 1
},
{
"columns": ["x"],
"created_at": "2019-02-08 04:10:40.119954+00:00",
"row_count": 10000,
"distinct_count": 1
},
{
"columns": ["y"],
"created_at": "2019-02-08 04:10:40.119954+00:00",
"row_count": 10000,
"distinct_count": 2500
}
]'
----

# Regression test for #34811. Ensure the soft limit propagation causes us to
# select a lookup join.
opt
SELECT w, x, y, z
FROM wxyz
INNER JOIN abcde
ON w = a AND x = b AND y = c
WHERE w = 'foo' AND x = '2AB23800-06B1-4E19-A3BB-DF3768B808D2'
ORDER BY d
LIMIT 10
----
project
├── columns: w:1!null x:2!null y:3!null z:4!null [hidden: d:8!null]
├── cardinality: [0 - 10]
├── stats: [rows=10]
├── cost: 104717.922
├── key: (8)
├── fd: ()-->(1,2), (3)-->(4,8), (8)-->(3,4)
├── ordering: +8 opt(1,2) [actual: +8]
└── limit
├── columns: w:1!null x:2!null y:3!null z:4!null a:5!null b:6!null c:7!null d:8!null
├── internal-ordering: +8 opt(1,2,5,6)
├── cardinality: [0 - 10]
├── stats: [rows=10]
├── cost: 104717.812
├── key: (7)
├── fd: ()-->(1,2,5,6), (3)-->(4), (7)-->(8), (8)-->(7), (1)==(5), (5)==(1), (2)==(6), (6)==(2), (3)==(7), (7)==(3)
├── ordering: +8 opt(1,2,5,6) [actual: +8]
├── inner-join (lookup wxyz)
│ ├── columns: w:1!null x:2!null y:3!null z:4!null a:5!null b:6!null c:7!null d:8!null
│ ├── key columns: [5 6 7] = [1 2 3]
│ ├── lookup columns are key
│ ├── stats: [rows=50048.8759, distinct(1)=1, null(1)=0, distinct(2)=1, null(2)=0, distinct(3)=2500, null(3)=0, distinct(5)=1, null(5)=0, distinct(6)=1, null(6)=0, distinct(7)=2500, null(7)=0]
│ ├── cost: 104717.702
│ ├── key: (7)
│ ├── fd: ()-->(1,2,5,6), (3)-->(4), (7)-->(8), (8)-->(7), (1)==(5), (5)==(1), (2)==(6), (6)==(2), (3)==(7), (7)==(3)
│ ├── ordering: +8 opt(1,2,5,6) [actual: +8]
│ ├── limit hint: 10.00
│ ├── scan abcde@idx_abd
│ │ ├── columns: a:5!null b:6!null c:7!null d:8!null
│ │ ├── constraint: /5/6/8: [/'foo'/'2ab23800-06b1-4e19-a3bb-df3768b808d2' - /'foo'/'2ab23800-06b1-4e19-a3bb-df3768b808d2']
│ │ ├── stats: [rows=125000, distinct(5)=1, null(5)=0, distinct(6)=1, null(6)=0, distinct(7)=24975.5859, null(7)=0, distinct(8)=93750, null(8)=0]
│ │ ├── cost: 216.01
│ │ ├── key: (7)
│ │ ├── fd: ()-->(5,6), (7)-->(8), (8)-->(7)
│ │ ├── ordering: +8 opt(5,6) [actual: +8]
│ │ └── limit hint: 100.00
│ └── filters
│ ├── w:1 = 'foo' [outer=(1), constraints=(/1: [/'foo' - /'foo']; tight), fd=()-->(1)]
│ └── x:2 = '2ab23800-06b1-4e19-a3bb-df3768b808d2' [outer=(2), constraints=(/2: [/'2ab23800-06b1-4e19-a3bb-df3768b808d2' - /'2ab23800-06b1-4e19-a3bb-df3768b808d2']; tight), fd=()-->(2)]
└── 10
Loading

0 comments on commit 7e0ba7c

Please sign in to comment.