opt/rowexec: support range lookup joins on input columns

Previously, it was possible to perform lookup joins using inequality conditions between index columns and constant values. This commit allows lookup joins to also use inequalities between index columns and input columns. There are restrictions on when an inequality can be used in a lookup join: 1. The left and right sides of the inequality must have identical types. 2. The inequality is between an index column and input column (or constant). 3. If the index column is `DESC` and the inequality is of the form `idxCol < inputCol`, the column type must support `Datum.Prev` without any chance of failing other than for the minimum value for that type. Condition (3) is necessary because when the index column is `DESC`, the `idxCol < inputCol` filter will be used in forming the start key of each span. The spans are expected to be inclusive, so the value of inputCol will have to be decremented to the value that orders immediately before it. Unlike the case of retrieving the next possible key (ex: `ASC` index with `idxCol > inputCol`) it is not possible in general to directly obtain the immediate previous key, because it would have an infinite number of `0xff` bytes appended to it. Thus, we have to use `Datum.Prev` on the inequality bound before adding it to the start key. Additionally, this commit allows lookup joins to be planned without equality filters when the following conditions are met: 1. There is an inequality filter between an index column and an input column that can be used to perform lookups. 2. Either the input has only one row or the join has a LOOKUP hint. These restrictions ensure that planning lookup joins in more cases does not lead to performance regressions, since the current execution logic does not fully de-duplicate spans when inequalities are used. Informs #51576 Release note (performance improvement): The execution engine can now perform lookup joins in more cases. This can significantly improve join performance when there is a large table with an index that conforms to the join ON conditions, as well as allow joins to halt early in the presence of a limit.
cockroachdb · Aug 5, 2022 · 244c88b · 244c88b
1 parent b2bb3e4
commit 244c88b
Show file tree

Hide file tree

Showing 21 changed files with 1,752 additions and 382 deletions.
diff --git a/pkg/sql/logictest/testdata/logic_test/lookup_join b/pkg/sql/logictest/testdata/logic_test/lookup_join
diff --git a/pkg/sql/logictest/testdata/logic_test/lookup_join_spans b/pkg/sql/logictest/testdata/logic_test/lookup_join_spans
@@ -425,7 +425,7 @@ WHERE
   name='cpu'
 ORDER BY value
 ----
-2  2020-01-01 00:01:01 +0000 UTC  -11   4  2  1     cpu
+2  2020-01-01 00:01:01 +0000 UTC  -11  4  2  1  cpu
 
 # Test NULL values in <= unbounded lookup span.
 query ITIIIIT
@@ -452,8 +452,8 @@ WHERE
   name='cpu'
 ORDER BY value
 ----
-2  2020-01-01 00:01:01 +0000 UTC  -11   4  2  1     cpu
-2  2020-01-01 00:01:02 +0000 UTC  -10   5  2  1     cpu
+2  2020-01-01 00:01:01 +0000 UTC  -11  4  2  1  cpu
+2  2020-01-01 00:01:02 +0000 UTC  -10  5  2  1  cpu
 
 # Test NULL values in WHERE equality conditions.
 query ITIIIIT

diff --git a/pkg/sql/opt/exec/execbuilder/testdata/lookup_join b/pkg/sql/opt/exec/execbuilder/testdata/lookup_join
@@ -4,10 +4,10 @@ statement ok
 CREATE TABLE abc (a INT, b INT, c INT, PRIMARY KEY (a, c))
 
 statement ok
-CREATE TABLE def (d INT, e INT, f INT, PRIMARY KEY (f, e))
+CREATE TABLE def (d INT, e INT, f INT, PRIMARY KEY (f, e), INDEX desc_idx (f, e DESC) STORING (d))
 
 statement ok
-CREATE TABLE def_e_decimal (d INT, e DECIMAL, f INT, PRIMARY KEY (f, e))
+CREATE TABLE def_e_decimal (d INT, e DECIMAL, f INT, PRIMARY KEY (f, e), INDEX desc_idx (f, e DESC) STORING (d))
 
 # Set up the statistics as if the first table is much smaller than the second.
 # This will make lookup join into the second table be the best plan.
@@ -96,9 +96,26 @@ vectorized: true
       table: abc@abc_pkey
       spans: /2-
 
-# The filter on 'e' is a contradiction. At the moment, we're handling it as the
-# ON expression (the optimizer should be smart enough to avoid execution
-# altogether, #80402).
+query T
+EXPLAIN (VERBOSE) SELECT * FROM abc JOIN def@desc_idx ON f = b WHERE a > 1 AND e > 1
+----
+distribution: local
+vectorized: true
+·
+• lookup join (inner)
+│ columns: (a, b, c, d, e, f)
+│ estimated row count: 33
+│ table: def@desc_idx
+│ lookup condition: (b = f) AND (e > 1)
+│
+└── • scan
+      columns: (a, b, c)
+      estimated row count: 33 (33% of the table; stats collected <hidden> ago)
+      table: abc@abc_pkey
+      spans: /2-
+
+# The filter on 'e' is a contradiction. The optimizer should be smart enough to
+# avoid execution altogether, #80402.
 query T
 EXPLAIN (VERBOSE) SELECT * FROM abc JOIN def ON f = b WHERE a > 1 AND e > 9223372036854775807
 ----
@@ -109,17 +126,16 @@ vectorized: true
 │ columns: (a, b, c, d, e, f)
 │ estimated row count: 33
 │ table: def@def_pkey
-│ equality: (b) = (f)
-│ pred: e > 9223372036854775807
+│ lookup condition: (b = f) AND (e > 9223372036854775807)
 │
 └── • scan
       columns: (a, b, c)
       estimated row count: 33 (33% of the table; stats collected <hidden> ago)
       table: abc@abc_pkey
       spans: /2-
 
-# Decimals don't support Next / Prev calls, so we handle the filter on 'e' as
-# the ON expression.
+# Decimals don't support Next / Prev calls, but the index column is ASC so
+# we can still use the (e > 1) filter in the lookup condition.
 query T
 EXPLAIN (VERBOSE) SELECT * FROM abc JOIN def_e_decimal ON f = b WHERE a > 1 AND e > 1
 ----
@@ -130,8 +146,48 @@ vectorized: true
 │ columns: (a, b, c, d, e, f)
 │ estimated row count: 33
 │ table: def_e_decimal@def_e_decimal_pkey
+│ lookup condition: (b = f) AND (e > 1)
+│
+└── • scan
+      columns: (a, b, c)
+      estimated row count: 33 (33% of the table; stats collected <hidden> ago)
+      table: abc@abc_pkey
+      spans: /2-
+
+# Decimals don't support Next / Prev calls, but the inequality is inclusive so
+# we can still use the (e >= 1) filter in the lookup condition.
+query T
+EXPLAIN (VERBOSE) SELECT * FROM abc JOIN def_e_decimal@desc_idx ON f = b WHERE a > 1 AND e >= 1
+----
+distribution: local
+vectorized: true
+·
+• lookup join (inner)
+│ columns: (a, b, c, d, e, f)
+│ estimated row count: 33
+│ table: def_e_decimal@desc_idx
+│ lookup condition: (b = f) AND (e >= 1)
+│
+└── • scan
+      columns: (a, b, c)
+      estimated row count: 33 (33% of the table; stats collected <hidden> ago)
+      table: abc@abc_pkey
+      spans: /2-
+
+# Decimals don't support Next / Prev calls and the index column is DESC so
+# we handle the (e < 1) filter in the ON expression.
+query T
+EXPLAIN (VERBOSE) SELECT * FROM abc JOIN def_e_decimal@desc_idx ON f = b WHERE a > 1 AND e < 1
+----
+distribution: local
+vectorized: true
+·
+• lookup join (inner)
+│ columns: (a, b, c, d, e, f)
+│ estimated row count: 33
+│ table: def_e_decimal@desc_idx
 │ equality: (b) = (f)
-│ pred: e > 1
+│ pred: e < 1
 │
 └── • scan
       columns: (a, b, c)
@@ -158,6 +214,7 @@ vectorized: true
       table: abc@abc_pkey
       spans: /2-
 
+# Inclusive inequality referencing an input column.
 query T
 EXPLAIN (VERBOSE) SELECT * FROM abc JOIN def ON f = b WHERE a >= e
 ----
@@ -168,17 +225,36 @@ vectorized: true
 │ columns: (a, b, c, d, e, f)
 │ estimated row count: 33
 │ table: def@def_pkey
-│ equality: (b) = (f)
-│ pred: a >= e
+│ lookup condition: (b = f) AND (a >= e)
+│
+└── • scan
+      columns: (a, b, c)
+      estimated row count: 100 (100% of the table; stats collected <hidden> ago)
+      table: abc@abc_pkey
+      spans: FULL SCAN
+
+# Inclusive inequality on a descending index column referencing an input column.
+query T
+EXPLAIN (VERBOSE) SELECT * FROM abc JOIN def@desc_idx ON f = b WHERE a >= e
+----
+distribution: local
+vectorized: true
+·
+• lookup join (inner)
+│ columns: (a, b, c, d, e, f)
+│ estimated row count: 33
+│ table: def@desc_idx
+│ lookup condition: (b = f) AND (a >= e)
 │
 └── • scan
       columns: (a, b, c)
       estimated row count: 100 (100% of the table; stats collected <hidden> ago)
       table: abc@abc_pkey
       spans: FULL SCAN
 
+# Exclusive inequality referencing an input column.
 query T
-EXPLAIN (VERBOSE) SELECT * FROM abc JOIN def ON f = b AND a >= e
+EXPLAIN (VERBOSE) SELECT * FROM abc JOIN def ON f = b AND a > e
 ----
 distribution: local
 vectorized: true
@@ -187,15 +263,97 @@ vectorized: true
 │ columns: (a, b, c, d, e, f)
 │ estimated row count: 33
 │ table: def@def_pkey
-│ equality: (b) = (f)
-│ pred: a >= e
+│ lookup condition: (b = f) AND (a > e)
+│
+└── • scan
+      columns: (a, b, c)
+      estimated row count: 100 (100% of the table; stats collected <hidden> ago)
+      table: abc@abc_pkey
+      spans: FULL SCAN
+
+# Exclusive inequality on a descending index column referencing an input column.
+query T
+EXPLAIN (VERBOSE) SELECT * FROM abc JOIN def@desc_idx ON f = b AND a > e
+----
+distribution: local
+vectorized: true
+·
+• lookup join (inner)
+│ columns: (a, b, c, d, e, f)
+│ estimated row count: 33
+│ table: def@desc_idx
+│ lookup condition: (b = f) AND (a > e)
 │
 └── • scan
       columns: (a, b, c)
       estimated row count: 100 (100% of the table; stats collected <hidden> ago)
       table: abc@abc_pkey
       spans: FULL SCAN
 
+# Inequality on a decimal column.
+query T
+EXPLAIN (VERBOSE) SELECT * FROM abc JOIN def_e_decimal ON f = b AND a::DECIMAL >= e
+----
+distribution: local
+vectorized: true
+·
+• project
+│ columns: (a, b, c, d, e, f)
+│ estimated row count: 33
+│
+└── • lookup join (inner)
+    │ columns: (column11, a, b, c, d, e, f)
+    │ estimated row count: 33
+    │ table: def_e_decimal@def_e_decimal_pkey
+    │ lookup condition: (b = f) AND (column11 >= e)
+    │
+    └── • render
+        │ columns: (column11, a, b, c)
+        │ estimated row count: 100
+        │ render column11: a::DECIMAL
+        │ render a: a
+        │ render b: b
+        │ render c: c
+        │
+        └── • scan
+              columns: (a, b, c)
+              estimated row count: 100 (100% of the table; stats collected <hidden> ago)
+              table: abc@abc_pkey
+              spans: FULL SCAN
+
+# Inequality on a descending decimal index column.
+query T
+EXPLAIN (VERBOSE) SELECT * FROM abc JOIN def_e_decimal@desc_idx ON f = b AND a::DECIMAL >= e
+----
+distribution: local
+vectorized: true
+·
+• project
+│ columns: (a, b, c, d, e, f)
+│ estimated row count: 33
+│
+└── • lookup join (inner)
+    │ columns: (column11, a, b, c, d, e, f)
+    │ estimated row count: 33
+    │ table: def_e_decimal@desc_idx
+    │ lookup condition: (b = f) AND (column11 >= e)
+    │
+    └── • render
+        │ columns: (column11, a, b, c)
+        │ estimated row count: 100
+        │ render column11: a::DECIMAL
+        │ render a: a
+        │ render b: b
+        │ render c: c
+        │
+        └── • scan
+              columns: (a, b, c)
+              estimated row count: 100 (100% of the table; stats collected <hidden> ago)
+              table: abc@abc_pkey
+              spans: FULL SCAN
+
+# The inequality has to be in the ON condition because the columns are
+# different types.
 query T
 EXPLAIN (VERBOSE) SELECT * FROM abc JOIN def_e_decimal ON f = b AND a >= e
 ----
@@ -215,6 +373,39 @@ vectorized: true
       table: abc@abc_pkey
       spans: FULL SCAN
 
+# The inequality has to be in the ON condition because decimals don't support
+# Prev calls.
+query T
+EXPLAIN (VERBOSE) SELECT * FROM abc JOIN def_e_decimal@desc_idx ON f = b AND a::DECIMAL > e
+----
+distribution: local
+vectorized: true
+·
+• project
+│ columns: (a, b, c, d, e, f)
+│ estimated row count: 33
+│
+└── • lookup join (inner)
+    │ columns: (column11, a, b, c, d, e, f)
+    │ estimated row count: 33
+    │ table: def_e_decimal@desc_idx
+    │ equality: (b) = (f)
+    │ pred: column11 > e
+    │
+    └── • render
+        │ columns: (column11, a, b, c)
+        │ estimated row count: 100
+        │ render column11: a::DECIMAL
+        │ render a: a
+        │ render b: b
+        │ render c: c
+        │
+        └── • scan
+              columns: (a, b, c)
+              estimated row count: 100 (100% of the table; stats collected <hidden> ago)
+              table: abc@abc_pkey
+              spans: FULL SCAN
+
 # Verify a distsql plan.
 statement ok
 CREATE TABLE data (a INT, b INT, c INT, d INT, PRIMARY KEY (a, b, c, d))