From 0d11fe09c508c9d88e7b5f983dc9ed048104c60e Mon Sep 17 00:00:00 2001 From: hari krishna Date: Mon, 6 Feb 2023 19:24:36 +0530 Subject: [PATCH] [FIXME] Wrong results on main branch for INDF query GPDB_12_MERGE_FIXME: the following comparison query has an ORCA plan that relies on "IS NOT DISTINCT FROM" Hash Join, a variant that we likely have lost during the merge with upstream Postgres 12. Disable ORCA for this query Issue: HASH Join(left, right, inner) removing NULL tuples when there is INDF in query. The field hashqualclauses in HashJoin node holds the expressions "IS NOT DISTINCT FROM" that test for match. This is normally idential to hashclauses but differs in case of non-equijoin comparision. While scanning the HashBucket we were using hashclauses so we missed the expressions with "IS NOT DISTINCT FROM". --- src/backend/executor/nodeHash.c | 14 +- src/test/regress/expected/groupingsets.out | 106 +++++++-------- .../expected/groupingsets_optimizer.out | 106 +++++++-------- src/test/regress/expected/select_distinct.out | 92 ++++++++++++- .../expected/select_distinct_optimizer.out | 128 ++++++++++++++---- src/test/regress/sql/groupingsets.sql | 35 +++-- src/test/regress/sql/select_distinct.sql | 22 ++- 7 files changed, 344 insertions(+), 159 deletions(-) diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c index 8ec9a8d956d..9c17ecbdfe3 100644 --- a/src/backend/executor/nodeHash.c +++ b/src/backend/executor/nodeHash.c @@ -2192,7 +2192,12 @@ bool ExecScanHashBucket(HashState *hashState, HashJoinState *hjstate, ExprContext *econtext) { - ExprState *hjclauses = hjstate->hashclauses; + /* + * Greenplum specific behavior. + * Using hashqualclauses to support hash join on 'IS NOT DISTINCT FROM' + * as well as '='. + */ + ExprState *hjclauses = hjstate->hashqualclauses; HashJoinTable hashtable = hjstate->hj_HashTable; HashJoinTuple hashTuple = hjstate->hj_CurTuple; uint32 hashvalue = hjstate->hj_CurHashValue; @@ -2253,7 +2258,12 @@ bool ExecParallelScanHashBucket(HashState *hashState, HashJoinState *hjstate, ExprContext *econtext) { - ExprState *hjclauses = hjstate->hashclauses; + /* + * Greenplum specific behavior. + * Using hashqualclauses to support hash join on 'IS NOT DISTINCT FROM' + * as well as '='. + */ + ExprState *hjclauses = hjstate->hashqualclauses; HashJoinTable hashtable = hjstate->hj_HashTable; HashJoinTuple hashTuple = hjstate->hj_CurTuple; uint32 hashvalue = hjstate->hj_CurHashValue; diff --git a/src/test/regress/expected/groupingsets.out b/src/test/regress/expected/groupingsets.out index a1075faeafa..cfa38e80113 100644 --- a/src/test/regress/expected/groupingsets.out +++ b/src/test/regress/expected/groupingsets.out @@ -2102,78 +2102,76 @@ set enable_sort = true; set enable_hashagg = false; set jit_above_cost = 0; explain (costs off) -select g100, g10, sum(g::numeric), count(*), max(g::text) -from gs_data_1 group by cube (g1000, g100,g10); +select g1000, g100, g10, sum(g::numeric), count(*), max(g::text) from + (select g%1000 as g1000, g%100 as g100, g%10 as g10, g + from generate_series(0,199999) g) s +group by cube (g1000,g100,g10); QUERY PLAN --------------------------------------------------------------- - Finalize GroupAggregate - Group Key: g1000, g100, g10, (GROUPINGSET_ID()) - -> Gather Motion 3:1 (slice1; segments: 3) - Merge Key: g1000, g100, g10, (GROUPINGSET_ID()) - -> Sort - Sort Key: g1000, g100, g10, (GROUPINGSET_ID()) - -> Partial GroupAggregate - Group Key: g1000, g100, g10 - Group Key: g1000, g100 - Group Key: g1000 - Group Key: () - Sort Key: g100, g10 - Group Key: g100, g10 - Group Key: g100 - Sort Key: g10, g1000 - Group Key: g10, g1000 - Group Key: g10 - -> Sort - Sort Key: g1000, g100, g10 - -> Seq Scan on gs_data_1 +--------------------------------------------------------------- + GroupAggregate + Group Key: ((g.g % 1000)), ((g.g % 100)), ((g.g % 10)) + Group Key: ((g.g % 1000)), ((g.g % 100)) + Group Key: ((g.g % 1000)) + Group Key: () + Sort Key: ((g.g % 100)), ((g.g % 10)) + Group Key: ((g.g % 100)), ((g.g % 10)) + Group Key: ((g.g % 100)) + Sort Key: ((g.g % 10)), ((g.g % 1000)) + Group Key: ((g.g % 10)), ((g.g % 1000)) + Group Key: ((g.g % 10)) + -> Sort + Sort Key: ((g.g % 1000)), ((g.g % 100)), ((g.g % 10)) + -> Function Scan on generate_series g Optimizer: Postgres query optimizer -(21 rows) +(15 rows) create table gs_group_1 as -select g100, g10, sum(g::numeric), count(*), max(g::text) -from gs_data_1 group by cube (g1000, g100,g10); +select g1000, g100, g10, sum(g::numeric), count(*), max(g::text) from + (select g%1000 as g1000, g%100 as g100, g%10 as g10, g + from generate_series(0,199999) g) s +group by cube (g1000,g100,g10) distributed by (g1000); -- Produce results with hash aggregation. set enable_hashagg = true; set enable_sort = false; explain (costs off) -select g100, g10, sum(g::numeric), count(*), max(g::text) -from gs_data_1 group by cube (g1000, g100,g10); +select g1000, g100, g10, sum(g::numeric), count(*), max(g::text) from + (select g%1000 as g1000, g%100 as g100, g%10 as g10, g + from generate_series(0,199999) g) s +group by cube (g1000,g100,g10); QUERY PLAN --------------------------------------------------------------- - Gather Motion 3:1 (slice1; segments: 3) - -> Finalize HashAggregate - Group Key: g1000, g100, g10, (GROUPINGSET_ID()) - -> Redistribute Motion 3:3 (slice2; segments: 3) - Hash Key: g1000, g100, g10, (GROUPINGSET_ID()) - -> Partial MixedAggregate - Hash Key: g1000, g100, g10 - Hash Key: g1000, g100 - Hash Key: g1000 - Hash Key: g100, g10 - Hash Key: g100 - Hash Key: g10, g1000 - Hash Key: g10 - Group Key: () - -> Seq Scan on gs_data_1 +--------------------------------------------------------------- + GroupAggregate + Group Key: ((g.g % 1000)), ((g.g % 100)), ((g.g % 10)) + Group Key: ((g.g % 1000)), ((g.g % 100)) + Group Key: ((g.g % 1000)) + Group Key: () + Sort Key: ((g.g % 100)), ((g.g % 10)) + Group Key: ((g.g % 100)), ((g.g % 10)) + Group Key: ((g.g % 100)) + Sort Key: ((g.g % 10)), ((g.g % 1000)) + Group Key: ((g.g % 10)), ((g.g % 1000)) + Group Key: ((g.g % 10)) + -> Sort + Sort Key: ((g.g % 1000)), ((g.g % 100)), ((g.g % 10)) + -> Function Scan on generate_series g Optimizer: Postgres query optimizer -(16 rows) +(15 rows) create table gs_hash_1 as -select g100, g10, sum(g::numeric), count(*), max(g::text) -from gs_data_1 group by cube (g1000, g100,g10); +select g1000, g100, g10, sum(g::numeric), count(*), max(g::text) from + (select g%1000 as g1000, g%100 as g100, g%10 as g10, g + from generate_series(0,199999) g) s +group by cube (g1000,g100,g10) distributed by (g1000); +set jit_above_cost to default; set enable_sort = true; set work_mem to default; WARNING: "work_mem": setting is deprecated, and may be removed in a future release. --- GPDB_12_MERGE_FIXME: the following comparison query has an ORCA plan that --- relies on "IS NOT DISTINCT FROM" Hash Join, a variant that we likely have --- lost during the merge with upstream Postgres 12. Disable ORCA for this query -SET optimizer TO off; --- Compare results +-- Compare results of ORCA plan that relies on "IS NOT DISTINCT FROM" HASH Join (select * from gs_hash_1 except select * from gs_group_1) union all (select * from gs_group_1 except select * from gs_hash_1); - g100 | g10 | sum | count | max -------+-----+-----+-------+----- + g1000 | g100 | g10 | sum | count | max +-------+------+-----+-----+-------+----- (0 rows) RESET optimizer; diff --git a/src/test/regress/expected/groupingsets_optimizer.out b/src/test/regress/expected/groupingsets_optimizer.out index 24eb0b42bac..3e7a6b2afdd 100644 --- a/src/test/regress/expected/groupingsets_optimizer.out +++ b/src/test/regress/expected/groupingsets_optimizer.out @@ -2248,78 +2248,76 @@ set enable_sort = true; set enable_hashagg = false; set jit_above_cost = 0; explain (costs off) -select g100, g10, sum(g::numeric), count(*), max(g::text) -from gs_data_1 group by cube (g1000, g100,g10); +select g1000, g100, g10, sum(g::numeric), count(*), max(g::text) from + (select g%1000 as g1000, g%100 as g100, g%10 as g10, g + from generate_series(0,199999) g) s +group by cube (g1000,g100,g10); QUERY PLAN --------------------------------------------------------------- - Finalize GroupAggregate - Group Key: g1000, g100, g10, (GROUPINGSET_ID()) - -> Gather Motion 3:1 (slice1; segments: 3) - Merge Key: g1000, g100, g10, (GROUPINGSET_ID()) - -> Sort - Sort Key: g1000, g100, g10, (GROUPINGSET_ID()) - -> Partial GroupAggregate - Group Key: g1000, g100, g10 - Group Key: g1000, g100 - Group Key: g1000 - Group Key: () - Sort Key: g100, g10 - Group Key: g100, g10 - Group Key: g100 - Sort Key: g10, g1000 - Group Key: g10, g1000 - Group Key: g10 - -> Sort - Sort Key: g1000, g100, g10 - -> Seq Scan on gs_data_1 +--------------------------------------------------------------- + GroupAggregate + Group Key: ((g.g % 1000)), ((g.g % 100)), ((g.g % 10)) + Group Key: ((g.g % 1000)), ((g.g % 100)) + Group Key: ((g.g % 1000)) + Group Key: () + Sort Key: ((g.g % 100)), ((g.g % 10)) + Group Key: ((g.g % 100)), ((g.g % 10)) + Group Key: ((g.g % 100)) + Sort Key: ((g.g % 10)), ((g.g % 1000)) + Group Key: ((g.g % 10)), ((g.g % 1000)) + Group Key: ((g.g % 10)) + -> Sort + Sort Key: ((g.g % 1000)), ((g.g % 100)), ((g.g % 10)) + -> Function Scan on generate_series g Optimizer: Postgres query optimizer -(21 rows) +(15 rows) create table gs_group_1 as -select g100, g10, sum(g::numeric), count(*), max(g::text) -from gs_data_1 group by cube (g1000, g100,g10); +select g1000, g100, g10, sum(g::numeric), count(*), max(g::text) from + (select g%1000 as g1000, g%100 as g100, g%10 as g10, g + from generate_series(0,199999) g) s +group by cube (g1000,g100,g10) distributed by (g1000); -- Produce results with hash aggregation. set enable_hashagg = true; set enable_sort = false; explain (costs off) -select g100, g10, sum(g::numeric), count(*), max(g::text) -from gs_data_1 group by cube (g1000, g100,g10); +select g1000, g100, g10, sum(g::numeric), count(*), max(g::text) from + (select g%1000 as g1000, g%100 as g100, g%10 as g10, g + from generate_series(0,199999) g) s +group by cube (g1000,g100,g10); QUERY PLAN --------------------------------------------------------------- - Gather Motion 3:1 (slice1; segments: 3) - -> Finalize HashAggregate - Group Key: g1000, g100, g10, (GROUPINGSET_ID()) - -> Redistribute Motion 3:3 (slice2; segments: 3) - Hash Key: g1000, g100, g10, (GROUPINGSET_ID()) - -> Partial MixedAggregate - Hash Key: g1000, g100, g10 - Hash Key: g1000, g100 - Hash Key: g1000 - Hash Key: g100, g10 - Hash Key: g100 - Hash Key: g10, g1000 - Hash Key: g10 - Group Key: () - -> Seq Scan on gs_data_1 +--------------------------------------------------------------- + GroupAggregate + Group Key: ((g.g % 1000)), ((g.g % 100)), ((g.g % 10)) + Group Key: ((g.g % 1000)), ((g.g % 100)) + Group Key: ((g.g % 1000)) + Group Key: () + Sort Key: ((g.g % 100)), ((g.g % 10)) + Group Key: ((g.g % 100)), ((g.g % 10)) + Group Key: ((g.g % 100)) + Sort Key: ((g.g % 10)), ((g.g % 1000)) + Group Key: ((g.g % 10)), ((g.g % 1000)) + Group Key: ((g.g % 10)) + -> Sort + Sort Key: ((g.g % 1000)), ((g.g % 100)), ((g.g % 10)) + -> Function Scan on generate_series g Optimizer: Postgres query optimizer -(16 rows) +(15 rows) create table gs_hash_1 as -select g100, g10, sum(g::numeric), count(*), max(g::text) -from gs_data_1 group by cube (g1000, g100,g10); +select g1000, g100, g10, sum(g::numeric), count(*), max(g::text) from + (select g%1000 as g1000, g%100 as g100, g%10 as g10, g + from generate_series(0,199999) g) s +group by cube (g1000,g100,g10) distributed by (g1000); +set jit_above_cost to default; set enable_sort = true; set work_mem to default; WARNING: "work_mem": setting is deprecated, and may be removed in a future release. --- GPDB_12_MERGE_FIXME: the following comparison query has an ORCA plan that --- relies on "IS NOT DISTINCT FROM" Hash Join, a variant that we likely have --- lost during the merge with upstream Postgres 12. Disable ORCA for this query -SET optimizer TO off; --- Compare results +-- Compare results of ORCA plan that relies on "IS NOT DISTINCT FROM" HASH Join (select * from gs_hash_1 except select * from gs_group_1) union all (select * from gs_group_1 except select * from gs_hash_1); - g100 | g10 | sum | count | max -------+-----+-----+-------+----- + g1000 | g100 | g10 | sum | count | max +-------+------+-----+-----+-------+----- (0 rows) RESET optimizer; diff --git a/src/test/regress/expected/select_distinct.out b/src/test/regress/expected/select_distinct.out index 21b98c3333e..bcfa5237e29 100644 --- a/src/test/regress/expected/select_distinct.out +++ b/src/test/regress/expected/select_distinct.out @@ -324,10 +324,8 @@ SELECT null IS NOT DISTINCT FROM null as "yes"; (1 row) -- join cases --- test IS DISTINCT FROM and IS NOT DISTINCT FROM join qual. the postgres planner doesn't support hash join --- on IS NOT DISTINCT FROM for now, ORCA support "IS NOT DISTINCT FROM" Hash Join but generates wrong result, --- DISABLE ORCA. Please fix me later if ORCA resolve the problem. -SET optimizer TO off; +-- test IS DISTINCT FROM and IS NOT DISTINCT FROM join qual.The postgres planner doesn't support hash join on +-- IS NOT DISTINCT FROM for now, ORCA supports Hash Join on "IS NOT DISTINCT FROM". CREATE TABLE distinct_1(a int); NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Cloudberry Database data distribution key for this table. HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. @@ -349,6 +347,32 @@ EXPLAIN SELECT * FROM distinct_1, distinct_2 WHERE distinct_1.a IS DISTINCT FROM Optimizer: Postgres query optimizer (8 rows) +EXPLAIN SELECT * FROM distinct_1 left join distinct_2 on distinct_1.a IS DISTINCT FROM distinct_2.a; + QUERY PLAN +--------------------------------------------------------------------------------------------------------- + Gather Motion 3:1 (slice1; segments: 3) (cost=10000000000.00..10214778180.80 rows=9264416310 width=8) + -> Nested Loop Left Join (cost=10000000000.00..10091252630.00 rows=3088138770 width=8) + Join Filter: (distinct_1.a IS DISTINCT FROM distinct_2.a) + -> Seq Scan on distinct_1 (cost=0.00..355.00 rows=32100 width=4) + -> Materialize (cost=0.00..2120.50 rows=96300 width=4) + -> Broadcast Motion 3:3 (slice2; segments: 3) (cost=0.00..1639.00 rows=96300 width=4) + -> Seq Scan on distinct_2 (cost=0.00..355.00 rows=32100 width=4) + Optimizer: Postgres query optimizer +(8 rows) + +EXPLAIN SELECT * FROM distinct_1 right join distinct_2 on distinct_1.a IS DISTINCT FROM distinct_2.a; + QUERY PLAN +--------------------------------------------------------------------------------------------------------- + Gather Motion 3:1 (slice1; segments: 3) (cost=10000000000.00..10214778180.80 rows=9264416310 width=8) + -> Nested Loop Left Join (cost=10000000000.00..10091252630.00 rows=3088138770 width=8) + Join Filter: (distinct_1.a IS DISTINCT FROM distinct_2.a) + -> Seq Scan on distinct_2 (cost=0.00..355.00 rows=32100 width=4) + -> Materialize (cost=0.00..2120.50 rows=96300 width=4) + -> Broadcast Motion 3:3 (slice2; segments: 3) (cost=0.00..1639.00 rows=96300 width=4) + -> Seq Scan on distinct_1 (cost=0.00..355.00 rows=32100 width=4) + Optimizer: Postgres query optimizer +(8 rows) + EXPLAIN SELECT * FROM distinct_1, distinct_2 WHERE distinct_1.a IS NOT DISTINCT FROM distinct_2.a; QUERY PLAN -------------------------------------------------------------------------------------------------------- @@ -362,17 +386,76 @@ EXPLAIN SELECT * FROM distinct_1, distinct_2 WHERE distinct_1.a IS NOT DISTINCT Optimizer: Postgres query optimizer (8 rows) +EXPLAIN SELECT * FROM distinct_1 left join distinct_2 on distinct_1.a IS NOT DISTINCT FROM distinct_2.a; + QUERY PLAN +-------------------------------------------------------------------------------------------------------- + Gather Motion 3:1 (slice1; segments: 3) (cost=10000000000.00..10091376279.20 rows=9273690 width=8) + -> Nested Loop Left Join (cost=10000000000.00..10091252630.00 rows=3091230 width=8) + Join Filter: (NOT (distinct_1.a IS DISTINCT FROM distinct_2.a)) + -> Seq Scan on distinct_1 (cost=0.00..355.00 rows=32100 width=4) + -> Materialize (cost=0.00..2120.50 rows=96300 width=4) + -> Broadcast Motion 3:3 (slice2; segments: 3) (cost=0.00..1639.00 rows=96300 width=4) + -> Seq Scan on distinct_2 (cost=0.00..355.00 rows=32100 width=4) + Optimizer: Postgres query optimizer +(8 rows) + +EXPLAIN SELECT * FROM distinct_1 right join distinct_2 on distinct_1.a IS NOT DISTINCT FROM distinct_2.a; + QUERY PLAN +-------------------------------------------------------------------------------------------------------- + Gather Motion 3:1 (slice1; segments: 3) (cost=10000000000.00..10091376279.20 rows=9273690 width=8) + -> Nested Loop Left Join (cost=10000000000.00..10091252630.00 rows=3091230 width=8) + Join Filter: (NOT (distinct_1.a IS DISTINCT FROM distinct_2.a)) + -> Seq Scan on distinct_2 (cost=0.00..355.00 rows=32100 width=4) + -> Materialize (cost=0.00..2120.50 rows=96300 width=4) + -> Broadcast Motion 3:3 (slice2; segments: 3) (cost=0.00..1639.00 rows=96300 width=4) + -> Seq Scan on distinct_1 (cost=0.00..355.00 rows=32100 width=4) + Optimizer: Postgres query optimizer +(8 rows) + SELECT * FROM distinct_1, distinct_2 WHERE distinct_1.a IS DISTINCT FROM distinct_2.a; a | a ---+--- + 2 | + 2 | 1 + | 1 1 | +(4 rows) + +SELECT * FROM distinct_1 left join distinct_2 on distinct_1.a IS DISTINCT FROM distinct_2.a; + a | a +---+--- + 2 | 2 | 1 + | 1 + 1 | +(4 rows) + +SELECT * FROM distinct_1 right join distinct_2 on distinct_1.a IS DISTINCT FROM distinct_2.a; + a | a +---+--- + 1 | 2 | + 2 | 1 | 1 (4 rows) SELECT * FROM distinct_1, distinct_2 WHERE distinct_1.a IS NOT DISTINCT FROM distinct_2.a; a | a +---+--- + 1 | 1 + | +(2 rows) + +SELECT * FROM distinct_1 left join distinct_2 on distinct_1.a IS NOT DISTINCT FROM distinct_2.a; + a | a +---+--- + 1 | 1 + 2 | + | +(3 rows) + +SELECT * FROM distinct_1 right join distinct_2 on distinct_1.a IS NOT DISTINCT FROM distinct_2.a; + a | a ---+--- | 1 | 1 @@ -380,7 +463,6 @@ SELECT * FROM distinct_1, distinct_2 WHERE distinct_1.a IS NOT DISTINCT FROM dis DROP TABLE distinct_1; DROP TABLE distinct_2; -RESET optimizer; -- gpdb start: test inherit/partition table distinct when gp_statistics_pullup_from_child_partition is on set gp_statistics_pullup_from_child_partition to on; CREATE TABLE sales (id int, date date, amt decimal(10,2)) diff --git a/src/test/regress/expected/select_distinct_optimizer.out b/src/test/regress/expected/select_distinct_optimizer.out index 3579654f986..91b868818c9 100644 --- a/src/test/regress/expected/select_distinct_optimizer.out +++ b/src/test/regress/expected/select_distinct_optimizer.out @@ -331,10 +331,8 @@ SELECT null IS NOT DISTINCT FROM null as "yes"; (1 row) -- join cases --- test IS DISTINCT FROM and IS NOT DISTINCT FROM join qual. the postgres planner doesn't support hash join --- on IS NOT DISTINCT FROM for now, ORCA support "IS NOT DISTINCT FROM" Hash Join but generates wrong result, --- DISABLE ORCA. Please fix me later if ORCA resolve the problem. -SET optimizer TO off; +-- test IS DISTINCT FROM and IS NOT DISTINCT FROM join qual.The postgres planner doesn't support hash join on +-- IS NOT DISTINCT FROM for now, ORCA supports Hash Join on "IS NOT DISTINCT FROM". CREATE TABLE distinct_1(a int); NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Cloudberry Database data distribution key for this table. HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. @@ -344,33 +342,90 @@ HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sur INSERT INTO distinct_1 VALUES(1),(2),(NULL); INSERT INTO distinct_2 VALUES(1),(NULL); EXPLAIN SELECT * FROM distinct_1, distinct_2 WHERE distinct_1.a IS DISTINCT FROM distinct_2.a; - QUERY PLAN ---------------------------------------------------------------------------------------------------------- - Gather Motion 3:1 (slice1; segments: 3) (cost=10000000000.00..10214778180.80 rows=9264416310 width=8) - -> Nested Loop (cost=10000000000.00..10091252630.00 rows=3088138770 width=8) + QUERY PLAN +--------------------------------------------------------------------------------------------- + Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..1324032.54 rows=1 width=8) + -> Nested Loop (cost=0.00..1324032.54 rows=1 width=8) Join Filter: (distinct_1.a IS DISTINCT FROM distinct_2.a) - -> Seq Scan on distinct_1 (cost=0.00..355.00 rows=32100 width=4) - -> Materialize (cost=0.00..2120.50 rows=96300 width=4) - -> Broadcast Motion 3:3 (slice2; segments: 3) (cost=0.00..1639.00 rows=96300 width=4) - -> Seq Scan on distinct_2 (cost=0.00..355.00 rows=32100 width=4) - Optimizer: Postgres query optimizer + -> Broadcast Motion 3:3 (slice2; segments: 3) (cost=0.00..431.00 rows=1 width=4) + -> Seq Scan on distinct_1 (cost=0.00..431.00 rows=1 width=4) + -> Seq Scan on distinct_2 (cost=0.00..431.00 rows=1 width=4) + Optimizer: Pivotal Optimizer (GPORCA) +(7 rows) + +EXPLAIN SELECT * FROM distinct_1 left join distinct_2 on distinct_1.a IS DISTINCT FROM distinct_2.a; + QUERY PLAN +--------------------------------------------------------------------------------------------------- + Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..1324032.55 rows=2 width=8) + -> Nested Loop Left Join (cost=0.00..1324032.55 rows=1 width=8) + Join Filter: (distinct_1.a IS DISTINCT FROM distinct_2.a) + -> Seq Scan on distinct_1 (cost=0.00..431.00 rows=1 width=4) + -> Materialize (cost=0.00..431.00 rows=1 width=4) + -> Broadcast Motion 3:3 (slice2; segments: 3) (cost=0.00..431.00 rows=1 width=4) + -> Seq Scan on distinct_2 (cost=0.00..431.00 rows=1 width=4) + Optimizer: Pivotal Optimizer (GPORCA) (8 rows) -EXPLAIN SELECT * FROM distinct_1, distinct_2 WHERE distinct_1.a IS NOT DISTINCT FROM distinct_2.a; - QUERY PLAN --------------------------------------------------------------------------------------------------------- - Gather Motion 3:1 (slice1; segments: 3) (cost=10000000000.00..10091376279.20 rows=9273690 width=8) - -> Nested Loop (cost=10000000000.00..10091252630.00 rows=3091230 width=8) - Join Filter: (NOT (distinct_1.a IS DISTINCT FROM distinct_2.a)) - -> Seq Scan on distinct_1 (cost=0.00..355.00 rows=32100 width=4) - -> Materialize (cost=0.00..2120.50 rows=96300 width=4) - -> Broadcast Motion 3:3 (slice2; segments: 3) (cost=0.00..1639.00 rows=96300 width=4) - -> Seq Scan on distinct_2 (cost=0.00..355.00 rows=32100 width=4) - Optimizer: Postgres query optimizer +EXPLAIN SELECT * FROM distinct_1 right join distinct_2 on distinct_1.a IS DISTINCT FROM distinct_2.a; + QUERY PLAN +--------------------------------------------------------------------------------------------------- + Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..1324032.55 rows=2 width=8) + -> Nested Loop Left Join (cost=0.00..1324032.55 rows=1 width=8) + Join Filter: (distinct_1.a IS DISTINCT FROM distinct_2.a) + -> Seq Scan on distinct_2 (cost=0.00..431.00 rows=1 width=4) + -> Materialize (cost=0.00..431.00 rows=1 width=4) + -> Broadcast Motion 3:3 (slice2; segments: 3) (cost=0.00..431.00 rows=1 width=4) + -> Seq Scan on distinct_1 (cost=0.00..431.00 rows=1 width=4) + Optimizer: Pivotal Optimizer (GPORCA) (8 rows) +EXPLAIN SELECT * FROM distinct_1, distinct_2 WHERE distinct_1.a IS NOT DISTINCT FROM distinct_2.a; + QUERY PLAN +------------------------------------------------------------------------------ + Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..862.00 rows=1 width=8) + -> Hash Join (cost=0.00..862.00 rows=1 width=8) + Hash Cond: (NOT (distinct_1.a IS DISTINCT FROM distinct_2.a)) + -> Seq Scan on distinct_1 (cost=0.00..431.00 rows=1 width=4) + -> Hash (cost=431.00..431.00 rows=1 width=4) + -> Seq Scan on distinct_2 (cost=0.00..431.00 rows=1 width=4) + Optimizer: Pivotal Optimizer (GPORCA) +(7 rows) + +EXPLAIN SELECT * FROM distinct_1 left join distinct_2 on distinct_1.a IS NOT DISTINCT FROM distinct_2.a; + QUERY PLAN +------------------------------------------------------------------------------ + Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..862.00 rows=2 width=8) + -> Hash Left Join (cost=0.00..862.00 rows=1 width=8) + Hash Cond: (NOT (distinct_1.a IS DISTINCT FROM distinct_2.a)) + -> Seq Scan on distinct_1 (cost=0.00..431.00 rows=1 width=4) + -> Hash (cost=431.00..431.00 rows=1 width=4) + -> Seq Scan on distinct_2 (cost=0.00..431.00 rows=1 width=4) + Optimizer: Pivotal Optimizer (GPORCA) +(7 rows) + +EXPLAIN SELECT * FROM distinct_1 right join distinct_2 on distinct_1.a IS NOT DISTINCT FROM distinct_2.a; + QUERY PLAN +------------------------------------------------------------------------------ + Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..862.00 rows=2 width=8) + -> Hash Left Join (cost=0.00..862.00 rows=1 width=8) + Hash Cond: (NOT (distinct_2.a IS DISTINCT FROM distinct_1.a)) + -> Seq Scan on distinct_2 (cost=0.00..431.00 rows=1 width=4) + -> Hash (cost=431.00..431.00 rows=1 width=4) + -> Seq Scan on distinct_1 (cost=0.00..431.00 rows=1 width=4) + Optimizer: Pivotal Optimizer (GPORCA) +(7 rows) + SELECT * FROM distinct_1, distinct_2 WHERE distinct_1.a IS DISTINCT FROM distinct_2.a; a | a +---+--- + 1 | + 2 | + 2 | 1 + | 1 +(4 rows) + +SELECT * FROM distinct_1 left join distinct_2 on distinct_1.a IS DISTINCT FROM distinct_2.a; + a | a ---+--- 2 | 2 | 1 @@ -378,6 +433,15 @@ SELECT * FROM distinct_1, distinct_2 WHERE distinct_1.a IS DISTINCT FROM distinc 1 | (4 rows) +SELECT * FROM distinct_1 right join distinct_2 on distinct_1.a IS DISTINCT FROM distinct_2.a; + a | a +---+--- + 2 | 1 + | 1 + 2 | + 1 | +(4 rows) + SELECT * FROM distinct_1, distinct_2 WHERE distinct_1.a IS NOT DISTINCT FROM distinct_2.a; a | a ---+--- @@ -385,9 +449,23 @@ SELECT * FROM distinct_1, distinct_2 WHERE distinct_1.a IS NOT DISTINCT FROM dis 1 | 1 (2 rows) +SELECT * FROM distinct_1 left join distinct_2 on distinct_1.a IS NOT DISTINCT FROM distinct_2.a; + a | a +---+--- + 1 | 1 + 2 | + | +(3 rows) + +SELECT * FROM distinct_1 right join distinct_2 on distinct_1.a IS NOT DISTINCT FROM distinct_2.a; + a | a +---+--- + 1 | 1 + | +(2 rows) + DROP TABLE distinct_1; DROP TABLE distinct_2; -RESET optimizer; -- gpdb start: test inherit/partition table distinct when gp_statistics_pullup_from_child_partition is on set gp_statistics_pullup_from_child_partition to on; CREATE TABLE sales (id int, date date, amt decimal(10,2)) diff --git a/src/test/regress/sql/groupingsets.sql b/src/test/regress/sql/groupingsets.sql index 603c2a4a80e..d79815dd523 100644 --- a/src/test/regress/sql/groupingsets.sql +++ b/src/test/regress/sql/groupingsets.sql @@ -593,12 +593,17 @@ set enable_hashagg = false; set jit_above_cost = 0; explain (costs off) -select g100, g10, sum(g::numeric), count(*), max(g::text) -from gs_data_1 group by cube (g1000, g100,g10); +select g1000, g100, g10, sum(g::numeric), count(*), max(g::text) from + (select g%1000 as g1000, g%100 as g100, g%10 as g10, g + from generate_series(0,199999) g) s +group by cube (g1000,g100,g10); create table gs_group_1 as -select g100, g10, sum(g::numeric), count(*), max(g::text) -from gs_data_1 group by cube (g1000, g100,g10); +select g1000, g100, g10, sum(g::numeric), count(*), max(g::text) from + (select g%1000 as g1000, g%100 as g100, g%10 as g10, g + from generate_series(0,199999) g) s +group by cube (g1000,g100,g10) distributed by (g1000); + -- Produce results with hash aggregation. @@ -606,21 +611,23 @@ set enable_hashagg = true; set enable_sort = false; explain (costs off) -select g100, g10, sum(g::numeric), count(*), max(g::text) -from gs_data_1 group by cube (g1000, g100,g10); +select g1000, g100, g10, sum(g::numeric), count(*), max(g::text) from + (select g%1000 as g1000, g%100 as g100, g%10 as g10, g + from generate_series(0,199999) g) s +group by cube (g1000,g100,g10); create table gs_hash_1 as -select g100, g10, sum(g::numeric), count(*), max(g::text) -from gs_data_1 group by cube (g1000, g100,g10); +select g1000, g100, g10, sum(g::numeric), count(*), max(g::text) from + (select g%1000 as g1000, g%100 as g100, g%10 as g10, g + from generate_series(0,199999) g) s +group by cube (g1000,g100,g10) distributed by (g1000); + +set jit_above_cost to default; + set enable_sort = true; set work_mem to default; --- GPDB_12_MERGE_FIXME: the following comparison query has an ORCA plan that --- relies on "IS NOT DISTINCT FROM" Hash Join, a variant that we likely have --- lost during the merge with upstream Postgres 12. Disable ORCA for this query -SET optimizer TO off; - --- Compare results +-- Compare results of ORCA plan that relies on "IS NOT DISTINCT FROM" HASH Join (select * from gs_hash_1 except select * from gs_group_1) union all diff --git a/src/test/regress/sql/select_distinct.sql b/src/test/regress/sql/select_distinct.sql index c5d5b1245d0..27b63e699a0 100644 --- a/src/test/regress/sql/select_distinct.sql +++ b/src/test/regress/sql/select_distinct.sql @@ -139,21 +139,33 @@ SELECT 2 IS NOT DISTINCT FROM null as "no"; SELECT null IS NOT DISTINCT FROM null as "yes"; -- join cases --- test IS DISTINCT FROM and IS NOT DISTINCT FROM join qual. the postgres planner doesn't support hash join --- on IS NOT DISTINCT FROM for now, ORCA support "IS NOT DISTINCT FROM" Hash Join but generates wrong result, --- DISABLE ORCA. Please fix me later if ORCA resolve the problem. -SET optimizer TO off; +-- test IS DISTINCT FROM and IS NOT DISTINCT FROM join qual.The postgres planner doesn't support hash join on +-- IS NOT DISTINCT FROM for now, ORCA supports Hash Join on "IS NOT DISTINCT FROM". + CREATE TABLE distinct_1(a int); CREATE TABLE distinct_2(a int); INSERT INTO distinct_1 VALUES(1),(2),(NULL); INSERT INTO distinct_2 VALUES(1),(NULL); + EXPLAIN SELECT * FROM distinct_1, distinct_2 WHERE distinct_1.a IS DISTINCT FROM distinct_2.a; +EXPLAIN SELECT * FROM distinct_1 left join distinct_2 on distinct_1.a IS DISTINCT FROM distinct_2.a; +EXPLAIN SELECT * FROM distinct_1 right join distinct_2 on distinct_1.a IS DISTINCT FROM distinct_2.a; + EXPLAIN SELECT * FROM distinct_1, distinct_2 WHERE distinct_1.a IS NOT DISTINCT FROM distinct_2.a; +EXPLAIN SELECT * FROM distinct_1 left join distinct_2 on distinct_1.a IS NOT DISTINCT FROM distinct_2.a; +EXPLAIN SELECT * FROM distinct_1 right join distinct_2 on distinct_1.a IS NOT DISTINCT FROM distinct_2.a; + SELECT * FROM distinct_1, distinct_2 WHERE distinct_1.a IS DISTINCT FROM distinct_2.a; +SELECT * FROM distinct_1 left join distinct_2 on distinct_1.a IS DISTINCT FROM distinct_2.a; +SELECT * FROM distinct_1 right join distinct_2 on distinct_1.a IS DISTINCT FROM distinct_2.a; + SELECT * FROM distinct_1, distinct_2 WHERE distinct_1.a IS NOT DISTINCT FROM distinct_2.a; +SELECT * FROM distinct_1 left join distinct_2 on distinct_1.a IS NOT DISTINCT FROM distinct_2.a; +SELECT * FROM distinct_1 right join distinct_2 on distinct_1.a IS NOT DISTINCT FROM distinct_2.a; + DROP TABLE distinct_1; DROP TABLE distinct_2; -RESET optimizer; + -- gpdb start: test inherit/partition table distinct when gp_statistics_pullup_from_child_partition is on set gp_statistics_pullup_from_child_partition to on;