diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c index 8ec9a8d956d..9c17ecbdfe3 100644 --- a/src/backend/executor/nodeHash.c +++ b/src/backend/executor/nodeHash.c @@ -2192,7 +2192,12 @@ bool ExecScanHashBucket(HashState *hashState, HashJoinState *hjstate, ExprContext *econtext) { - ExprState *hjclauses = hjstate->hashclauses; + /* + * Greenplum specific behavior. + * Using hashqualclauses to support hash join on 'IS NOT DISTINCT FROM' + * as well as '='. + */ + ExprState *hjclauses = hjstate->hashqualclauses; HashJoinTable hashtable = hjstate->hj_HashTable; HashJoinTuple hashTuple = hjstate->hj_CurTuple; uint32 hashvalue = hjstate->hj_CurHashValue; @@ -2253,7 +2258,12 @@ bool ExecParallelScanHashBucket(HashState *hashState, HashJoinState *hjstate, ExprContext *econtext) { - ExprState *hjclauses = hjstate->hashclauses; + /* + * Greenplum specific behavior. + * Using hashqualclauses to support hash join on 'IS NOT DISTINCT FROM' + * as well as '='. + */ + ExprState *hjclauses = hjstate->hashqualclauses; HashJoinTable hashtable = hjstate->hj_HashTable; HashJoinTuple hashTuple = hjstate->hj_CurTuple; uint32 hashvalue = hjstate->hj_CurHashValue; diff --git a/src/test/regress/expected/groupingsets.out b/src/test/regress/expected/groupingsets.out index a1075faeafa..cfa38e80113 100644 --- a/src/test/regress/expected/groupingsets.out +++ b/src/test/regress/expected/groupingsets.out @@ -2102,78 +2102,76 @@ set enable_sort = true; set enable_hashagg = false; set jit_above_cost = 0; explain (costs off) -select g100, g10, sum(g::numeric), count(*), max(g::text) -from gs_data_1 group by cube (g1000, g100,g10); +select g1000, g100, g10, sum(g::numeric), count(*), max(g::text) from + (select g%1000 as g1000, g%100 as g100, g%10 as g10, g + from generate_series(0,199999) g) s +group by cube (g1000,g100,g10); QUERY PLAN --------------------------------------------------------------- - Finalize GroupAggregate - Group Key: g1000, g100, g10, (GROUPINGSET_ID()) - -> Gather Motion 3:1 (slice1; segments: 3) - Merge Key: g1000, g100, g10, (GROUPINGSET_ID()) - -> Sort - Sort Key: g1000, g100, g10, (GROUPINGSET_ID()) - -> Partial GroupAggregate - Group Key: g1000, g100, g10 - Group Key: g1000, g100 - Group Key: g1000 - Group Key: () - Sort Key: g100, g10 - Group Key: g100, g10 - Group Key: g100 - Sort Key: g10, g1000 - Group Key: g10, g1000 - Group Key: g10 - -> Sort - Sort Key: g1000, g100, g10 - -> Seq Scan on gs_data_1 +--------------------------------------------------------------- + GroupAggregate + Group Key: ((g.g % 1000)), ((g.g % 100)), ((g.g % 10)) + Group Key: ((g.g % 1000)), ((g.g % 100)) + Group Key: ((g.g % 1000)) + Group Key: () + Sort Key: ((g.g % 100)), ((g.g % 10)) + Group Key: ((g.g % 100)), ((g.g % 10)) + Group Key: ((g.g % 100)) + Sort Key: ((g.g % 10)), ((g.g % 1000)) + Group Key: ((g.g % 10)), ((g.g % 1000)) + Group Key: ((g.g % 10)) + -> Sort + Sort Key: ((g.g % 1000)), ((g.g % 100)), ((g.g % 10)) + -> Function Scan on generate_series g Optimizer: Postgres query optimizer -(21 rows) +(15 rows) create table gs_group_1 as -select g100, g10, sum(g::numeric), count(*), max(g::text) -from gs_data_1 group by cube (g1000, g100,g10); +select g1000, g100, g10, sum(g::numeric), count(*), max(g::text) from + (select g%1000 as g1000, g%100 as g100, g%10 as g10, g + from generate_series(0,199999) g) s +group by cube (g1000,g100,g10) distributed by (g1000); -- Produce results with hash aggregation. set enable_hashagg = true; set enable_sort = false; explain (costs off) -select g100, g10, sum(g::numeric), count(*), max(g::text) -from gs_data_1 group by cube (g1000, g100,g10); +select g1000, g100, g10, sum(g::numeric), count(*), max(g::text) from + (select g%1000 as g1000, g%100 as g100, g%10 as g10, g + from generate_series(0,199999) g) s +group by cube (g1000,g100,g10); QUERY PLAN --------------------------------------------------------------- - Gather Motion 3:1 (slice1; segments: 3) - -> Finalize HashAggregate - Group Key: g1000, g100, g10, (GROUPINGSET_ID()) - -> Redistribute Motion 3:3 (slice2; segments: 3) - Hash Key: g1000, g100, g10, (GROUPINGSET_ID()) - -> Partial MixedAggregate - Hash Key: g1000, g100, g10 - Hash Key: g1000, g100 - Hash Key: g1000 - Hash Key: g100, g10 - Hash Key: g100 - Hash Key: g10, g1000 - Hash Key: g10 - Group Key: () - -> Seq Scan on gs_data_1 +--------------------------------------------------------------- + GroupAggregate + Group Key: ((g.g % 1000)), ((g.g % 100)), ((g.g % 10)) + Group Key: ((g.g % 1000)), ((g.g % 100)) + Group Key: ((g.g % 1000)) + Group Key: () + Sort Key: ((g.g % 100)), ((g.g % 10)) + Group Key: ((g.g % 100)), ((g.g % 10)) + Group Key: ((g.g % 100)) + Sort Key: ((g.g % 10)), ((g.g % 1000)) + Group Key: ((g.g % 10)), ((g.g % 1000)) + Group Key: ((g.g % 10)) + -> Sort + Sort Key: ((g.g % 1000)), ((g.g % 100)), ((g.g % 10)) + -> Function Scan on generate_series g Optimizer: Postgres query optimizer -(16 rows) +(15 rows) create table gs_hash_1 as -select g100, g10, sum(g::numeric), count(*), max(g::text) -from gs_data_1 group by cube (g1000, g100,g10); +select g1000, g100, g10, sum(g::numeric), count(*), max(g::text) from + (select g%1000 as g1000, g%100 as g100, g%10 as g10, g + from generate_series(0,199999) g) s +group by cube (g1000,g100,g10) distributed by (g1000); +set jit_above_cost to default; set enable_sort = true; set work_mem to default; WARNING: "work_mem": setting is deprecated, and may be removed in a future release. --- GPDB_12_MERGE_FIXME: the following comparison query has an ORCA plan that --- relies on "IS NOT DISTINCT FROM" Hash Join, a variant that we likely have --- lost during the merge with upstream Postgres 12. Disable ORCA for this query -SET optimizer TO off; --- Compare results +-- Compare results of ORCA plan that relies on "IS NOT DISTINCT FROM" HASH Join (select * from gs_hash_1 except select * from gs_group_1) union all (select * from gs_group_1 except select * from gs_hash_1); - g100 | g10 | sum | count | max -------+-----+-----+-------+----- + g1000 | g100 | g10 | sum | count | max +-------+------+-----+-----+-------+----- (0 rows) RESET optimizer; diff --git a/src/test/regress/expected/groupingsets_optimizer.out b/src/test/regress/expected/groupingsets_optimizer.out index 24eb0b42bac..3e7a6b2afdd 100644 --- a/src/test/regress/expected/groupingsets_optimizer.out +++ b/src/test/regress/expected/groupingsets_optimizer.out @@ -2248,78 +2248,76 @@ set enable_sort = true; set enable_hashagg = false; set jit_above_cost = 0; explain (costs off) -select g100, g10, sum(g::numeric), count(*), max(g::text) -from gs_data_1 group by cube (g1000, g100,g10); +select g1000, g100, g10, sum(g::numeric), count(*), max(g::text) from + (select g%1000 as g1000, g%100 as g100, g%10 as g10, g + from generate_series(0,199999) g) s +group by cube (g1000,g100,g10); QUERY PLAN --------------------------------------------------------------- - Finalize GroupAggregate - Group Key: g1000, g100, g10, (GROUPINGSET_ID()) - -> Gather Motion 3:1 (slice1; segments: 3) - Merge Key: g1000, g100, g10, (GROUPINGSET_ID()) - -> Sort - Sort Key: g1000, g100, g10, (GROUPINGSET_ID()) - -> Partial GroupAggregate - Group Key: g1000, g100, g10 - Group Key: g1000, g100 - Group Key: g1000 - Group Key: () - Sort Key: g100, g10 - Group Key: g100, g10 - Group Key: g100 - Sort Key: g10, g1000 - Group Key: g10, g1000 - Group Key: g10 - -> Sort - Sort Key: g1000, g100, g10 - -> Seq Scan on gs_data_1 +--------------------------------------------------------------- + GroupAggregate + Group Key: ((g.g % 1000)), ((g.g % 100)), ((g.g % 10)) + Group Key: ((g.g % 1000)), ((g.g % 100)) + Group Key: ((g.g % 1000)) + Group Key: () + Sort Key: ((g.g % 100)), ((g.g % 10)) + Group Key: ((g.g % 100)), ((g.g % 10)) + Group Key: ((g.g % 100)) + Sort Key: ((g.g % 10)), ((g.g % 1000)) + Group Key: ((g.g % 10)), ((g.g % 1000)) + Group Key: ((g.g % 10)) + -> Sort + Sort Key: ((g.g % 1000)), ((g.g % 100)), ((g.g % 10)) + -> Function Scan on generate_series g Optimizer: Postgres query optimizer -(21 rows) +(15 rows) create table gs_group_1 as -select g100, g10, sum(g::numeric), count(*), max(g::text) -from gs_data_1 group by cube (g1000, g100,g10); +select g1000, g100, g10, sum(g::numeric), count(*), max(g::text) from + (select g%1000 as g1000, g%100 as g100, g%10 as g10, g + from generate_series(0,199999) g) s +group by cube (g1000,g100,g10) distributed by (g1000); -- Produce results with hash aggregation. set enable_hashagg = true; set enable_sort = false; explain (costs off) -select g100, g10, sum(g::numeric), count(*), max(g::text) -from gs_data_1 group by cube (g1000, g100,g10); +select g1000, g100, g10, sum(g::numeric), count(*), max(g::text) from + (select g%1000 as g1000, g%100 as g100, g%10 as g10, g + from generate_series(0,199999) g) s +group by cube (g1000,g100,g10); QUERY PLAN --------------------------------------------------------------- - Gather Motion 3:1 (slice1; segments: 3) - -> Finalize HashAggregate - Group Key: g1000, g100, g10, (GROUPINGSET_ID()) - -> Redistribute Motion 3:3 (slice2; segments: 3) - Hash Key: g1000, g100, g10, (GROUPINGSET_ID()) - -> Partial MixedAggregate - Hash Key: g1000, g100, g10 - Hash Key: g1000, g100 - Hash Key: g1000 - Hash Key: g100, g10 - Hash Key: g100 - Hash Key: g10, g1000 - Hash Key: g10 - Group Key: () - -> Seq Scan on gs_data_1 +--------------------------------------------------------------- + GroupAggregate + Group Key: ((g.g % 1000)), ((g.g % 100)), ((g.g % 10)) + Group Key: ((g.g % 1000)), ((g.g % 100)) + Group Key: ((g.g % 1000)) + Group Key: () + Sort Key: ((g.g % 100)), ((g.g % 10)) + Group Key: ((g.g % 100)), ((g.g % 10)) + Group Key: ((g.g % 100)) + Sort Key: ((g.g % 10)), ((g.g % 1000)) + Group Key: ((g.g % 10)), ((g.g % 1000)) + Group Key: ((g.g % 10)) + -> Sort + Sort Key: ((g.g % 1000)), ((g.g % 100)), ((g.g % 10)) + -> Function Scan on generate_series g Optimizer: Postgres query optimizer -(16 rows) +(15 rows) create table gs_hash_1 as -select g100, g10, sum(g::numeric), count(*), max(g::text) -from gs_data_1 group by cube (g1000, g100,g10); +select g1000, g100, g10, sum(g::numeric), count(*), max(g::text) from + (select g%1000 as g1000, g%100 as g100, g%10 as g10, g + from generate_series(0,199999) g) s +group by cube (g1000,g100,g10) distributed by (g1000); +set jit_above_cost to default; set enable_sort = true; set work_mem to default; WARNING: "work_mem": setting is deprecated, and may be removed in a future release. --- GPDB_12_MERGE_FIXME: the following comparison query has an ORCA plan that --- relies on "IS NOT DISTINCT FROM" Hash Join, a variant that we likely have --- lost during the merge with upstream Postgres 12. Disable ORCA for this query -SET optimizer TO off; --- Compare results +-- Compare results of ORCA plan that relies on "IS NOT DISTINCT FROM" HASH Join (select * from gs_hash_1 except select * from gs_group_1) union all (select * from gs_group_1 except select * from gs_hash_1); - g100 | g10 | sum | count | max -------+-----+-----+-------+----- + g1000 | g100 | g10 | sum | count | max +-------+------+-----+-----+-------+----- (0 rows) RESET optimizer; diff --git a/src/test/regress/expected/select_distinct.out b/src/test/regress/expected/select_distinct.out index 21b98c3333e..bcfa5237e29 100644 --- a/src/test/regress/expected/select_distinct.out +++ b/src/test/regress/expected/select_distinct.out @@ -324,10 +324,8 @@ SELECT null IS NOT DISTINCT FROM null as "yes"; (1 row) -- join cases --- test IS DISTINCT FROM and IS NOT DISTINCT FROM join qual. the postgres planner doesn't support hash join --- on IS NOT DISTINCT FROM for now, ORCA support "IS NOT DISTINCT FROM" Hash Join but generates wrong result, --- DISABLE ORCA. Please fix me later if ORCA resolve the problem. -SET optimizer TO off; +-- test IS DISTINCT FROM and IS NOT DISTINCT FROM join qual.The postgres planner doesn't support hash join on +-- IS NOT DISTINCT FROM for now, ORCA supports Hash Join on "IS NOT DISTINCT FROM". CREATE TABLE distinct_1(a int); NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Cloudberry Database data distribution key for this table. HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. @@ -349,6 +347,32 @@ EXPLAIN SELECT * FROM distinct_1, distinct_2 WHERE distinct_1.a IS DISTINCT FROM Optimizer: Postgres query optimizer (8 rows) +EXPLAIN SELECT * FROM distinct_1 left join distinct_2 on distinct_1.a IS DISTINCT FROM distinct_2.a; + QUERY PLAN +--------------------------------------------------------------------------------------------------------- + Gather Motion 3:1 (slice1; segments: 3) (cost=10000000000.00..10214778180.80 rows=9264416310 width=8) + -> Nested Loop Left Join (cost=10000000000.00..10091252630.00 rows=3088138770 width=8) + Join Filter: (distinct_1.a IS DISTINCT FROM distinct_2.a) + -> Seq Scan on distinct_1 (cost=0.00..355.00 rows=32100 width=4) + -> Materialize (cost=0.00..2120.50 rows=96300 width=4) + -> Broadcast Motion 3:3 (slice2; segments: 3) (cost=0.00..1639.00 rows=96300 width=4) + -> Seq Scan on distinct_2 (cost=0.00..355.00 rows=32100 width=4) + Optimizer: Postgres query optimizer +(8 rows) + +EXPLAIN SELECT * FROM distinct_1 right join distinct_2 on distinct_1.a IS DISTINCT FROM distinct_2.a; + QUERY PLAN +--------------------------------------------------------------------------------------------------------- + Gather Motion 3:1 (slice1; segments: 3) (cost=10000000000.00..10214778180.80 rows=9264416310 width=8) + -> Nested Loop Left Join (cost=10000000000.00..10091252630.00 rows=3088138770 width=8) + Join Filter: (distinct_1.a IS DISTINCT FROM distinct_2.a) + -> Seq Scan on distinct_2 (cost=0.00..355.00 rows=32100 width=4) + -> Materialize (cost=0.00..2120.50 rows=96300 width=4) + -> Broadcast Motion 3:3 (slice2; segments: 3) (cost=0.00..1639.00 rows=96300 width=4) + -> Seq Scan on distinct_1 (cost=0.00..355.00 rows=32100 width=4) + Optimizer: Postgres query optimizer +(8 rows) + EXPLAIN SELECT * FROM distinct_1, distinct_2 WHERE distinct_1.a IS NOT DISTINCT FROM distinct_2.a; QUERY PLAN -------------------------------------------------------------------------------------------------------- @@ -362,17 +386,76 @@ EXPLAIN SELECT * FROM distinct_1, distinct_2 WHERE distinct_1.a IS NOT DISTINCT Optimizer: Postgres query optimizer (8 rows) +EXPLAIN SELECT * FROM distinct_1 left join distinct_2 on distinct_1.a IS NOT DISTINCT FROM distinct_2.a; + QUERY PLAN +-------------------------------------------------------------------------------------------------------- + Gather Motion 3:1 (slice1; segments: 3) (cost=10000000000.00..10091376279.20 rows=9273690 width=8) + -> Nested Loop Left Join (cost=10000000000.00..10091252630.00 rows=3091230 width=8) + Join Filter: (NOT (distinct_1.a IS DISTINCT FROM distinct_2.a)) + -> Seq Scan on distinct_1 (cost=0.00..355.00 rows=32100 width=4) + -> Materialize (cost=0.00..2120.50 rows=96300 width=4) + -> Broadcast Motion 3:3 (slice2; segments: 3) (cost=0.00..1639.00 rows=96300 width=4) + -> Seq Scan on distinct_2 (cost=0.00..355.00 rows=32100 width=4) + Optimizer: Postgres query optimizer +(8 rows) + +EXPLAIN SELECT * FROM distinct_1 right join distinct_2 on distinct_1.a IS NOT DISTINCT FROM distinct_2.a; + QUERY PLAN +-------------------------------------------------------------------------------------------------------- + Gather Motion 3:1 (slice1; segments: 3) (cost=10000000000.00..10091376279.20 rows=9273690 width=8) + -> Nested Loop Left Join (cost=10000000000.00..10091252630.00 rows=3091230 width=8) + Join Filter: (NOT (distinct_1.a IS DISTINCT FROM distinct_2.a)) + -> Seq Scan on distinct_2 (cost=0.00..355.00 rows=32100 width=4) + -> Materialize (cost=0.00..2120.50 rows=96300 width=4) + -> Broadcast Motion 3:3 (slice2; segments: 3) (cost=0.00..1639.00 rows=96300 width=4) + -> Seq Scan on distinct_1 (cost=0.00..355.00 rows=32100 width=4) + Optimizer: Postgres query optimizer +(8 rows) + SELECT * FROM distinct_1, distinct_2 WHERE distinct_1.a IS DISTINCT FROM distinct_2.a; a | a ---+--- + 2 | + 2 | 1 + | 1 1 | +(4 rows) + +SELECT * FROM distinct_1 left join distinct_2 on distinct_1.a IS DISTINCT FROM distinct_2.a; + a | a +---+--- + 2 | 2 | 1 + | 1 + 1 | +(4 rows) + +SELECT * FROM distinct_1 right join distinct_2 on distinct_1.a IS DISTINCT FROM distinct_2.a; + a | a +---+--- + 1 | 2 | + 2 | 1 | 1 (4 rows) SELECT * FROM distinct_1, distinct_2 WHERE distinct_1.a IS NOT DISTINCT FROM distinct_2.a; a | a +---+--- + 1 | 1 + | +(2 rows) + +SELECT * FROM distinct_1 left join distinct_2 on distinct_1.a IS NOT DISTINCT FROM distinct_2.a; + a | a +---+--- + 1 | 1 + 2 | + | +(3 rows) + +SELECT * FROM distinct_1 right join distinct_2 on distinct_1.a IS NOT DISTINCT FROM distinct_2.a; + a | a ---+--- | 1 | 1 @@ -380,7 +463,6 @@ SELECT * FROM distinct_1, distinct_2 WHERE distinct_1.a IS NOT DISTINCT FROM dis DROP TABLE distinct_1; DROP TABLE distinct_2; -RESET optimizer; -- gpdb start: test inherit/partition table distinct when gp_statistics_pullup_from_child_partition is on set gp_statistics_pullup_from_child_partition to on; CREATE TABLE sales (id int, date date, amt decimal(10,2)) diff --git a/src/test/regress/expected/select_distinct_optimizer.out b/src/test/regress/expected/select_distinct_optimizer.out index 3579654f986..91b868818c9 100644 --- a/src/test/regress/expected/select_distinct_optimizer.out +++ b/src/test/regress/expected/select_distinct_optimizer.out @@ -331,10 +331,8 @@ SELECT null IS NOT DISTINCT FROM null as "yes"; (1 row) -- join cases --- test IS DISTINCT FROM and IS NOT DISTINCT FROM join qual. the postgres planner doesn't support hash join --- on IS NOT DISTINCT FROM for now, ORCA support "IS NOT DISTINCT FROM" Hash Join but generates wrong result, --- DISABLE ORCA. Please fix me later if ORCA resolve the problem. -SET optimizer TO off; +-- test IS DISTINCT FROM and IS NOT DISTINCT FROM join qual.The postgres planner doesn't support hash join on +-- IS NOT DISTINCT FROM for now, ORCA supports Hash Join on "IS NOT DISTINCT FROM". CREATE TABLE distinct_1(a int); NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Cloudberry Database data distribution key for this table. HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. @@ -344,33 +342,90 @@ HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sur INSERT INTO distinct_1 VALUES(1),(2),(NULL); INSERT INTO distinct_2 VALUES(1),(NULL); EXPLAIN SELECT * FROM distinct_1, distinct_2 WHERE distinct_1.a IS DISTINCT FROM distinct_2.a; - QUERY PLAN ---------------------------------------------------------------------------------------------------------- - Gather Motion 3:1 (slice1; segments: 3) (cost=10000000000.00..10214778180.80 rows=9264416310 width=8) - -> Nested Loop (cost=10000000000.00..10091252630.00 rows=3088138770 width=8) + QUERY PLAN +--------------------------------------------------------------------------------------------- + Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..1324032.54 rows=1 width=8) + -> Nested Loop (cost=0.00..1324032.54 rows=1 width=8) Join Filter: (distinct_1.a IS DISTINCT FROM distinct_2.a) - -> Seq Scan on distinct_1 (cost=0.00..355.00 rows=32100 width=4) - -> Materialize (cost=0.00..2120.50 rows=96300 width=4) - -> Broadcast Motion 3:3 (slice2; segments: 3) (cost=0.00..1639.00 rows=96300 width=4) - -> Seq Scan on distinct_2 (cost=0.00..355.00 rows=32100 width=4) - Optimizer: Postgres query optimizer + -> Broadcast Motion 3:3 (slice2; segments: 3) (cost=0.00..431.00 rows=1 width=4) + -> Seq Scan on distinct_1 (cost=0.00..431.00 rows=1 width=4) + -> Seq Scan on distinct_2 (cost=0.00..431.00 rows=1 width=4) + Optimizer: Pivotal Optimizer (GPORCA) +(7 rows) + +EXPLAIN SELECT * FROM distinct_1 left join distinct_2 on distinct_1.a IS DISTINCT FROM distinct_2.a; + QUERY PLAN +--------------------------------------------------------------------------------------------------- + Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..1324032.55 rows=2 width=8) + -> Nested Loop Left Join (cost=0.00..1324032.55 rows=1 width=8) + Join Filter: (distinct_1.a IS DISTINCT FROM distinct_2.a) + -> Seq Scan on distinct_1 (cost=0.00..431.00 rows=1 width=4) + -> Materialize (cost=0.00..431.00 rows=1 width=4) + -> Broadcast Motion 3:3 (slice2; segments: 3) (cost=0.00..431.00 rows=1 width=4) + -> Seq Scan on distinct_2 (cost=0.00..431.00 rows=1 width=4) + Optimizer: Pivotal Optimizer (GPORCA) (8 rows) -EXPLAIN SELECT * FROM distinct_1, distinct_2 WHERE distinct_1.a IS NOT DISTINCT FROM distinct_2.a; - QUERY PLAN --------------------------------------------------------------------------------------------------------- - Gather Motion 3:1 (slice1; segments: 3) (cost=10000000000.00..10091376279.20 rows=9273690 width=8) - -> Nested Loop (cost=10000000000.00..10091252630.00 rows=3091230 width=8) - Join Filter: (NOT (distinct_1.a IS DISTINCT FROM distinct_2.a)) - -> Seq Scan on distinct_1 (cost=0.00..355.00 rows=32100 width=4) - -> Materialize (cost=0.00..2120.50 rows=96300 width=4) - -> Broadcast Motion 3:3 (slice2; segments: 3) (cost=0.00..1639.00 rows=96300 width=4) - -> Seq Scan on distinct_2 (cost=0.00..355.00 rows=32100 width=4) - Optimizer: Postgres query optimizer +EXPLAIN SELECT * FROM distinct_1 right join distinct_2 on distinct_1.a IS DISTINCT FROM distinct_2.a; + QUERY PLAN +--------------------------------------------------------------------------------------------------- + Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..1324032.55 rows=2 width=8) + -> Nested Loop Left Join (cost=0.00..1324032.55 rows=1 width=8) + Join Filter: (distinct_1.a IS DISTINCT FROM distinct_2.a) + -> Seq Scan on distinct_2 (cost=0.00..431.00 rows=1 width=4) + -> Materialize (cost=0.00..431.00 rows=1 width=4) + -> Broadcast Motion 3:3 (slice2; segments: 3) (cost=0.00..431.00 rows=1 width=4) + -> Seq Scan on distinct_1 (cost=0.00..431.00 rows=1 width=4) + Optimizer: Pivotal Optimizer (GPORCA) (8 rows) +EXPLAIN SELECT * FROM distinct_1, distinct_2 WHERE distinct_1.a IS NOT DISTINCT FROM distinct_2.a; + QUERY PLAN +------------------------------------------------------------------------------ + Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..862.00 rows=1 width=8) + -> Hash Join (cost=0.00..862.00 rows=1 width=8) + Hash Cond: (NOT (distinct_1.a IS DISTINCT FROM distinct_2.a)) + -> Seq Scan on distinct_1 (cost=0.00..431.00 rows=1 width=4) + -> Hash (cost=431.00..431.00 rows=1 width=4) + -> Seq Scan on distinct_2 (cost=0.00..431.00 rows=1 width=4) + Optimizer: Pivotal Optimizer (GPORCA) +(7 rows) + +EXPLAIN SELECT * FROM distinct_1 left join distinct_2 on distinct_1.a IS NOT DISTINCT FROM distinct_2.a; + QUERY PLAN +------------------------------------------------------------------------------ + Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..862.00 rows=2 width=8) + -> Hash Left Join (cost=0.00..862.00 rows=1 width=8) + Hash Cond: (NOT (distinct_1.a IS DISTINCT FROM distinct_2.a)) + -> Seq Scan on distinct_1 (cost=0.00..431.00 rows=1 width=4) + -> Hash (cost=431.00..431.00 rows=1 width=4) + -> Seq Scan on distinct_2 (cost=0.00..431.00 rows=1 width=4) + Optimizer: Pivotal Optimizer (GPORCA) +(7 rows) + +EXPLAIN SELECT * FROM distinct_1 right join distinct_2 on distinct_1.a IS NOT DISTINCT FROM distinct_2.a; + QUERY PLAN +------------------------------------------------------------------------------ + Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..862.00 rows=2 width=8) + -> Hash Left Join (cost=0.00..862.00 rows=1 width=8) + Hash Cond: (NOT (distinct_2.a IS DISTINCT FROM distinct_1.a)) + -> Seq Scan on distinct_2 (cost=0.00..431.00 rows=1 width=4) + -> Hash (cost=431.00..431.00 rows=1 width=4) + -> Seq Scan on distinct_1 (cost=0.00..431.00 rows=1 width=4) + Optimizer: Pivotal Optimizer (GPORCA) +(7 rows) + SELECT * FROM distinct_1, distinct_2 WHERE distinct_1.a IS DISTINCT FROM distinct_2.a; a | a +---+--- + 1 | + 2 | + 2 | 1 + | 1 +(4 rows) + +SELECT * FROM distinct_1 left join distinct_2 on distinct_1.a IS DISTINCT FROM distinct_2.a; + a | a ---+--- 2 | 2 | 1 @@ -378,6 +433,15 @@ SELECT * FROM distinct_1, distinct_2 WHERE distinct_1.a IS DISTINCT FROM distinc 1 | (4 rows) +SELECT * FROM distinct_1 right join distinct_2 on distinct_1.a IS DISTINCT FROM distinct_2.a; + a | a +---+--- + 2 | 1 + | 1 + 2 | + 1 | +(4 rows) + SELECT * FROM distinct_1, distinct_2 WHERE distinct_1.a IS NOT DISTINCT FROM distinct_2.a; a | a ---+--- @@ -385,9 +449,23 @@ SELECT * FROM distinct_1, distinct_2 WHERE distinct_1.a IS NOT DISTINCT FROM dis 1 | 1 (2 rows) +SELECT * FROM distinct_1 left join distinct_2 on distinct_1.a IS NOT DISTINCT FROM distinct_2.a; + a | a +---+--- + 1 | 1 + 2 | + | +(3 rows) + +SELECT * FROM distinct_1 right join distinct_2 on distinct_1.a IS NOT DISTINCT FROM distinct_2.a; + a | a +---+--- + 1 | 1 + | +(2 rows) + DROP TABLE distinct_1; DROP TABLE distinct_2; -RESET optimizer; -- gpdb start: test inherit/partition table distinct when gp_statistics_pullup_from_child_partition is on set gp_statistics_pullup_from_child_partition to on; CREATE TABLE sales (id int, date date, amt decimal(10,2)) diff --git a/src/test/regress/sql/groupingsets.sql b/src/test/regress/sql/groupingsets.sql index 603c2a4a80e..d79815dd523 100644 --- a/src/test/regress/sql/groupingsets.sql +++ b/src/test/regress/sql/groupingsets.sql @@ -593,12 +593,17 @@ set enable_hashagg = false; set jit_above_cost = 0; explain (costs off) -select g100, g10, sum(g::numeric), count(*), max(g::text) -from gs_data_1 group by cube (g1000, g100,g10); +select g1000, g100, g10, sum(g::numeric), count(*), max(g::text) from + (select g%1000 as g1000, g%100 as g100, g%10 as g10, g + from generate_series(0,199999) g) s +group by cube (g1000,g100,g10); create table gs_group_1 as -select g100, g10, sum(g::numeric), count(*), max(g::text) -from gs_data_1 group by cube (g1000, g100,g10); +select g1000, g100, g10, sum(g::numeric), count(*), max(g::text) from + (select g%1000 as g1000, g%100 as g100, g%10 as g10, g + from generate_series(0,199999) g) s +group by cube (g1000,g100,g10) distributed by (g1000); + -- Produce results with hash aggregation. @@ -606,21 +611,23 @@ set enable_hashagg = true; set enable_sort = false; explain (costs off) -select g100, g10, sum(g::numeric), count(*), max(g::text) -from gs_data_1 group by cube (g1000, g100,g10); +select g1000, g100, g10, sum(g::numeric), count(*), max(g::text) from + (select g%1000 as g1000, g%100 as g100, g%10 as g10, g + from generate_series(0,199999) g) s +group by cube (g1000,g100,g10); create table gs_hash_1 as -select g100, g10, sum(g::numeric), count(*), max(g::text) -from gs_data_1 group by cube (g1000, g100,g10); +select g1000, g100, g10, sum(g::numeric), count(*), max(g::text) from + (select g%1000 as g1000, g%100 as g100, g%10 as g10, g + from generate_series(0,199999) g) s +group by cube (g1000,g100,g10) distributed by (g1000); + +set jit_above_cost to default; + set enable_sort = true; set work_mem to default; --- GPDB_12_MERGE_FIXME: the following comparison query has an ORCA plan that --- relies on "IS NOT DISTINCT FROM" Hash Join, a variant that we likely have --- lost during the merge with upstream Postgres 12. Disable ORCA for this query -SET optimizer TO off; - --- Compare results +-- Compare results of ORCA plan that relies on "IS NOT DISTINCT FROM" HASH Join (select * from gs_hash_1 except select * from gs_group_1) union all diff --git a/src/test/regress/sql/select_distinct.sql b/src/test/regress/sql/select_distinct.sql index c5d5b1245d0..27b63e699a0 100644 --- a/src/test/regress/sql/select_distinct.sql +++ b/src/test/regress/sql/select_distinct.sql @@ -139,21 +139,33 @@ SELECT 2 IS NOT DISTINCT FROM null as "no"; SELECT null IS NOT DISTINCT FROM null as "yes"; -- join cases --- test IS DISTINCT FROM and IS NOT DISTINCT FROM join qual. the postgres planner doesn't support hash join --- on IS NOT DISTINCT FROM for now, ORCA support "IS NOT DISTINCT FROM" Hash Join but generates wrong result, --- DISABLE ORCA. Please fix me later if ORCA resolve the problem. -SET optimizer TO off; +-- test IS DISTINCT FROM and IS NOT DISTINCT FROM join qual.The postgres planner doesn't support hash join on +-- IS NOT DISTINCT FROM for now, ORCA supports Hash Join on "IS NOT DISTINCT FROM". + CREATE TABLE distinct_1(a int); CREATE TABLE distinct_2(a int); INSERT INTO distinct_1 VALUES(1),(2),(NULL); INSERT INTO distinct_2 VALUES(1),(NULL); + EXPLAIN SELECT * FROM distinct_1, distinct_2 WHERE distinct_1.a IS DISTINCT FROM distinct_2.a; +EXPLAIN SELECT * FROM distinct_1 left join distinct_2 on distinct_1.a IS DISTINCT FROM distinct_2.a; +EXPLAIN SELECT * FROM distinct_1 right join distinct_2 on distinct_1.a IS DISTINCT FROM distinct_2.a; + EXPLAIN SELECT * FROM distinct_1, distinct_2 WHERE distinct_1.a IS NOT DISTINCT FROM distinct_2.a; +EXPLAIN SELECT * FROM distinct_1 left join distinct_2 on distinct_1.a IS NOT DISTINCT FROM distinct_2.a; +EXPLAIN SELECT * FROM distinct_1 right join distinct_2 on distinct_1.a IS NOT DISTINCT FROM distinct_2.a; + SELECT * FROM distinct_1, distinct_2 WHERE distinct_1.a IS DISTINCT FROM distinct_2.a; +SELECT * FROM distinct_1 left join distinct_2 on distinct_1.a IS DISTINCT FROM distinct_2.a; +SELECT * FROM distinct_1 right join distinct_2 on distinct_1.a IS DISTINCT FROM distinct_2.a; + SELECT * FROM distinct_1, distinct_2 WHERE distinct_1.a IS NOT DISTINCT FROM distinct_2.a; +SELECT * FROM distinct_1 left join distinct_2 on distinct_1.a IS NOT DISTINCT FROM distinct_2.a; +SELECT * FROM distinct_1 right join distinct_2 on distinct_1.a IS NOT DISTINCT FROM distinct_2.a; + DROP TABLE distinct_1; DROP TABLE distinct_2; -RESET optimizer; + -- gpdb start: test inherit/partition table distinct when gp_statistics_pullup_from_child_partition is on set gp_statistics_pullup_from_child_partition to on;