[FIXME] Wrong results on main branch for INDF query (#180)
GPDB_12_MERGE_FIXME: the following comparison query has an ORCA plan that
relies on an "IS NOT DISTINCT FROM" Hash Join, a variant that we likely
lost during the merge with upstream Postgres 12. Disable ORCA for this query.

Issue: Hash Join (left, right, inner) removes NULL tuples when there is
an INDF qual in the query.

The hashqualclauses field in the HashJoin node holds the "IS NOT
DISTINCT FROM" expressions that test for a match. It is normally identical
to hashclauses, but differs in the case of a non-equijoin comparison.

While scanning the hash bucket we were using hashclauses, so we missed
the "IS NOT DISTINCT FROM" expressions.
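
A minimal sketch of the failure mode, with hypothetical throwaway tables
(t1/t2 are illustrations, not part of this commit's tests): an INDF join
must treat two NULLs as a match, so probing the hash bucket with the plain
'=' hashclauses silently drops the NULL-to-NULL row.

    -- Hypothetical reproducer; assumes the planner implements the INDF
    -- qual as a hash join, as the ORCA plan described above does.
    create table t1 (a int);
    create table t2 (b int);
    insert into t1 values (1), (NULL);
    insert into t2 values (1), (NULL);
    -- Must return (1,1) and (NULL,NULL); scanning the hash bucket with
    -- hashclauses ('=') instead of hashqualclauses loses (NULL,NULL).
    select * from t1 join t2 on t1.a is not distinct from t2.b;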

Co-authored-by: hari krishna <hmaddileti@vmware.com>
2 people authored and baotingfang committed Dec 1, 2023
1 parent 485cc72 commit 865e441
Showing 7 changed files with 344 additions and 159 deletions.
14 changes: 12 additions & 2 deletions src/backend/executor/nodeHash.c
@@ -2192,7 +2192,12 @@ bool
ExecScanHashBucket(HashState *hashState, HashJoinState *hjstate,
ExprContext *econtext)
{
ExprState *hjclauses = hjstate->hashclauses;
/*
* Greenplum specific behavior.
* Using hashqualclauses to support hash join on 'IS NOT DISTINCT FROM'
* as well as '='.
*/
ExprState *hjclauses = hjstate->hashqualclauses;
HashJoinTable hashtable = hjstate->hj_HashTable;
HashJoinTuple hashTuple = hjstate->hj_CurTuple;
uint32 hashvalue = hjstate->hj_CurHashValue;
@@ -2253,7 +2258,12 @@ bool
ExecParallelScanHashBucket(HashState *hashState, HashJoinState *hjstate,
ExprContext *econtext)
{
ExprState *hjclauses = hjstate->hashclauses;
/*
* Greenplum specific behavior.
* Using hashqualclauses to support hash join on 'IS NOT DISTINCT FROM'
* as well as '='.
*/
ExprState *hjclauses = hjstate->hashqualclauses;
HashJoinTable hashtable = hjstate->hj_HashTable;
HashJoinTuple hashTuple = hjstate->hj_CurTuple;
uint32 hashvalue = hjstate->hj_CurHashValue;
106 changes: 52 additions & 54 deletions src/test/regress/expected/groupingsets.out
@@ -2102,78 +2102,76 @@ set enable_sort = true;
set enable_hashagg = false;
set jit_above_cost = 0;
explain (costs off)
select g100, g10, sum(g::numeric), count(*), max(g::text)
from gs_data_1 group by cube (g1000, g100,g10);
select g1000, g100, g10, sum(g::numeric), count(*), max(g::text) from
(select g%1000 as g1000, g%100 as g100, g%10 as g10, g
from generate_series(0,199999) g) s
group by cube (g1000,g100,g10);
QUERY PLAN
--------------------------------------------------------------
Finalize GroupAggregate
Group Key: g1000, g100, g10, (GROUPINGSET_ID())
-> Gather Motion 3:1 (slice1; segments: 3)
Merge Key: g1000, g100, g10, (GROUPINGSET_ID())
-> Sort
Sort Key: g1000, g100, g10, (GROUPINGSET_ID())
-> Partial GroupAggregate
Group Key: g1000, g100, g10
Group Key: g1000, g100
Group Key: g1000
Group Key: ()
Sort Key: g100, g10
Group Key: g100, g10
Group Key: g100
Sort Key: g10, g1000
Group Key: g10, g1000
Group Key: g10
-> Sort
Sort Key: g1000, g100, g10
-> Seq Scan on gs_data_1
---------------------------------------------------------------
GroupAggregate
Group Key: ((g.g % 1000)), ((g.g % 100)), ((g.g % 10))
Group Key: ((g.g % 1000)), ((g.g % 100))
Group Key: ((g.g % 1000))
Group Key: ()
Sort Key: ((g.g % 100)), ((g.g % 10))
Group Key: ((g.g % 100)), ((g.g % 10))
Group Key: ((g.g % 100))
Sort Key: ((g.g % 10)), ((g.g % 1000))
Group Key: ((g.g % 10)), ((g.g % 1000))
Group Key: ((g.g % 10))
-> Sort
Sort Key: ((g.g % 1000)), ((g.g % 100)), ((g.g % 10))
-> Function Scan on generate_series g
Optimizer: Postgres query optimizer
(21 rows)
(15 rows)

create table gs_group_1 as
select g100, g10, sum(g::numeric), count(*), max(g::text)
from gs_data_1 group by cube (g1000, g100,g10);
select g1000, g100, g10, sum(g::numeric), count(*), max(g::text) from
(select g%1000 as g1000, g%100 as g100, g%10 as g10, g
from generate_series(0,199999) g) s
group by cube (g1000,g100,g10) distributed by (g1000);
-- Produce results with hash aggregation.
set enable_hashagg = true;
set enable_sort = false;
explain (costs off)
select g100, g10, sum(g::numeric), count(*), max(g::text)
from gs_data_1 group by cube (g1000, g100,g10);
select g1000, g100, g10, sum(g::numeric), count(*), max(g::text) from
(select g%1000 as g1000, g%100 as g100, g%10 as g10, g
from generate_series(0,199999) g) s
group by cube (g1000,g100,g10);
QUERY PLAN
--------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3)
-> Finalize HashAggregate
Group Key: g1000, g100, g10, (GROUPINGSET_ID())
-> Redistribute Motion 3:3 (slice2; segments: 3)
Hash Key: g1000, g100, g10, (GROUPINGSET_ID())
-> Partial MixedAggregate
Hash Key: g1000, g100, g10
Hash Key: g1000, g100
Hash Key: g1000
Hash Key: g100, g10
Hash Key: g100
Hash Key: g10, g1000
Hash Key: g10
Group Key: ()
-> Seq Scan on gs_data_1
---------------------------------------------------------------
GroupAggregate
Group Key: ((g.g % 1000)), ((g.g % 100)), ((g.g % 10))
Group Key: ((g.g % 1000)), ((g.g % 100))
Group Key: ((g.g % 1000))
Group Key: ()
Sort Key: ((g.g % 100)), ((g.g % 10))
Group Key: ((g.g % 100)), ((g.g % 10))
Group Key: ((g.g % 100))
Sort Key: ((g.g % 10)), ((g.g % 1000))
Group Key: ((g.g % 10)), ((g.g % 1000))
Group Key: ((g.g % 10))
-> Sort
Sort Key: ((g.g % 1000)), ((g.g % 100)), ((g.g % 10))
-> Function Scan on generate_series g
Optimizer: Postgres query optimizer
(16 rows)
(15 rows)

create table gs_hash_1 as
select g100, g10, sum(g::numeric), count(*), max(g::text)
from gs_data_1 group by cube (g1000, g100,g10);
select g1000, g100, g10, sum(g::numeric), count(*), max(g::text) from
(select g%1000 as g1000, g%100 as g100, g%10 as g10, g
from generate_series(0,199999) g) s
group by cube (g1000,g100,g10) distributed by (g1000);
set jit_above_cost to default;
set enable_sort = true;
set work_mem to default;
WARNING: "work_mem": setting is deprecated, and may be removed in a future release.
-- GPDB_12_MERGE_FIXME: the following comparison query has an ORCA plan that
-- relies on "IS NOT DISTINCT FROM" Hash Join, a variant that we likely have
-- lost during the merge with upstream Postgres 12. Disable ORCA for this query
SET optimizer TO off;
-- Compare results
-- Compare results of ORCA plan that relies on "IS NOT DISTINCT FROM" HASH Join
(select * from gs_hash_1 except select * from gs_group_1)
union all
(select * from gs_group_1 except select * from gs_hash_1);
g100 | g10 | sum | count | max
------+-----+-----+-------+-----
g1000 | g100 | g10 | sum | count | max
-------+------+-----+-----+-------+-----
(0 rows)

RESET optimizer;
106 changes: 52 additions & 54 deletions src/test/regress/expected/groupingsets_optimizer.out
@@ -2248,78 +2248,76 @@ set enable_sort = true;
set enable_hashagg = false;
set jit_above_cost = 0;
explain (costs off)
select g100, g10, sum(g::numeric), count(*), max(g::text)
from gs_data_1 group by cube (g1000, g100,g10);
select g1000, g100, g10, sum(g::numeric), count(*), max(g::text) from
(select g%1000 as g1000, g%100 as g100, g%10 as g10, g
from generate_series(0,199999) g) s
group by cube (g1000,g100,g10);
QUERY PLAN
--------------------------------------------------------------
Finalize GroupAggregate
Group Key: g1000, g100, g10, (GROUPINGSET_ID())
-> Gather Motion 3:1 (slice1; segments: 3)
Merge Key: g1000, g100, g10, (GROUPINGSET_ID())
-> Sort
Sort Key: g1000, g100, g10, (GROUPINGSET_ID())
-> Partial GroupAggregate
Group Key: g1000, g100, g10
Group Key: g1000, g100
Group Key: g1000
Group Key: ()
Sort Key: g100, g10
Group Key: g100, g10
Group Key: g100
Sort Key: g10, g1000
Group Key: g10, g1000
Group Key: g10
-> Sort
Sort Key: g1000, g100, g10
-> Seq Scan on gs_data_1
---------------------------------------------------------------
GroupAggregate
Group Key: ((g.g % 1000)), ((g.g % 100)), ((g.g % 10))
Group Key: ((g.g % 1000)), ((g.g % 100))
Group Key: ((g.g % 1000))
Group Key: ()
Sort Key: ((g.g % 100)), ((g.g % 10))
Group Key: ((g.g % 100)), ((g.g % 10))
Group Key: ((g.g % 100))
Sort Key: ((g.g % 10)), ((g.g % 1000))
Group Key: ((g.g % 10)), ((g.g % 1000))
Group Key: ((g.g % 10))
-> Sort
Sort Key: ((g.g % 1000)), ((g.g % 100)), ((g.g % 10))
-> Function Scan on generate_series g
Optimizer: Postgres query optimizer
(21 rows)
(15 rows)

create table gs_group_1 as
select g100, g10, sum(g::numeric), count(*), max(g::text)
from gs_data_1 group by cube (g1000, g100,g10);
select g1000, g100, g10, sum(g::numeric), count(*), max(g::text) from
(select g%1000 as g1000, g%100 as g100, g%10 as g10, g
from generate_series(0,199999) g) s
group by cube (g1000,g100,g10) distributed by (g1000);
-- Produce results with hash aggregation.
set enable_hashagg = true;
set enable_sort = false;
explain (costs off)
select g100, g10, sum(g::numeric), count(*), max(g::text)
from gs_data_1 group by cube (g1000, g100,g10);
select g1000, g100, g10, sum(g::numeric), count(*), max(g::text) from
(select g%1000 as g1000, g%100 as g100, g%10 as g10, g
from generate_series(0,199999) g) s
group by cube (g1000,g100,g10);
QUERY PLAN
--------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3)
-> Finalize HashAggregate
Group Key: g1000, g100, g10, (GROUPINGSET_ID())
-> Redistribute Motion 3:3 (slice2; segments: 3)
Hash Key: g1000, g100, g10, (GROUPINGSET_ID())
-> Partial MixedAggregate
Hash Key: g1000, g100, g10
Hash Key: g1000, g100
Hash Key: g1000
Hash Key: g100, g10
Hash Key: g100
Hash Key: g10, g1000
Hash Key: g10
Group Key: ()
-> Seq Scan on gs_data_1
---------------------------------------------------------------
GroupAggregate
Group Key: ((g.g % 1000)), ((g.g % 100)), ((g.g % 10))
Group Key: ((g.g % 1000)), ((g.g % 100))
Group Key: ((g.g % 1000))
Group Key: ()
Sort Key: ((g.g % 100)), ((g.g % 10))
Group Key: ((g.g % 100)), ((g.g % 10))
Group Key: ((g.g % 100))
Sort Key: ((g.g % 10)), ((g.g % 1000))
Group Key: ((g.g % 10)), ((g.g % 1000))
Group Key: ((g.g % 10))
-> Sort
Sort Key: ((g.g % 1000)), ((g.g % 100)), ((g.g % 10))
-> Function Scan on generate_series g
Optimizer: Postgres query optimizer
(16 rows)
(15 rows)

create table gs_hash_1 as
select g100, g10, sum(g::numeric), count(*), max(g::text)
from gs_data_1 group by cube (g1000, g100,g10);
select g1000, g100, g10, sum(g::numeric), count(*), max(g::text) from
(select g%1000 as g1000, g%100 as g100, g%10 as g10, g
from generate_series(0,199999) g) s
group by cube (g1000,g100,g10) distributed by (g1000);
set jit_above_cost to default;
set enable_sort = true;
set work_mem to default;
WARNING: "work_mem": setting is deprecated, and may be removed in a future release.
-- GPDB_12_MERGE_FIXME: the following comparison query has an ORCA plan that
-- relies on "IS NOT DISTINCT FROM" Hash Join, a variant that we likely have
-- lost during the merge with upstream Postgres 12. Disable ORCA for this query
SET optimizer TO off;
-- Compare results
-- Compare results of ORCA plan that relies on "IS NOT DISTINCT FROM" HASH Join
(select * from gs_hash_1 except select * from gs_group_1)
union all
(select * from gs_group_1 except select * from gs_hash_1);
g100 | g10 | sum | count | max
------+-----+-----+-------+-----
g1000 | g100 | g10 | sum | count | max
-------+------+-----+-----+-------+-----
(0 rows)

RESET optimizer;