[FIXME] Wrong results on main branch for INDF query (#180)
GPDB_12_MERGE_FIXME: the following comparison query has an ORCA plan that
relies on an "IS NOT DISTINCT FROM" Hash Join, a variant that we likely
lost during the merge with upstream Postgres 12. Disable ORCA for this query.

Issue: Hash Join (left, right, inner) removes NULL tuples when there is
an INDF qual in the query.

The hashqualclauses field in the HashJoin node holds the "IS NOT
DISTINCT FROM" expressions that test for a match. It is normally identical
to hashclauses, but differs in the case of a non-equijoin comparison.

While scanning the hash bucket we were using hashclauses, so we missed
the "IS NOT DISTINCT FROM" expressions.
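
A minimal sketch of the failure mode, with hypothetical throwaway tables
(t1/t2 are illustrations, not part of this commit's tests): an INDF join
must treat two NULLs as a match, so probing the hash bucket with the plain
'=' hashclauses silently drops the NULL-to-NULL row.

    -- Hypothetical reproducer; assumes the planner implements the INDF
    -- qual as a hash join, as the ORCA plan described above does.
    create table t1 (a int);
    create table t2 (b int);
    insert into t1 values (1), (NULL);
    insert into t2 values (1), (NULL);
    -- Must return (1,1) and (NULL,NULL); scanning the hash bucket with
    -- hashclauses ('=') instead of hashqualclauses loses (NULL,NULL).
    select * from t1 join t2 on t1.a is not distinct from t2.b;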

Co-authored-by: hari krishna <hmaddileti@vmware.com>
2 people authored and baotingfang committed Dec 1, 2023
1 parent 485cc72 commit 865e441
Showing 7 changed files with 344 additions and 159 deletions.
14 changes: 12 additions & 2 deletions src/backend/executor/nodeHash.c
@@ -2192,7 +2192,12 @@ bool
ExecScanHashBucket(HashState *hashState, HashJoinState *hjstate,
ExprContext *econtext)
{
ExprState *hjclauses = hjstate->hashclauses;
/*
* Greenplum specific behavior.
* Using hashqualclauses to support hash join on 'IS NOT DISTINCT FROM'
* as well as '='.
*/
ExprState *hjclauses = hjstate->hashqualclauses;
HashJoinTable hashtable = hjstate->hj_HashTable;
HashJoinTuple hashTuple = hjstate->hj_CurTuple;
uint32 hashvalue = hjstate->hj_CurHashValue;
@@ -2253,7 +2258,12 @@ bool
ExecParallelScanHashBucket(HashState *hashState, HashJoinState *hjstate,
ExprContext *econtext)
{
ExprState *hjclauses = hjstate->hashclauses;
/*
* Greenplum specific behavior.
* Using hashqualclauses to support hash join on 'IS NOT DISTINCT FROM'
* as well as '='.
*/
ExprState *hjclauses = hjstate->hashqualclauses;
HashJoinTable hashtable = hjstate->hj_HashTable;
HashJoinTuple hashTuple = hjstate->hj_CurTuple;
uint32 hashvalue = hjstate->hj_CurHashValue;
106 changes: 52 additions & 54 deletions src/test/regress/expected/groupingsets.out
@@ -2102,78 +2102,76 @@ set enable_sort = true;
set enable_hashagg = false;
set jit_above_cost = 0;
explain (costs off)
select g100, g10, sum(g::numeric), count(*), max(g::text)
from gs_data_1 group by cube (g1000, g100,g10);
select g1000, g100, g10, sum(g::numeric), count(*), max(g::text) from
(select g%1000 as g1000, g%100 as g100, g%10 as g10, g
from generate_series(0,199999) g) s
group by cube (g1000,g100,g10);
QUERY PLAN
--------------------------------------------------------------
Finalize GroupAggregate
Group Key: g1000, g100, g10, (GROUPINGSET_ID())
-> Gather Motion 3:1 (slice1; segments: 3)
Merge Key: g1000, g100, g10, (GROUPINGSET_ID())
-> Sort
Sort Key: g1000, g100, g10, (GROUPINGSET_ID())
-> Partial GroupAggregate
Group Key: g1000, g100, g10
Group Key: g1000, g100
Group Key: g1000
Group Key: ()
Sort Key: g100, g10
Group Key: g100, g10
Group Key: g100
Sort Key: g10, g1000
Group Key: g10, g1000
Group Key: g10
-> Sort
Sort Key: g1000, g100, g10
-> Seq Scan on gs_data_1
---------------------------------------------------------------
GroupAggregate
Group Key: ((g.g % 1000)), ((g.g % 100)), ((g.g % 10))
Group Key: ((g.g % 1000)), ((g.g % 100))
Group Key: ((g.g % 1000))
Group Key: ()
Sort Key: ((g.g % 100)), ((g.g % 10))
Group Key: ((g.g % 100)), ((g.g % 10))
Group Key: ((g.g % 100))
Sort Key: ((g.g % 10)), ((g.g % 1000))
Group Key: ((g.g % 10)), ((g.g % 1000))
Group Key: ((g.g % 10))
-> Sort
Sort Key: ((g.g % 1000)), ((g.g % 100)), ((g.g % 10))
-> Function Scan on generate_series g
Optimizer: Postgres query optimizer
(21 rows)
(15 rows)

create table gs_group_1 as
select g100, g10, sum(g::numeric), count(*), max(g::text)
from gs_data_1 group by cube (g1000, g100,g10);
select g1000, g100, g10, sum(g::numeric), count(*), max(g::text) from
(select g%1000 as g1000, g%100 as g100, g%10 as g10, g
from generate_series(0,199999) g) s
group by cube (g1000,g100,g10) distributed by (g1000);
-- Produce results with hash aggregation.
set enable_hashagg = true;
set enable_sort = false;
explain (costs off)
select g100, g10, sum(g::numeric), count(*), max(g::text)
from gs_data_1 group by cube (g1000, g100,g10);
select g1000, g100, g10, sum(g::numeric), count(*), max(g::text) from
(select g%1000 as g1000, g%100 as g100, g%10 as g10, g
from generate_series(0,199999) g) s
group by cube (g1000,g100,g10);
QUERY PLAN
--------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3)
-> Finalize HashAggregate
Group Key: g1000, g100, g10, (GROUPINGSET_ID())
-> Redistribute Motion 3:3 (slice2; segments: 3)
Hash Key: g1000, g100, g10, (GROUPINGSET_ID())
-> Partial MixedAggregate
Hash Key: g1000, g100, g10
Hash Key: g1000, g100
Hash Key: g1000
Hash Key: g100, g10
Hash Key: g100
Hash Key: g10, g1000
Hash Key: g10
Group Key: ()
-> Seq Scan on gs_data_1
---------------------------------------------------------------
GroupAggregate
Group Key: ((g.g % 1000)), ((g.g % 100)), ((g.g % 10))
Group Key: ((g.g % 1000)), ((g.g % 100))
Group Key: ((g.g % 1000))
Group Key: ()
Sort Key: ((g.g % 100)), ((g.g % 10))
Group Key: ((g.g % 100)), ((g.g % 10))
Group Key: ((g.g % 100))
Sort Key: ((g.g % 10)), ((g.g % 1000))
Group Key: ((g.g % 10)), ((g.g % 1000))
Group Key: ((g.g % 10))
-> Sort
Sort Key: ((g.g % 1000)), ((g.g % 100)), ((g.g % 10))
-> Function Scan on generate_series g
Optimizer: Postgres query optimizer
(16 rows)
(15 rows)

create table gs_hash_1 as
select g100, g10, sum(g::numeric), count(*), max(g::text)
from gs_data_1 group by cube (g1000, g100,g10);
select g1000, g100, g10, sum(g::numeric), count(*), max(g::text) from
(select g%1000 as g1000, g%100 as g100, g%10 as g10, g
from generate_series(0,199999) g) s
group by cube (g1000,g100,g10) distributed by (g1000);
set jit_above_cost to default;
set enable_sort = true;
set work_mem to default;
WARNING: "work_mem": setting is deprecated, and may be removed in a future release.
-- GPDB_12_MERGE_FIXME: the following comparison query has an ORCA plan that
-- relies on "IS NOT DISTINCT FROM" Hash Join, a variant that we likely have
-- lost during the merge with upstream Postgres 12. Disable ORCA for this query
SET optimizer TO off;
-- Compare results
-- Compare results of ORCA plan that relies on "IS NOT DISTINCT FROM" HASH Join
(select * from gs_hash_1 except select * from gs_group_1)
union all
(select * from gs_group_1 except select * from gs_hash_1);
g100 | g10 | sum | count | max
------+-----+-----+-------+-----
g1000 | g100 | g10 | sum | count | max
-------+------+-----+-----+-------+-----
(0 rows)

RESET optimizer;
106 changes: 52 additions & 54 deletions src/test/regress/expected/groupingsets_optimizer.out
@@ -2248,78 +2248,76 @@ set enable_sort = true;
set enable_hashagg = false;
set jit_above_cost = 0;
explain (costs off)
select g100, g10, sum(g::numeric), count(*), max(g::text)
from gs_data_1 group by cube (g1000, g100,g10);
select g1000, g100, g10, sum(g::numeric), count(*), max(g::text) from
(select g%1000 as g1000, g%100 as g100, g%10 as g10, g
from generate_series(0,199999) g) s
group by cube (g1000,g100,g10);
QUERY PLAN
--------------------------------------------------------------
Finalize GroupAggregate
Group Key: g1000, g100, g10, (GROUPINGSET_ID())
-> Gather Motion 3:1 (slice1; segments: 3)
Merge Key: g1000, g100, g10, (GROUPINGSET_ID())
-> Sort
Sort Key: g1000, g100, g10, (GROUPINGSET_ID())
-> Partial GroupAggregate
Group Key: g1000, g100, g10
Group Key: g1000, g100
Group Key: g1000
Group Key: ()
Sort Key: g100, g10
Group Key: g100, g10
Group Key: g100
Sort Key: g10, g1000
Group Key: g10, g1000
Group Key: g10
-> Sort
Sort Key: g1000, g100, g10
-> Seq Scan on gs_data_1
---------------------------------------------------------------
GroupAggregate
Group Key: ((g.g % 1000)), ((g.g % 100)), ((g.g % 10))
Group Key: ((g.g % 1000)), ((g.g % 100))
Group Key: ((g.g % 1000))
Group Key: ()
Sort Key: ((g.g % 100)), ((g.g % 10))
Group Key: ((g.g % 100)), ((g.g % 10))
Group Key: ((g.g % 100))
Sort Key: ((g.g % 10)), ((g.g % 1000))
Group Key: ((g.g % 10)), ((g.g % 1000))
Group Key: ((g.g % 10))
-> Sort
Sort Key: ((g.g % 1000)), ((g.g % 100)), ((g.g % 10))
-> Function Scan on generate_series g
Optimizer: Postgres query optimizer
(21 rows)
(15 rows)

create table gs_group_1 as
select g100, g10, sum(g::numeric), count(*), max(g::text)
from gs_data_1 group by cube (g1000, g100,g10);
select g1000, g100, g10, sum(g::numeric), count(*), max(g::text) from
(select g%1000 as g1000, g%100 as g100, g%10 as g10, g
from generate_series(0,199999) g) s
group by cube (g1000,g100,g10) distributed by (g1000);
-- Produce results with hash aggregation.
set enable_hashagg = true;
set enable_sort = false;
explain (costs off)
select g100, g10, sum(g::numeric), count(*), max(g::text)
from gs_data_1 group by cube (g1000, g100,g10);
select g1000, g100, g10, sum(g::numeric), count(*), max(g::text) from
(select g%1000 as g1000, g%100 as g100, g%10 as g10, g
from generate_series(0,199999) g) s
group by cube (g1000,g100,g10);
QUERY PLAN
--------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3)
-> Finalize HashAggregate
Group Key: g1000, g100, g10, (GROUPINGSET_ID())
-> Redistribute Motion 3:3 (slice2; segments: 3)
Hash Key: g1000, g100, g10, (GROUPINGSET_ID())
-> Partial MixedAggregate
Hash Key: g1000, g100, g10
Hash Key: g1000, g100
Hash Key: g1000
Hash Key: g100, g10
Hash Key: g100
Hash Key: g10, g1000
Hash Key: g10
Group Key: ()
-> Seq Scan on gs_data_1
---------------------------------------------------------------
GroupAggregate
Group Key: ((g.g % 1000)), ((g.g % 100)), ((g.g % 10))
Group Key: ((g.g % 1000)), ((g.g % 100))
Group Key: ((g.g % 1000))
Group Key: ()
Sort Key: ((g.g % 100)), ((g.g % 10))
Group Key: ((g.g % 100)), ((g.g % 10))
Group Key: ((g.g % 100))
Sort Key: ((g.g % 10)), ((g.g % 1000))
Group Key: ((g.g % 10)), ((g.g % 1000))
Group Key: ((g.g % 10))
-> Sort
Sort Key: ((g.g % 1000)), ((g.g % 100)), ((g.g % 10))
-> Function Scan on generate_series g
Optimizer: Postgres query optimizer
(16 rows)
(15 rows)

create table gs_hash_1 as
select g100, g10, sum(g::numeric), count(*), max(g::text)
from gs_data_1 group by cube (g1000, g100,g10);
select g1000, g100, g10, sum(g::numeric), count(*), max(g::text) from
(select g%1000 as g1000, g%100 as g100, g%10 as g10, g
from generate_series(0,199999) g) s
group by cube (g1000,g100,g10) distributed by (g1000);
set jit_above_cost to default;
set enable_sort = true;
set work_mem to default;
WARNING: "work_mem": setting is deprecated, and may be removed in a future release.
-- GPDB_12_MERGE_FIXME: the following comparison query has an ORCA plan that
-- relies on "IS NOT DISTINCT FROM" Hash Join, a variant that we likely have
-- lost during the merge with upstream Postgres 12. Disable ORCA for this query
SET optimizer TO off;
-- Compare results
-- Compare results of ORCA plan that relies on "IS NOT DISTINCT FROM" HASH Join
(select * from gs_hash_1 except select * from gs_group_1)
union all
(select * from gs_group_1 except select * from gs_hash_1);
g100 | g10 | sum | count | max
------+-----+-----+-------+-----
g1000 | g100 | g10 | sum | count | max
-------+------+-----+-----+-------+-----
(0 rows)

RESET optimizer;