Bring back cdbsubselect_drop_distinct. (#14322)

Previous commit ebb691a03 is to fix ANY sublink contains distinct-on syntax on partial exprs in the target list of subselect. That fix method simply not to drop distinct or order-by clause and might lead to some sublink cannot be pulled up and bad performance, below is an example of such regression: select a from t where a in (select b from t1 group by b) For the above SQL, if we do not drop the group-by clause, then later planner will take the subselect as not simple and fail to pull it up to a join. This commit fixes the regression by: 1. revert the logic of ebb691a03 2. fix the issue #12656 using a correct way: if the distinct on clause's length is not the same as the length of subselect's targetlist, we do not drop the distinct clause. Authored-by: Jian Guo <gjian@vmware.com> Co-authored-by: Zhenghua Lyu <kainwen@gmail.com>
apache · Dec 27, 2024 · cce0937 · cce0937
1 parent eb3216b
commit cce0937
Show file tree

Hide file tree

Showing 13 changed files with 206 additions and 176 deletions.
diff --git a/src/backend/cdb/cdbsubselect.c b/src/backend/cdb/cdbsubselect.c
@@ -466,7 +466,9 @@ cdbsubselect_drop_distinct(Query *subselect)
 		subselect->limitOffset == NULL)
 	{
 		/* Delete DISTINCT. */
-		subselect->distinctClause = NIL;
+		if (!subselect->hasDistinctOn ||
+			list_length(subselect->distinctClause) == list_length(subselect->targetList))
+			subselect->distinctClause = NIL;
 
 		/* Delete GROUP BY if subquery has no aggregates and no HAVING. */
 		if (!subselect->hasAggs &&
@@ -485,7 +487,9 @@ cdbsubselect_drop_orderby(Query *subselect)
 		subselect->limitOffset == NULL)
 	{
 		/* Delete ORDER BY. */
-		subselect->sortClause = NIL;
+		if (!subselect->hasDistinctOn ||
+			list_length(subselect->distinctClause) == list_length(subselect->targetList))
+			subselect->sortClause = NIL;
 	}
 }	/* cdbsubselect_drop_orderby */
 
@@ -1500,6 +1504,20 @@ convert_IN_to_antijoin(PlannerInfo *root, SubLink *sublink,
 
 	if (safe_to_convert_NOTIN(root, sublink, available_rels))
 	{
+		/* Delete ORDER BY and DISTINCT.
+		 *
+		 * There is no need to do the group-by or order-by inside the
+		 * subquery, if we have decided to pull up the sublink. For the
+		 * group-by case, after the sublink pull-up, there will be a semi-join
+		 * plan node generated in top level, which will weed out duplicate
+		 * tuples naturally. For the order-by case, after the sublink pull-up,
+		 * the subquery will become a jointree, inside which the tuples' order
+		 * doesn't matter. In a summary, it's safe to elimate the group-by or
+		 * order-by causes here.
+		 */
+		cdbsubselect_drop_orderby(subselect);
+		cdbsubselect_drop_distinct(subselect);
+
 		int			subq_indx      = add_notin_subquery_rte(parse, subselect);
 		List       *inner_exprs    = NIL;
 		List       *outer_exprs    = NIL;

diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c
@@ -1538,6 +1538,20 @@ convert_ANY_sublink_to_join(PlannerInfo *root, SubLink *sublink,
 	Assert(sublink->subLinkType == ANY_SUBLINK);
 	Assert(IsA(subselect, Query));
 
+	/* Delete ORDER BY and DISTINCT.
+	 *
+	 * There is no need to do the group-by or order-by inside the
+	 * subquery, if we have decided to pull up the sublink. For the
+	 * group-by case, after the sublink pull-up, there will be a semi-join
+	 * plan node generated in top level, which will weed out duplicate
+	 * tuples naturally. For the order-by case, after the sublink pull-up,
+	 * the subquery will become a jointree, inside which the tuples' order
+	 * doesn't matter. In a summary, it's safe to elimate the group-by or
+	 * order-by causes here.
+	*/
+	cdbsubselect_drop_orderby(subselect);
+	cdbsubselect_drop_distinct(subselect);
+
 	/*
 	 * If uncorrelated, and no Var nodes on lhs, the subquery will be executed
 	 * only once.  It should become an InitPlan, but make_subplan() doesn't
@@ -1548,20 +1562,6 @@ convert_ANY_sublink_to_join(PlannerInfo *root, SubLink *sublink,
 
 	if (correlated)
 	{
-		/* Delete ORDER BY and DISTINCT. 
-		 *
-		 * There is no need to do the group-by or order-by inside the
-		 * subquery, if we have decided to pull up the sublink (when the
-		 * subquery is correlated). For the group-by case, after the sublink
-		 * pull-up, there will be a semi-join plan node generated in top
-		 * level, which will weed out duplicate tuples naturally. For the
-		 * order-by case, after the sublink pull-up, the subquery will become
-		 * a jointree, inside which the tuples' order doesn't matter. In a
-		 * summary, it's safe to elimate the group-by or order-by causes here.
-		*/
-		cdbsubselect_drop_orderby(subselect);
-		cdbsubselect_drop_distinct(subselect);
-
 		/*
 		 * Under certain conditions, we cannot pull up the subquery as a join.
 		 */

diff --git a/src/test/regress/expected/notin.out b/src/test/regress/expected/notin.out
@@ -400,30 +400,26 @@ select a,b from g1 where (a,b) not in
 --
 explain select x,y from l1 where (x,y) not in
 	(select distinct y, sum(x) from l1 group by y having y < 4 order by y) order by 1,2;
-                                                         QUERY PLAN                                                         
-----------------------------------------------------------------------------------------------------------------------------
- Gather Motion 3:1  (slice1; segments: 3)  (cost=10000000002.35..10000000002.40 rows=3 width=8)
+                                                         QUERY PLAN                                                          
+-----------------------------------------------------------------------------------------------------------------------------
+ Gather Motion 3:1  (slice3; segments: 3)  (cost=10000000008.02..10000000008.03 rows=4 width=8)
    Merge Key: l1.x, l1.y
    ->  Sort  (cost=10000000002.35..10000000002.36 rows=1 width=8)
          Sort Key: l1.x, l1.y
-         ->  Nested Loop Left Anti Semi (Not-In) Join  (cost=10000000001.07..10000000002.34 rows=1 width=8)
+         ->  Nested Loop Left Anti Semi (Not-In) Join  (cost=10000000003.25..10000000007.99 rows=2 width=8)
                Join Filter: ((l1.x = "NotIn_SUBQUERY".y) AND (l1.y = "NotIn_SUBQUERY".sum))
-               ->  Seq Scan on l1  (cost=0.00..1.03 rows=3 width=8)
-               ->  Materialize  (cost=1.09..1.16 rows=3 width=12)
-                     ->  Broadcast Motion 3:3  (slice2; segments: 3)  (cost=1.09..1.15 rows=3 width=12)
-                           ->  Subquery Scan on "NotIn_SUBQUERY"  (cost=1.09..1.11 rows=1 width=12)
-                                 ->  Unique  (cost=1.09..1.10 rows=1 width=16)
-                                       Group Key: l1_1.y, (sum(l1_1.x))
-                                       ->  Sort  (cost=1.09..1.09 rows=1 width=16)
-                                             Sort Key: l1_1.y, (sum(l1_1.x))
-                                             ->  HashAggregate  (cost=1.07..1.08 rows=1 width=16)
-                                                   Group Key: l1_1.y
-                                                   ->  Redistribute Motion 3:3  (slice3; segments: 3)  (cost=0.00..1.06 rows=1 width=8)
-                                                         Hash Key: l1_1.y
-                                                         ->  Seq Scan on l1 l1_1  (cost=0.00..1.04 rows=1 width=8)
-                                                               Filter: (y < 4)
+               ->  Seq Scan on l1  (cost=0.00..3.10 rows=4 width=8)
+               ->  Materialize  (cost=3.25..3.47 rows=3 width=12)
+                     ->  Broadcast Motion 3:3  (slice2; segments: 3)  (cost=3.25..3.43 rows=3 width=12)
+                           ->  Subquery Scan on "NotIn_SUBQUERY"  (cost=3.25..3.31 rows=1 width=12)
+                                 ->  HashAggregate  (cost=3.25..3.28 rows=1 width=16)
+                                       Group Key: l1_1.y
+                                       ->  Redistribute Motion 3:3  (slice1; segments: 3)  (cost=3.14..3.20 rows=1 width=12)
+                                             Hash Key: l1_1.y
+                                                   ->  Seq Scan on l1 l1_1  (cost=0.00..3.12 rows=2 width=8)
+                                                         Filter: (y < 4)
  Optimizer: Postgres query optimizer
-(21 rows)
+(17 rows)
 
 select x,y from l1 where (x,y) not in
 	(select distinct y, sum(x) from l1 group by y having y < 4 order by y) order by 1,2;

diff --git a/src/test/regress/expected/notin_optimizer.out b/src/test/regress/expected/notin_optimizer.out
@@ -407,30 +407,26 @@ select a,b from g1 where (a,b) not in
 --
 explain select x,y from l1 where (x,y) not in
 	(select distinct y, sum(x) from l1 group by y having y < 4 order by y) order by 1,2;
-                                                         QUERY PLAN                                                         
-----------------------------------------------------------------------------------------------------------------------------
- Gather Motion 3:1  (slice1; segments: 3)  (cost=10000000002.35..10000000002.40 rows=3 width=8)
+                                                         QUERY PLAN                                                          
+-----------------------------------------------------------------------------------------------------------------------------
+ Gather Motion 3:1  (slice3; segments: 3)  (cost=10000000008.02..10000000008.03 rows=4 width=8)
    Merge Key: l1.x, l1.y
    ->  Sort  (cost=10000000002.35..10000000002.36 rows=1 width=8)
          Sort Key: l1.x, l1.y
-         ->  Nested Loop Left Anti Semi (Not-In) Join  (cost=10000000001.07..10000000002.34 rows=1 width=8)
+         ->  Nested Loop Left Anti Semi (Not-In) Join  (cost=10000000003.25..10000000007.99 rows=2 width=8)
                Join Filter: ((l1.x = "NotIn_SUBQUERY".y) AND (l1.y = "NotIn_SUBQUERY".sum))
-               ->  Seq Scan on l1  (cost=0.00..1.03 rows=3 width=8)
-               ->  Materialize  (cost=1.09..1.16 rows=3 width=12)
-                     ->  Broadcast Motion 3:3  (slice2; segments: 3)  (cost=1.09..1.15 rows=3 width=12)
-                           ->  Subquery Scan on "NotIn_SUBQUERY"  (cost=1.09..1.11 rows=1 width=12)
-                                 ->  Unique  (cost=1.09..1.10 rows=1 width=16)
-                                       Group Key: l1_1.y, (sum(l1_1.x))
-                                       ->  Sort  (cost=1.09..1.09 rows=1 width=16)
-                                             Sort Key: l1_1.y, (sum(l1_1.x))
-                                             ->  HashAggregate  (cost=1.07..1.08 rows=1 width=16)
-                                                   Group Key: l1_1.y
-                                                   ->  Redistribute Motion 3:3  (slice3; segments: 3)  (cost=0.00..1.06 rows=1 width=8)
-                                                         Hash Key: l1_1.y
-                                                         ->  Seq Scan on l1 l1_1  (cost=0.00..1.04 rows=1 width=8)
-                                                               Filter: (y < 4)
+               ->  Seq Scan on l1  (cost=0.00..3.10 rows=4 width=8)
+               ->  Materialize  (cost=3.25..3.47 rows=3 width=12)
+                     ->  Broadcast Motion 3:3  (slice2; segments: 3)  (cost=3.25..3.43 rows=3 width=12)
+                           ->  Subquery Scan on "NotIn_SUBQUERY"  (cost=3.25..3.31 rows=1 width=12)
+                                 ->  HashAggregate  (cost=3.25..3.28 rows=1 width=16)
+                                       Group Key: l1_1.y
+                                       ->  Redistribute Motion 3:3  (slice1; segments: 3)  (cost=3.14..3.20 rows=1 width=12)
+                                             Hash Key: l1_1.y
+                                                   ->  Seq Scan on l1 l1_1  (cost=0.00..3.12 rows=2 width=8)
+                                                         Filter: (y < 4)
  Optimizer: Postgres query optimizer
-(21 rows)
+(19 rows)
 
 select x,y from l1 where (x,y) not in
 	(select distinct y, sum(x) from l1 group by y having y < 4 order by y) order by 1,2;