Support WITHIN GROUP syntax to standardize certain existing aggregate functions (apache#13511)

Garamda · jayzhan211 · web-flow · commit e41c02c6996c · 2025-04-23T18:46:03.000+08:00
* Add within group variable to aggregate function and arguments * Support within group and disable null handling for ordered set aggregate functions (apache#13511) * Refactored function to match updated signature * Modify proto to support within group clause * Modify physical planner and accumulator to support ordered set aggregate function * Support session management for ordered set aggregate functions * Align code, tests, and examples with changes to aggregate function logic * Ensure compatibility with new `within_group` and `order_by` handling. * Adjust tests and examples to align with the new logic. * Fix typo in existing comments * Enhance test * Add test cases for changed signature * Update signature in docs * Fix bug : handle missing within_group when applying children tree node * Change the signature of approx_percentile_cont for consistency * Add missing within_group for expr display * Handle edge case when over and within group clause are used together * Apply clippy advice: avoids too many arguments * Add new test cases using descending order * Apply cargo fmt * Revert unintended submodule changes * Apply prettier guidance * Apply doc guidance by update_function_doc.sh * Rollback WITHIN GROUP and related logic after converting it into expr * Make it not to handle redundant logic * Rollback ordered set aggregate functions from session to save same info in udf itself * Convert within group to order by when converting sql to expr * Add function to determine it is ordered-set aggregate function * Rollback within group from proto * Utilize within group as order by in functions-aggregate * Apply clippy * Convert order by to within group * Apply cargo fmt * Remove plain line breaks * Remove duplicated column arg in schema name * Refactor boolean functions to just return primitive type * Make within group necessary in the signature of existing ordered set aggr funcs * Apply cargo fmt * Support a single ordering expression in the signature * Apply cargo fmt * 
Add dataframe function test cases to verify descending ordering * Apply cargo fmt * Apply code reviews * Uses order by consistently after done with sql * Remove redundant comment * Serve more clear error msg * Handle error cases in the same code block * Update error msg in test as corresponding code changed * fix --------- Co-authored-by: Jay Zhan <jayzhan211@gmail.com>
diff --git a/datafusion/core/benches/aggregate_query_sql.rs b/datafusion/core/benches/aggregate_query_sql.rs
@@ -158,7 +158,7 @@ fn criterion_benchmark(c: &mut Criterion) {
             query(
                 ctx.clone(),
                 &rt,
-                "SELECT utf8, approx_percentile_cont(u64_wide, 0.5, 2500)  \
+                "SELECT utf8, approx_percentile_cont(0.5, 2500) WITHIN GROUP (ORDER BY u64_wide)  \
                  FROM t GROUP BY utf8",
             )
         })
@@ -169,7 +169,7 @@ fn criterion_benchmark(c: &mut Criterion) {
             query(
                 ctx.clone(),
                 &rt,
-                "SELECT utf8, approx_percentile_cont(f32, 0.5, 2500)  \
+                "SELECT utf8, approx_percentile_cont(0.5, 2500) WITHIN GROUP (ORDER BY f32)  \
                  FROM t GROUP BY utf8",
             )
         })
diff --git a/datafusion/core/tests/dataframe/dataframe_functions.rs b/datafusion/core/tests/dataframe/dataframe_functions.rs
@@ -384,19 +384,34 @@ async fn test_fn_approx_median() -> Result<()> {
 
 #[tokio::test]
 async fn test_fn_approx_percentile_cont() -> Result<()> {
-    let expr = approx_percentile_cont(col("b"), lit(0.5), None);
+    let expr = approx_percentile_cont(col("b").sort(true, false), lit(0.5), None);
 
     let df = create_test_table().await?;
     let batches = df.aggregate(vec![], vec![expr]).unwrap().collect().await?;
 
     assert_snapshot!(
         batches_to_string(&batches),
         @r"
-    +---------------------------------------------+
-    | approx_percentile_cont(test.b,Float64(0.5)) |
-    +---------------------------------------------+
-    | 10                                          |
-    +---------------------------------------------+
+    +---------------------------------------------------------------------------+
+    | approx_percentile_cont(Float64(0.5)) WITHIN GROUP [test.b ASC NULLS LAST] |
+    +---------------------------------------------------------------------------+
+    | 10                                                                        |
+    +---------------------------------------------------------------------------+
+    ");
+
+    let expr = approx_percentile_cont(col("b").sort(false, false), lit(0.1), None);
+
+    let df = create_test_table().await?;
+    let batches = df.aggregate(vec![], vec![expr]).unwrap().collect().await?;
+
+    assert_snapshot!(
+        batches_to_string(&batches),
+        @r"
+    +----------------------------------------------------------------------------+
+    | approx_percentile_cont(Float64(0.1)) WITHIN GROUP [test.b DESC NULLS LAST] |
+    +----------------------------------------------------------------------------+
+    | 100                                                                        |
+    +----------------------------------------------------------------------------+
     ");
 
     // the arg2 parameter is a complex expr, but it can be evaluated to the literal value
@@ -405,35 +420,71 @@ async fn test_fn_approx_percentile_cont() -> Result<()> {
         None::<&str>,
         "arg_2".to_string(),
     ));
-    let expr = approx_percentile_cont(col("b"), alias_expr, None);
+    let expr = approx_percentile_cont(col("b").sort(true, false), alias_expr, None);
     let df = create_test_table().await?;
     let batches = df.aggregate(vec![], vec![expr]).unwrap().collect().await?;
 
     assert_snapshot!(
         batches_to_string(&batches),
         @r"
-    +--------------------------------------+
-    | approx_percentile_cont(test.b,arg_2) |
-    +--------------------------------------+
-    | 10                                   |
-    +--------------------------------------+
+    +--------------------------------------------------------------------+
+    | approx_percentile_cont(arg_2) WITHIN GROUP [test.b ASC NULLS LAST] |
+    +--------------------------------------------------------------------+
+    | 10                                                                 |
+    +--------------------------------------------------------------------+
+    "
+    );
+
+    let alias_expr = Expr::Alias(Alias::new(
+        cast(lit(0.1), DataType::Float32),
+        None::<&str>,
+        "arg_2".to_string(),
+    ));
+    let expr = approx_percentile_cont(col("b").sort(false, false), alias_expr, None);
+    let df = create_test_table().await?;
+    let batches = df.aggregate(vec![], vec![expr]).unwrap().collect().await?;
+
+    assert_snapshot!(
+        batches_to_string(&batches),
+        @r"
+    +---------------------------------------------------------------------+
+    | approx_percentile_cont(arg_2) WITHIN GROUP [test.b DESC NULLS LAST] |
+    +---------------------------------------------------------------------+
+    | 100                                                                 |
+    +---------------------------------------------------------------------+
     "
     );
 
     // with number of centroids set
-    let expr = approx_percentile_cont(col("b"), lit(0.5), Some(lit(2)));
+    let expr = approx_percentile_cont(col("b").sort(true, false), lit(0.5), Some(lit(2)));
+
+    let df = create_test_table().await?;
+    let batches = df.aggregate(vec![], vec![expr]).unwrap().collect().await?;
+
+    assert_snapshot!(
+        batches_to_string(&batches),
+        @r"
+    +------------------------------------------------------------------------------------+
+    | approx_percentile_cont(Float64(0.5),Int32(2)) WITHIN GROUP [test.b ASC NULLS LAST] |
+    +------------------------------------------------------------------------------------+
+    | 30                                                                                 |
+    +------------------------------------------------------------------------------------+
+    ");
+
+    let expr =
+        approx_percentile_cont(col("b").sort(false, false), lit(0.1), Some(lit(2)));
 
     let df = create_test_table().await?;
     let batches = df.aggregate(vec![], vec![expr]).unwrap().collect().await?;
 
     assert_snapshot!(
         batches_to_string(&batches),
         @r"
-    +------------------------------------------------------+
-    | approx_percentile_cont(test.b,Float64(0.5),Int32(2)) |
-    +------------------------------------------------------+
-    | 30                                                   |
-    +------------------------------------------------------+
+    +-------------------------------------------------------------------------------------+
+    | approx_percentile_cont(Float64(0.1),Int32(2)) WITHIN GROUP [test.b DESC NULLS LAST] |
+    +-------------------------------------------------------------------------------------+
+    | 69                                                                                  |
+    +-------------------------------------------------------------------------------------+
     ");
 
     Ok(())
diff --git a/datafusion/expr/src/udaf.rs b/datafusion/expr/src/udaf.rs
@@ -315,6 +315,16 @@ impl AggregateUDF {
         self.inner.default_value(data_type)
     }
 
+    /// See [`AggregateUDFImpl::supports_null_handling_clause`] for more details.
+    pub fn supports_null_handling_clause(&self) -> bool {
+        self.inner.supports_null_handling_clause()
+    }
+
+    /// See [`AggregateUDFImpl::is_ordered_set_aggregate`] for more details.
+    pub fn is_ordered_set_aggregate(&self) -> bool {
+        self.inner.is_ordered_set_aggregate()
+    }
+
     /// Returns the documentation for this Aggregate UDF.
     ///
     /// Documentation can be accessed programmatically as well as
@@ -432,6 +442,14 @@ pub trait AggregateUDFImpl: Debug + Send + Sync {
             null_treatment,
         } = params;
 
+        // Exclude the first argument (the column) of an ordered-set aggregate function,
+        // because the WITHIN GROUP clause already shows it in the schema name.
+        let args = if self.is_ordered_set_aggregate() {
+            &args[1..]
+        } else {
+            &args[..]
+        };
+
         let mut schema_name = String::new();
 
         schema_name.write_fmt(format_args!(
@@ -450,8 +468,14 @@ pub trait AggregateUDFImpl: Debug + Send + Sync {
         };
 
         if let Some(order_by) = order_by {
+            let clause = match self.is_ordered_set_aggregate() {
+                true => "WITHIN GROUP",
+                false => "ORDER BY",
+            };
+
             schema_name.write_fmt(format_args!(
-                " ORDER BY [{}]",
+                " {} [{}]",
+                clause,
                 schema_name_from_sorts(order_by)?
             ))?;
         };
@@ -891,6 +915,18 @@ pub trait AggregateUDFImpl: Debug + Send + Sync {
         ScalarValue::try_from(data_type)
     }
 
+    /// Returns `true` if this function supports the `[IGNORE NULLS | RESPECT NULLS]`
+    /// clause, and `false` otherwise. The default implementation returns `true`.
+    fn supports_null_handling_clause(&self) -> bool {
+        true
+    }
+
+    /// Returns `true` if this function is an ordered-set aggregate function, and
+    /// `false` otherwise. The default implementation returns `false`.
+    fn is_ordered_set_aggregate(&self) -> bool {
+        false
+    }
+
     /// Returns the documentation for this Aggregate UDF.
     ///
     /// Documentation can be accessed programmatically as well as
diff --git a/datafusion/functions-aggregate/src/approx_median.rs b/datafusion/functions-aggregate/src/approx_median.rs
@@ -45,7 +45,7 @@ make_udaf_expr_and_func!(
 /// APPROX_MEDIAN aggregate expression
 #[user_doc(
     doc_section(label = "Approximate Functions"),
-    description = "Returns the approximate median (50th percentile) of input values. It is an alias of `approx_percentile_cont(x, 0.5)`.",
+    description = "Returns the approximate median (50th percentile) of input values. It is an alias of `approx_percentile_cont(0.5) WITHIN GROUP (ORDER BY x)`.",
     syntax_example = "approx_median(expression)",
     sql_example = r#"```sql
 > SELECT approx_median(column_name) FROM table_name;
diff --git a/datafusion/functions-aggregate/src/approx_percentile_cont.rs b/datafusion/functions-aggregate/src/approx_percentile_cont.rs
@@ -34,6 +34,7 @@ use datafusion_common::{
     downcast_value, internal_err, not_impl_datafusion_err, not_impl_err, plan_err,
     Result, ScalarValue,
 };
+use datafusion_expr::expr::{AggregateFunction, Sort};
 use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs};
 use datafusion_expr::type_coercion::aggregates::{INTEGERS, NUMERICS};
 use datafusion_expr::utils::format_state_name;
@@ -51,29 +52,39 @@ create_func!(ApproxPercentileCont, approx_percentile_cont_udaf);
 
 /// Computes the approximate percentile continuous of a set of numbers
 pub fn approx_percentile_cont(
-    expression: Expr,
+    order_by: Sort,
     percentile: Expr,
     centroids: Option<Expr>,
 ) -> Expr {
+    let expr = order_by.expr.clone();
+
     let args = if let Some(centroids) = centroids {
-        vec![expression, percentile, centroids]
+        vec![expr, percentile, centroids]
     } else {
-        vec![expression, percentile]
+        vec![expr, percentile]
     };
-    approx_percentile_cont_udaf().call(args)
+
+    Expr::AggregateFunction(AggregateFunction::new_udf(
+        approx_percentile_cont_udaf(),
+        args,
+        false,
+        None,
+        Some(vec![order_by]),
+        None,
+    ))
 }
 
 #[user_doc(
     doc_section(label = "Approximate Functions"),
     description = "Returns the approximate percentile of input values using the t-digest algorithm.",
-    syntax_example = "approx_percentile_cont(expression, percentile, centroids)",
+    syntax_example = "approx_percentile_cont(percentile, centroids) WITHIN GROUP (ORDER BY expression)",
     sql_example = r#"```sql
-> SELECT approx_percentile_cont(column_name, 0.75, 100) FROM table_name;
-+-------------------------------------------------+
-| approx_percentile_cont(column_name, 0.75, 100)  |
-+-------------------------------------------------+
-| 65.0                                            |
-+-------------------------------------------------+
+> SELECT approx_percentile_cont(0.75, 100) WITHIN GROUP (ORDER BY column_name) FROM table_name;
++-----------------------------------------------------------------------+
+| approx_percentile_cont(0.75, 100) WITHIN GROUP (ORDER BY column_name) |
++-----------------------------------------------------------------------+
+| 65.0                                                                  |
++-----------------------------------------------------------------------+
 ```"#,
     standard_argument(name = "expression",),
     argument(
@@ -130,6 +141,19 @@ impl ApproxPercentileCont {
         args: AccumulatorArgs,
     ) -> Result<ApproxPercentileAccumulator> {
         let percentile = validate_input_percentile_expr(&args.exprs[1])?;
+
+        let is_descending = args
+            .ordering_req
+            .first()
+            .map(|sort_expr| sort_expr.options.descending)
+            .unwrap_or(false);
+
+        let percentile = if is_descending {
+            1.0 - percentile
+        } else {
+            percentile
+        };
+
         let tdigest_max_size = if args.exprs.len() == 3 {
             Some(validate_input_max_size_expr(&args.exprs[2])?)
         } else {
@@ -292,6 +316,14 @@ impl AggregateUDFImpl for ApproxPercentileCont {
         Ok(arg_types[0].clone())
     }
 
+    fn supports_null_handling_clause(&self) -> bool {
+        false
+    }
+
+    fn is_ordered_set_aggregate(&self) -> bool {
+        true
+    }
+
     fn documentation(&self) -> Option<&Documentation> {
         self.doc()
     }
diff --git a/datafusion/functions-aggregate/src/approx_percentile_cont_with_weight.rs b/datafusion/functions-aggregate/src/approx_percentile_cont_with_weight.rs
@@ -52,14 +52,14 @@ make_udaf_expr_and_func!(
 #[user_doc(
     doc_section(label = "Approximate Functions"),
     description = "Returns the weighted approximate percentile of input values using the t-digest algorithm.",
-    syntax_example = "approx_percentile_cont_with_weight(expression, weight, percentile)",
+    syntax_example = "approx_percentile_cont_with_weight(weight, percentile) WITHIN GROUP (ORDER BY expression)",
     sql_example = r#"```sql
-> SELECT approx_percentile_cont_with_weight(column_name, weight_column, 0.90) FROM table_name;
-+----------------------------------------------------------------------+
-| approx_percentile_cont_with_weight(column_name, weight_column, 0.90) |
-+----------------------------------------------------------------------+
-| 78.5                                                                 |
-+----------------------------------------------------------------------+
+> SELECT approx_percentile_cont_with_weight(weight_column, 0.90) WITHIN GROUP (ORDER BY column_name) FROM table_name;
++---------------------------------------------------------------------------------------------+
+| approx_percentile_cont_with_weight(weight_column, 0.90) WITHIN GROUP (ORDER BY column_name) |
++---------------------------------------------------------------------------------------------+
+| 78.5                                                                                        |
++---------------------------------------------------------------------------------------------+
 ```"#,
     standard_argument(name = "expression", prefix = "The"),
     argument(
@@ -178,6 +178,14 @@ impl AggregateUDFImpl for ApproxPercentileContWithWeight {
         self.approx_percentile_cont.state_fields(args)
     }
 
+    fn supports_null_handling_clause(&self) -> bool {
+        false
+    }
+
+    fn is_ordered_set_aggregate(&self) -> bool {
+        true
+    }
+
     fn documentation(&self) -> Option<&Documentation> {
         self.doc()
     }
diff --git a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs
@@ -973,8 +973,8 @@ async fn roundtrip_expr_api() -> Result<()> {
         stddev_pop(lit(2.2)),
         approx_distinct(lit(2)),
         approx_median(lit(2)),
-        approx_percentile_cont(lit(2), lit(0.5), None),
-        approx_percentile_cont(lit(2), lit(0.5), Some(lit(50))),
+        approx_percentile_cont(lit(2).sort(true, false), lit(0.5), None),
+        approx_percentile_cont(lit(2).sort(true, false), lit(0.5), Some(lit(50))),
         approx_percentile_cont_with_weight(lit(2), lit(1), lit(0.5)),
         grouping(lit(1)),
         bit_and(lit(2)),
diff --git a/datafusion/proto/tests/cases/roundtrip_physical_plan.rs b/datafusion/proto/tests/cases/roundtrip_physical_plan.rs
@@ -504,7 +504,7 @@ fn rountrip_aggregate_with_approx_pencentile_cont() -> Result<()> {
         vec![col("b", &schema)?, lit(0.5)],
     )
     .schema(Arc::clone(&schema))
-    .alias("APPROX_PERCENTILE_CONT(b, 0.5)")
+    .alias("APPROX_PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY b)")
     .build()
     .map(Arc::new)?];
 
diff --git a/datafusion/sql/src/expr/function.rs b/datafusion/sql/src/expr/function.rs
diff --git a/datafusion/sql/src/unparser/expr.rs b/datafusion/sql/src/unparser/expr.rs
diff --git a/datafusion/sqllogictest/test_files/aggregate.slt b/datafusion/sqllogictest/test_files/aggregate.slt
diff --git a/docs/source/user-guide/sql/aggregate_functions.md b/docs/source/user-guide/sql/aggregate_functions.md

Original file line number	Diff line number	Diff line change
`@@ -504,7 +504,7 @@ fn rountrip_aggregate_with_approx_pencentile_cont() -> Result<()> {`
`504`	`504`	`vec![col("b", &schema)?, lit(0.5)],`
`505`	`505`	`)`
`506`	`506`	`.schema(Arc::clone(&schema))`
`507`		`- .alias("APPROX_PERCENTILE_CONT(b, 0.5)")`
	`507`	`+ .alias("APPROX_PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY b)")`
`508`	`508`	`.build()`
`509`	`509`	`.map(Arc::new)?];`
`510`	`510`