diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index a7e337d3f9af..b91124f96a6f 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -1558,6 +1558,9 @@ setMethod("max", #' @details #' \code{max_by}: Returns the value associated with the maximum value of ord. #' +#' Note: The function is non-deterministic so the output order can be different +#' for those associated with the same values of `x`. +#' #' @rdname column_aggregate_functions #' @aliases max_by max_by,Column-method #' @note max_by since 3.3.0 @@ -1633,6 +1636,9 @@ setMethod("min", #' @details #' \code{min_by}: Returns the value associated with the minimum value of ord. #' +#' Note: The function is non-deterministic so the output order can be different +#' for those associated with the same values of `x`. +#' #' @rdname column_aggregate_functions #' @aliases min_by min_by,Column-method #' @note min_by since 3.3.0 diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala index eae239a25589..47f7266f3bf5 100644 --- a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala @@ -884,6 +884,10 @@ object functions { /** * Aggregate function: returns the value associated with the maximum value of ord. * + * @note + * The function is non-deterministic so the output order can be different for those associated + * with the same values of `e`. + * * @group agg_funcs * @since 3.4.0 */ @@ -932,6 +936,10 @@ object functions { /** * Aggregate function: returns the value associated with the minimum value of ord. * + * @note + * The function is non-deterministic so the output order can be different for those associated + * with the same values of `e`. 
+ * * @group agg_funcs * @since 3.4.0 */ diff --git a/python/pyspark/sql/functions/builtin.py b/python/pyspark/sql/functions/builtin.py index c24b9e4378a6..f89c8b2e64a3 100644 --- a/python/pyspark/sql/functions/builtin.py +++ b/python/pyspark/sql/functions/builtin.py @@ -1271,6 +1271,11 @@ def max_by(col: "ColumnOrName", ord: "ColumnOrName") -> Column: .. versionchanged:: 3.4.0 Supports Spark Connect. + Notes + ----- + The function is non-deterministic so the output order can be different for those + associated with the same values of `col`. + Parameters ---------- col : :class:`~pyspark.sql.Column` or str @@ -1352,6 +1357,11 @@ def min_by(col: "ColumnOrName", ord: "ColumnOrName") -> Column: .. versionchanged:: 3.4.0 Supports Spark Connect. + Notes + ----- + The function is non-deterministic so the output order can be different for those + associated with the same values of `col`. + Parameters ---------- col : :class:`~pyspark.sql.Column` or str diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/MaxByAndMinBy.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/MaxByAndMinBy.scala index 56941c9de451..b33142ed29cc 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/MaxByAndMinBy.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/MaxByAndMinBy.scala @@ -99,6 +99,10 @@ abstract class MaxMinBy extends DeclarativeAggregate with BinaryLike[Expression] > SELECT _FUNC_(x, y) FROM VALUES ('a', 10), ('b', 50), ('c', 20) AS tab(x, y); b """, + note = """ + The function is non-deterministic so the output order can be different for + those associated with the same values of `x`. 
+ """, group = "agg_funcs", since = "3.0.0") case class MaxBy(valueExpr: Expression, orderingExpr: Expression) extends MaxMinBy { @@ -122,6 +126,10 @@ case class MaxBy(valueExpr: Expression, orderingExpr: Expression) extends MaxMin > SELECT _FUNC_(x, y) FROM VALUES ('a', 10), ('b', 50), ('c', 20) AS tab(x, y); a """, + note = """ + The function is non-deterministic so the output order can be different for + those associated with the same values of `x`. + """, group = "agg_funcs", since = "3.0.0") case class MinBy(valueExpr: Expression, orderingExpr: Expression) extends MaxMinBy { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala index 882918eb78c7..5b4d27fc65d0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala @@ -902,6 +902,9 @@ object functions { /** * Aggregate function: returns the value associated with the maximum value of ord. * + * @note The function is non-deterministic so the output order can be different for + * those associated with the same values of `e`. + * * @group agg_funcs * @since 3.3.0 */ @@ -952,6 +955,9 @@ object functions { /** * Aggregate function: returns the value associated with the minimum value of ord. * + * @note The function is non-deterministic so the output order can be different for + * those associated with the same values of `e`. + * * @group agg_funcs * @since 3.3.0 */