diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index 177e0353c52a..e340b0604e98 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -879,8 +879,8 @@ setMethod("factorial", #' #' The function by default returns the first values it sees. It will return the first non-missing #' value it sees when na.rm is set to true. If all values are missing, then NA is returned. -#' Note: the function is non-deterministic because its results depends on order of rows which -#' may be non-deterministic after a shuffle. +#' Note: the function is non-deterministic because its results depend on the order of the rows +#' which may be non-deterministic after a shuffle. #' #' @param na.rm a logical value indicating whether NA values should be stripped #' before the computation proceeds. @@ -1024,8 +1024,8 @@ setMethod("kurtosis", #' #' The function by default returns the last values it sees. It will return the last non-missing #' value it sees when na.rm is set to true. If all values are missing, then NA is returned. -#' Note: the function is non-deterministic because its results depends on order of rows which -#' may be non-deterministic after a shuffle. +#' Note: the function is non-deterministic because its results depend on the order of the rows +#' which may be non-deterministic after a shuffle. #' #' @param x column to compute on. #' @param na.rm a logical value indicating whether NA values should be stripped @@ -3706,7 +3706,7 @@ setMethod("create_map", #' @details #' \code{collect_list}: Creates a list of objects with duplicates. #' Note: the function is non-deterministic because the order of collected results depends -#' on order of rows which may be non-deterministic after a shuffle. +#' on the order of the rows which may be non-deterministic after a shuffle. 
#' #' @rdname column_aggregate_functions #' @aliases collect_list collect_list,Column-method @@ -3727,7 +3727,7 @@ setMethod("collect_list", #' @details #' \code{collect_set}: Creates a list of objects with duplicate elements eliminated. #' Note: the function is non-deterministic because the order of collected results depends -#' on order of rows which may be non-deterministic after a shuffle. +#' on the order of the rows which may be non-deterministic after a shuffle. #' #' @rdname column_aggregate_functions #' @aliases collect_set collect_set,Column-method diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index 2cd91ec2b1ae..a8d4732237e4 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -196,7 +196,7 @@ def _options_to_str(options): Aggregate function: returns a list of objects with duplicates. .. note:: The function is non-deterministic because the order of collected results depends - on order of rows which may be non-deterministic after a shuffle. + on the order of the rows which may be non-deterministic after a shuffle. >>> df2 = spark.createDataFrame([(2,), (5,), (5,)], ('age',)) >>> df2.agg(collect_list('age')).collect() @@ -206,7 +206,7 @@ def _options_to_str(options): Aggregate function: returns a set of objects with duplicate elements eliminated. .. note:: The function is non-deterministic because the order of collected results depends - on order of rows which may be non-deterministic after a shuffle. + on the order of the rows which may be non-deterministic after a shuffle. >>> df2 = spark.createDataFrame([(2,), (5,), (5,)], ('age',)) >>> df2.agg(collect_set('age')).collect() @@ -444,8 +444,8 @@ def first(col, ignorenulls=False): The function by default returns the first values it sees. It will return the first non-null value it sees when ignoreNulls is set to true. If all values are null, then null is returned. - .. 
note:: The function is non-deterministic because its results depends on order of rows which - may be non-deterministic after a shuffle. + .. note:: The function is non-deterministic because its results depend on the order of the + rows which may be non-deterministic after a shuffle. """ sc = SparkContext._active_spark_context jc = sc._jvm.functions.first(_to_java_column(col), ignorenulls) @@ -535,8 +535,8 @@ def last(col, ignorenulls=False): The function by default returns the last values it sees. It will return the last non-null value it sees when ignoreNulls is set to true. If all values are null, then null is returned. - .. note:: The function is non-deterministic because its results depends on order of rows - which may be non-deterministic after a shuffle. + .. note:: The function is non-deterministic because its results depend on the order of the + rows which may be non-deterministic after a shuffle. """ sc = SparkContext._active_spark_context jc = sc._jvm.functions.last(_to_java_column(col), ignorenulls) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/First.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/First.scala index 8de866ed9fb1..210acf33fc43 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/First.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/First.scala @@ -43,6 +43,10 @@ import org.apache.spark.sql.types._ > SELECT _FUNC_(col, true) FROM VALUES (NULL), (5), (20) AS tab(col); 5 """, + note = """ + The function is non-deterministic because its results depend on the order of the rows + which may be non-deterministic after a shuffle. 
+ """, since = "2.0.0") case class First(child: Expression, ignoreNullsExpr: Expression) extends DeclarativeAggregate with ExpectsInputTypes { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Last.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Last.scala index f8af0cd1f303..2c89a4b973a7 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Last.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Last.scala @@ -43,6 +43,10 @@ import org.apache.spark.sql.types._ > SELECT _FUNC_(col, true) FROM VALUES (10), (5), (NULL) AS tab(col); 5 """, + note = """ + The function is non-deterministic because its results depend on the order of the rows + which may be non-deterministic after a shuffle. + """, since = "2.0.0") case class Last(child: Expression, ignoreNullsExpr: Expression) extends DeclarativeAggregate with ExpectsInputTypes { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/collect.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/collect.scala index a302726af983..29f89989b496 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/collect.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/collect.scala @@ -92,6 +92,10 @@ abstract class Collect[T <: Growable[Any] with Iterable[Any]] extends TypedImper > SELECT _FUNC_(col) FROM VALUES (1), (2), (1) AS tab(col); [1,2,1] """, + note = """ + The function is non-deterministic because the order of collected results depends + on the order of the rows which may be non-deterministic after a shuffle. 
+ """, since = "2.0.0") case class CollectList( child: Expression, @@ -121,6 +125,10 @@ case class CollectList( > SELECT _FUNC_(col) FROM VALUES (1), (2), (1) AS tab(col); [1,2] """, + note = """ + The function is non-deterministic because the order of collected results depends + on the order of the rows which may be non-deterministic after a shuffle. + """, since = "2.0.0") case class CollectSet( child: Expression, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala index 9911972d0f1b..59dbe3e4b397 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala @@ -273,7 +273,7 @@ object functions { * Aggregate function: returns a list of objects with duplicates. * * @note The function is non-deterministic because the order of collected results depends - * on order of rows which may be non-deterministic after a shuffle. + * on the order of the rows which may be non-deterministic after a shuffle. * * @group agg_funcs * @since 1.6.0 @@ -284,7 +284,7 @@ object functions { * Aggregate function: returns a list of objects with duplicates. * * @note The function is non-deterministic because the order of collected results depends - * on order of rows which may be non-deterministic after a shuffle. + * on the order of the rows which may be non-deterministic after a shuffle. * * @group agg_funcs * @since 1.6.0 @@ -295,7 +295,7 @@ object functions { * Aggregate function: returns a set of objects with duplicate elements eliminated. * * @note The function is non-deterministic because the order of collected results depends - * on order of rows which may be non-deterministic after a shuffle. + * on the order of the rows which may be non-deterministic after a shuffle. 
* * @group agg_funcs * @since 1.6.0 @@ -306,7 +306,7 @@ object functions { * Aggregate function: returns a set of objects with duplicate elements eliminated. * * @note The function is non-deterministic because the order of collected results depends - * on order of rows which may be non-deterministic after a shuffle. + * on the order of the rows which may be non-deterministic after a shuffle. * * @group agg_funcs * @since 1.6.0 @@ -424,8 +424,8 @@ object functions { * The function by default returns the first values it sees. It will return the first non-null * value it sees when ignoreNulls is set to true. If all values are null, then null is returned. * - * @note The function is non-deterministic because its results depends on order of rows which - * may be non-deterministic after a shuffle. + * @note The function is non-deterministic because its results depend on the order of the rows + * which may be non-deterministic after a shuffle. * * @group agg_funcs * @since 2.0.0 @@ -440,8 +440,8 @@ object functions { * The function by default returns the first values it sees. It will return the first non-null * value it sees when ignoreNulls is set to true. If all values are null, then null is returned. * - * @note The function is non-deterministic because its results depends on order of rows which - * may be non-deterministic after a shuffle. + * @note The function is non-deterministic because its results depend on the order of the rows + * which may be non-deterministic after a shuffle. * * @group agg_funcs * @since 2.0.0 @@ -456,8 +456,8 @@ object functions { * The function by default returns the first values it sees. It will return the first non-null * value it sees when ignoreNulls is set to true. If all values are null, then null is returned. * - * @note The function is non-deterministic because its results depends on order of rows which - * may be non-deterministic after a shuffle. 
+ * @note The function is non-deterministic because its results depend on the order of the rows + * which may be non-deterministic after a shuffle. * * @group agg_funcs * @since 1.3.0 @@ -470,8 +470,8 @@ object functions { * The function by default returns the first values it sees. It will return the first non-null * value it sees when ignoreNulls is set to true. If all values are null, then null is returned. * - * @note The function is non-deterministic because its results depends on order of rows which - * may be non-deterministic after a shuffle. + * @note The function is non-deterministic because its results depend on the order of the rows + * which may be non-deterministic after a shuffle. * * @group agg_funcs * @since 1.3.0 @@ -549,8 +549,8 @@ object functions { * The function by default returns the last values it sees. It will return the last non-null * value it sees when ignoreNulls is set to true. If all values are null, then null is returned. * - * @note The function is non-deterministic because its results depends on order of rows which - * may be non-deterministic after a shuffle. + * @note The function is non-deterministic because its results depend on the order of the rows + * which may be non-deterministic after a shuffle. * * @group agg_funcs * @since 2.0.0 @@ -565,8 +565,8 @@ object functions { * The function by default returns the last values it sees. It will return the last non-null * value it sees when ignoreNulls is set to true. If all values are null, then null is returned. * - * @note The function is non-deterministic because its results depends on order of rows which - * may be non-deterministic after a shuffle. + * @note The function is non-deterministic because its results depend on the order of the rows + * which may be non-deterministic after a shuffle. * * @group agg_funcs * @since 2.0.0 @@ -581,8 +581,8 @@ object functions { * The function by default returns the last values it sees. 
It will return the last non-null * value it sees when ignoreNulls is set to true. If all values are null, then null is returned. * - * @note The function is non-deterministic because its results depends on order of rows which - * may be non-deterministic after a shuffle. + * @note The function is non-deterministic because its results depend on the order of the rows + * which may be non-deterministic after a shuffle. * * @group agg_funcs * @since 1.3.0 @@ -595,8 +595,8 @@ object functions { * The function by default returns the last values it sees. It will return the last non-null * value it sees when ignoreNulls is set to true. If all values are null, then null is returned. * - * @note The function is non-deterministic because its results depends on order of rows which - * may be non-deterministic after a shuffle. + * @note The function is non-deterministic because its results depend on the order of the rows + * which may be non-deterministic after a shuffle. * * @group agg_funcs * @since 1.3.0