From ab6e4f1651ec09e576b8dcf8a611c9f2ea2169a5 Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Thu, 25 May 2017 23:48:23 -0700 Subject: [PATCH 01/11] consolidated doc change for SQL aggregate functions --- R/pkg/R/functions.R | 398 +++++++++++++++++--------------------------- R/pkg/R/generics.R | 60 ++++--- 2 files changed, 194 insertions(+), 264 deletions(-) diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index 06a90192bb12..a9a341c22715 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -18,6 +18,21 @@ #' @include generics.R column.R NULL +#' Aggregate functions for Column operations +#' +#' Aggregate functions defined for \code{Column}. +#' +#' @param x Column to compute on. +#' @param ... additional argument(s). +#' @name column_aggregate_functions +#' @rdname column_aggregate_functions +#' @family aggregate functions +#' @examples +#' \dontrun{ +#' # Dataframe used throughout this doc +#' df <- createDataFrame(cbind(model = rownames(mtcars), mtcars))} +NULL + #' lit #' #' A new \linkS4class{Column} is created to represent the literal value. @@ -85,17 +100,20 @@ setMethod("acos", column(jc) }) -#' Returns the approximate number of distinct items in a group +#' @section Details: +#' \code{approxCountDistinct}: Returns the approximate number of distinct items in a group. #' -#' Returns the approximate number of distinct items in a group. This is a column -#' aggregate function. -#' -#' @rdname approxCountDistinct -#' @name approxCountDistinct -#' @return the approximate number of distinct items in a group. 
+#' @rdname column_aggregate_functions #' @export -#' @aliases approxCountDistinct,Column-method -#' @examples \dontrun{approxCountDistinct(df$c)} +#' @aliases approxCountDistinct approxCountDistinct,Column-method +#' @examples +#' +#' \dontrun{ +#' head(select(df, approxCountDistinct(df$gear))) +#' head(select(df, approxCountDistinct(df$gear, 0.02))) +#' head(select(df, countDistinct(df$gear))) +#' head(select(df, n_distinct(df$gear))) +#' head(distinct(select(df, "gear")))} #' @note approxCountDistinct(Column) since 1.4.0 setMethod("approxCountDistinct", signature(x = "Column"), @@ -166,16 +184,24 @@ setMethod("atan", column(jc) }) -#' avg -#' -#' Aggregate function: returns the average of the values in a group. +#' @section Details: +#' \code{avg}: Returns the average of the values in a group. #' -#' @rdname avg -#' @name avg -#' @family aggregate functions +#' @rdname column_aggregate_functions #' @export -#' @aliases avg,Column-method -#' @examples \dontrun{avg(df$c)} +#' @aliases avg avg,Column-method +#' @examples +#' +#' \dontrun{ +#' head(select(df, avg(df$mpg), mean(df$mpg), min(df$wt), max(df$qsec))) +#' +#' # metrics by num of cylinders +#' tmp <- agg(groupBy(df, "cyl"), avg(df$mpg), avg(df$hp), avg(df$wt), avg(df$qsec)) +#' head(orderBy(tmp, "cyl")) +#' +#' # car with the max mpg +#' mpg_max <- as.numeric(collect(agg(df, max(df$mpg)))) +#' head(where(df, df$mpg == mpg_max))} #' @note avg since 1.4.0 setMethod("avg", signature(x = "Column"), @@ -823,18 +849,16 @@ setMethod("isnan", column(jc) }) -#' kurtosis +#' @section Details: +#' \code{kurtosis}: Returns the kurtosis of the values in a group. #' -#' Aggregate function: returns the kurtosis of the values in a group. -#' -#' @param x Column to compute on. 
-#' -#' @rdname kurtosis -#' @name kurtosis -#' @aliases kurtosis,Column-method -#' @family aggregate functions +#' @rdname column_aggregate_functions +#' @aliases kurtosis kurtosis,Column-method #' @export -#' @examples \dontrun{kurtosis(df$c)} +#' @examples +#' +#' \dontrun{ +#' head(select(df, mean(df$mpg), sd(df$mpg), skewness(df$mpg), kurtosis(df$mpg)))} #' @note kurtosis since 1.6.0 setMethod("kurtosis", signature(x = "Column"), @@ -1040,18 +1064,11 @@ setMethod("ltrim", column(jc) }) -#' max -#' -#' Aggregate function: returns the maximum value of the expression in a group. +#' @section Details: +#' \code{max}: Returns the maximum value of the expression in a group. #' -#' @param x Column to compute on. -#' -#' @rdname max -#' @name max -#' @family aggregate functions -#' @aliases max,Column-method -#' @export -#' @examples \dontrun{max(df$c)} +#' @rdname column_aggregate_functions +#' @aliases max max,Column-method #' @note max since 1.5.0 setMethod("max", signature(x = "Column"), @@ -1081,19 +1098,12 @@ setMethod("md5", column(jc) }) -#' mean -#' -#' Aggregate function: returns the average of the values in a group. -#' Alias for avg. -#' -#' @param x Column to compute on. +#' @section Details: +#' \code{mean}: Returns the average of the values in a group. Alias for avg. #' -#' @rdname mean -#' @name mean -#' @family aggregate functions -#' @aliases mean,Column-method +#' @rdname column_aggregate_functions +#' @aliases mean mean,Column-method #' @export -#' @examples \dontrun{mean(df$c)} #' @note mean since 1.5.0 setMethod("mean", signature(x = "Column"), @@ -1102,18 +1112,12 @@ setMethod("mean", column(jc) }) -#' min -#' -#' Aggregate function: returns the minimum value of the expression in a group. -#' -#' @param x Column to compute on. +#' @section Details: +#' \code{min}: Returns the minimum value of the expression in a group. 
#' -#' @rdname min -#' @name min -#' @aliases min,Column-method -#' @family aggregate functions +#' @rdname column_aggregate_functions +#' @aliases min min,Column-method #' @export -#' @examples \dontrun{min(df$c)} #' @note min since 1.5.0 setMethod("min", signature(x = "Column"), @@ -1338,24 +1342,17 @@ setMethod("rtrim", column(jc) }) -#' sd -#' -#' Aggregate function: alias for \link{stddev_samp} + +#' @section Details: +#' \code{sd}: Alias for \code{stddev_samp}. #' -#' @param x Column to compute on. -#' @param na.rm currently not used. -#' @rdname sd -#' @name sd -#' @family aggregate functions -#' @aliases sd,Column-method -#' @seealso \link{stddev_pop}, \link{stddev_samp} +#' @rdname column_aggregate_functions +#' @aliases sd sd,Column-method #' @export #' @examples -#'\dontrun{ -#'stddev(df$c) -#'select(df, stddev(df$age)) -#'agg(df, sd(df$age)) -#'} +#' +#' \dontrun{ +#' head(select(df, sd(df$mpg), stddev(df$mpg), stddev_pop(df$wt), stddev_samp(df$qsec)))} #' @note sd since 1.6.0 setMethod("sd", signature(x = "Column"), @@ -1465,18 +1462,12 @@ setMethod("sinh", column(jc) }) -#' skewness -#' -#' Aggregate function: returns the skewness of the values in a group. -#' -#' @param x Column to compute on. +#' @section Details: +#' \code{skewness}: Returns the skewness of the values in a group. #' -#' @rdname skewness -#' @name skewness -#' @family aggregate functions -#' @aliases skewness,Column-method +#' @rdname column_aggregate_functions +#' @aliases skewness skewness,Column-method #' @export -#' @examples \dontrun{skewness(df$c)} #' @note skewness since 1.6.0 setMethod("skewness", signature(x = "Column"), @@ -1527,9 +1518,11 @@ setMethod("spark_partition_id", column(jc) }) -#' @rdname sd -#' @aliases stddev,Column-method -#' @name stddev +#' @section Details: +#' \code{stddev}: Alias for \code{std_dev}. 
+#' +#' @rdname column_aggregate_functions +#' @aliases stddev stddev,Column-method #' @note stddev since 1.6.0 setMethod("stddev", signature(x = "Column"), @@ -1538,19 +1531,12 @@ setMethod("stddev", column(jc) }) -#' stddev_pop +#' @section Details: +#' \code{stddev_pop}: Returns the population standard deviation of the expression in a group. #' -#' Aggregate function: returns the population standard deviation of the expression in a group. -#' -#' @param x Column to compute on. -#' -#' @rdname stddev_pop -#' @name stddev_pop -#' @family aggregate functions -#' @aliases stddev_pop,Column-method -#' @seealso \link{sd}, \link{stddev_samp} +#' @rdname column_aggregate_functions +#' @aliases stddev_pop stddev_pop,Column-method #' @export -#' @examples \dontrun{stddev_pop(df$c)} #' @note stddev_pop since 1.6.0 setMethod("stddev_pop", signature(x = "Column"), @@ -1559,19 +1545,12 @@ setMethod("stddev_pop", column(jc) }) -#' stddev_samp +#' @section Details: +#' \code{stddev_samp}: Returns the unbiased sample standard deviation of the expression in a group. #' -#' Aggregate function: returns the unbiased sample standard deviation of the expression in a group. -#' -#' @param x Column to compute on. -#' -#' @rdname stddev_samp -#' @name stddev_samp -#' @family aggregate functions -#' @aliases stddev_samp,Column-method -#' @seealso \link{stddev_pop}, \link{sd} +#' @rdname column_aggregate_functions +#' @aliases stddev_samp stddev_samp,Column-method #' @export -#' @examples \dontrun{stddev_samp(df$c)} #' @note stddev_samp since 1.6.0 setMethod("stddev_samp", signature(x = "Column"), @@ -1630,18 +1609,12 @@ setMethod("sqrt", column(jc) }) -#' sum -#' -#' Aggregate function: returns the sum of all values in the expression. +#' @section Details: +#' \code{sum}: Returns the sum of all values in the expression. #' -#' @param x Column to compute on. 
-#' -#' @rdname sum -#' @name sum -#' @family aggregate functions -#' @aliases sum,Column-method +#' @rdname column_aggregate_functions +#' @aliases sum sum,Column-method #' @export -#' @examples \dontrun{sum(df$c)} #' @note sum since 1.5.0 setMethod("sum", signature(x = "Column"), @@ -1650,18 +1623,17 @@ setMethod("sum", column(jc) }) -#' sumDistinct -#' -#' Aggregate function: returns the sum of distinct values in the expression. +#' @section Details: +#' \code{sumDistinct}: Returns the sum of distinct values in the expression. #' -#' @param x Column to compute on. -#' -#' @rdname sumDistinct -#' @name sumDistinct -#' @family aggregate functions -#' @aliases sumDistinct,Column-method +#' @rdname column_aggregate_functions +#' @aliases sumDistinct sumDistinct,Column-method #' @export -#' @examples \dontrun{sumDistinct(df$c)} +#' @examples +#' +#' \dontrun{ +#' head(select(df, sumDistinct(df$gear))) +#' head(distinct(select(df, "gear")))} #' @note sumDistinct since 1.4.0 setMethod("sumDistinct", signature(x = "Column"), @@ -1952,24 +1924,16 @@ setMethod("upper", column(jc) }) -#' var +#' @section Details: +#' \code{var}: Alias for \code{var_samp}. #' -#' Aggregate function: alias for \link{var_samp}. -#' -#' @param x a Column to compute on. -#' @param y,na.rm,use currently not used. 
-#' @rdname var -#' @name var -#' @family aggregate functions -#' @aliases var,Column-method -#' @seealso \link{var_pop}, \link{var_samp} +#' @rdname column_aggregate_functions +#' @aliases var var,Column-method #' @export #' @examples +#' #'\dontrun{ -#'variance(df$c) -#'select(df, var_pop(df$age)) -#'agg(df, var(df$age)) -#'} +#'head(agg(df, var(df$mpg), variance(df$mpg), var_pop(df$mpg), var_samp(df$mpg)))} #' @note var since 1.6.0 setMethod("var", signature(x = "Column"), @@ -1978,9 +1942,9 @@ setMethod("var", var_samp(x) }) -#' @rdname var -#' @aliases variance,Column-method -#' @name variance +#' @rdname column_aggregate_functions +#' @aliases variance variance,Column-method +#' @export #' @note variance since 1.6.0 setMethod("variance", signature(x = "Column"), @@ -1989,19 +1953,12 @@ setMethod("variance", column(jc) }) -#' var_pop -#' -#' Aggregate function: returns the population variance of the values in a group. +#' @section Details: +#' \code{var_pop}: Returns the population variance of the values in a group. #' -#' @param x Column to compute on. -#' -#' @rdname var_pop -#' @name var_pop -#' @family aggregate functions -#' @aliases var_pop,Column-method -#' @seealso \link{var}, \link{var_samp} +#' @rdname column_aggregate_functions +#' @aliases var_pop var_pop,Column-method #' @export -#' @examples \dontrun{var_pop(df$c)} #' @note var_pop since 1.5.0 setMethod("var_pop", signature(x = "Column"), @@ -2010,19 +1967,12 @@ setMethod("var_pop", column(jc) }) -#' var_samp -#' -#' Aggregate function: returns the unbiased variance of the values in a group. +#' @section Details: +#' \code{var_samp}: Returns the unbiased variance of the values in a group. #' -#' @param x Column to compute on. 
-#' -#' @rdname var_samp -#' @name var_samp -#' @aliases var_samp,Column-method -#' @family aggregate functions -#' @seealso \link{var_pop}, \link{var} +#' @rdname column_aggregate_functions +#' @aliases var_samp var_samp,Column-method #' @export -#' @examples \dontrun{var_samp(df$c)} #' @note var_samp since 1.6.0 setMethod("var_samp", signature(x = "Column"), @@ -2235,17 +2185,11 @@ setMethod("pmod", signature(y = "Column"), column(jc) }) - -#' @rdname approxCountDistinct -#' @name approxCountDistinct -#' -#' @param x Column to compute on. #' @param rsd maximum estimation error allowed (default = 0.05) -#' @param ... further arguments to be passed to or from other methods. #' +#' @rdname column_aggregate_functions #' @aliases approxCountDistinct,Column-method #' @export -#' @examples \dontrun{approxCountDistinct(df$c, 0.02)} #' @note approxCountDistinct(Column, numeric) since 1.4.0 setMethod("approxCountDistinct", signature(x = "Column"), @@ -2254,18 +2198,12 @@ setMethod("approxCountDistinct", column(jc) }) -#' Count Distinct Values +#' @section Details: +#' \code{countDistinct}: Returns the number of distinct items in a group. #' -#' @param x Column to compute on -#' @param ... other columns -#' -#' @family aggregate functions -#' @rdname countDistinct -#' @name countDistinct -#' @aliases countDistinct,Column-method -#' @return the number of distinct items in a group. +#' @rdname column_aggregate_functions +#' @aliases countDistinct countDistinct,Column-method #' @export -#' @examples \dontrun{countDistinct(df$c)} #' @note countDistinct since 1.4.0 setMethod("countDistinct", signature(x = "Column"), @@ -2384,15 +2322,12 @@ setMethod("sign", signature(x = "Column"), signum(x) }) -#' n_distinct +#' @section Details: +#' \code{n_distinct}: Returns the number of distinct items in a group. #' -#' Aggregate function: returns the number of distinct items in a group. 
-#' -#' @rdname countDistinct -#' @name n_distinct -#' @aliases n_distinct,Column-method +#' @rdname column_aggregate_functions +#' @aliases n_distinct n_distinct,Column-method #' @export -#' @examples \dontrun{n_distinct(df$c)} #' @note n_distinct since 1.4.0 setMethod("n_distinct", signature(x = "Column"), function(x, ...) { @@ -3717,18 +3652,18 @@ setMethod("create_map", column(jc) }) -#' collect_list -#' -#' Creates a list of objects with duplicates. -#' -#' @param x Column to compute on +#' @section Details: +#' \code{collect_list}: Creates a list of objects with duplicates. #' -#' @rdname collect_list -#' @name collect_list -#' @family aggregate functions -#' @aliases collect_list,Column-method +#' @rdname column_aggregate_functions +#' @aliases collect_list collect_list,Column-method #' @export -#' @examples \dontrun{collect_list(df$x)} +#' @examples +#' +#' \dontrun{ +#' df2 = df[df$mpg > 20, ] +#' collect(select(df2, collect_list(df2$gear))) +#' collect(select(df2, collect_set(df2$gear)))} #' @note collect_list since 2.3.0 setMethod("collect_list", signature(x = "Column"), @@ -3737,18 +3672,12 @@ setMethod("collect_list", column(jc) }) -#' collect_set +#' @section Details: +#' \code{collect_set}: Creates a list of objects with duplicate elements eliminated. #' -#' Creates a list of objects with duplicate elements eliminated. -#' -#' @param x Column to compute on -#' -#' @rdname collect_set -#' @name collect_set -#' @family aggregate functions -#' @aliases collect_set,Column-method +#' @rdname column_aggregate_functions +#' @aliases collect_set collect_set,Column-method #' @export -#' @examples \dontrun{collect_set(df$x)} #' @note collect_set since 2.3.0 setMethod("collect_set", signature(x = "Column"), @@ -3908,24 +3837,17 @@ setMethod("not", column(jc) }) -#' grouping_bit -#' -#' Indicates whether a specified column in a GROUP BY list is aggregated or not, -#' returns 1 for aggregated or 0 for not aggregated in the result set. 
-#' -#' Same as \code{GROUPING} in SQL and \code{grouping} function in Scala. -#' -#' @param x Column to compute on +#' @section Details: +#' \code{grouping_bit}: Indicates whether a specified column in a GROUP BY list is aggregated or not, +#' returns 1 for aggregated or 0 for not aggregated in the result set. Same as \code{GROUPING} in SQL +#' and \code{grouping} function in Scala. #' -#' @rdname grouping_bit -#' @name grouping_bit -#' @family aggregate functions -#' @aliases grouping_bit,Column-method +#' @rdname column_aggregate_functions +#' @aliases grouping_bit grouping_bit,Column-method #' @export #' @examples -#' \dontrun{ -#' df <- createDataFrame(mtcars) #' +#' \dontrun{ #' # With cube #' agg( #' cube(df, "cyl", "gear", "am"), @@ -3938,8 +3860,7 @@ setMethod("not", #' rollup(df, "cyl", "gear", "am"), #' mean(df$mpg), #' grouping_bit(df$cyl), grouping_bit(df$gear), grouping_bit(df$am) -#' ) -#' } +#' )} #' @note grouping_bit since 2.3.0 setMethod("grouping_bit", signature(x = "Column"), @@ -3948,26 +3869,18 @@ setMethod("grouping_bit", column(jc) }) -#' grouping_id -#' -#' Returns the level of grouping. -#' +#' @section Details: +#' \code{grouping_id}: Returns the level of grouping. #' Equals to \code{ #' grouping_bit(c1) * 2^(n - 1) + grouping_bit(c2) * 2^(n - 2) + ... + grouping_bit(cn) #' } #' -#' @param x Column to compute on -#' @param ... additional Column(s) (optional). 
-#' -#' @rdname grouping_id -#' @name grouping_id -#' @family aggregate functions -#' @aliases grouping_id,Column-method +#' @rdname column_aggregate_functions +#' @aliases grouping_id grouping_id,Column-method #' @export #' @examples -#' \dontrun{ -#' df <- createDataFrame(mtcars) #' +#' \dontrun{ #' # With cube #' agg( #' cube(df, "cyl", "gear", "am"), @@ -3980,8 +3893,7 @@ setMethod("grouping_bit", #' rollup(df, "cyl", "gear", "am"), #' mean(df$mpg), #' grouping_id(df$cyl, df$gear, df$am) -#' ) -#' } +#' )} #' @note grouping_id since 2.3.0 setMethod("grouping_id", signature(x = "Column"), diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index 514ca99d45cd..d8950d4c2d27 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -907,8 +907,9 @@ setGeneric("windowOrderBy", function(col, ...) { standardGeneric("windowOrderBy" #' @export setGeneric("add_months", function(y, x) { standardGeneric("add_months") }) -#' @rdname approxCountDistinct +#' @rdname column_aggregate_functions #' @export +#' @name NULL setGeneric("approxCountDistinct", function(x, ...) { standardGeneric("approxCountDistinct") }) #' @rdname array_contains @@ -919,10 +920,9 @@ setGeneric("array_contains", function(x, value) { standardGeneric("array_contain #' @export setGeneric("ascii", function(x) { standardGeneric("ascii") }) -#' @param x Column to compute on or a GroupedData object. -#' @param ... additional argument(s) when \code{x} is a GroupedData object. -#' @rdname avg +#' @rdname column_aggregate_functions #' @export +#' @name NULL setGeneric("avg", function(x, ...) 
{ standardGeneric("avg") }) #' @rdname base64 @@ -949,12 +949,14 @@ setGeneric("cbrt", function(x) { standardGeneric("cbrt") }) #' @export setGeneric("ceil", function(x) { standardGeneric("ceil") }) -#' @rdname collect_list +#' @rdname column_aggregate_functions #' @export +#' @name NULL setGeneric("collect_list", function(x) { standardGeneric("collect_list") }) -#' @rdname collect_set +#' @rdname column_aggregate_functions #' @export +#' @name NULL setGeneric("collect_set", function(x) { standardGeneric("collect_set") }) #' @rdname column @@ -973,8 +975,9 @@ setGeneric("concat_ws", function(sep, x, ...) { standardGeneric("concat_ws") }) #' @export setGeneric("conv", function(x, fromBase, toBase) { standardGeneric("conv") }) -#' @rdname countDistinct +#' @rdname column_aggregate_functions #' @export +#' @name NULL setGeneric("countDistinct", function(x, ...) { standardGeneric("countDistinct") }) #' @rdname crc32 @@ -1071,12 +1074,14 @@ setGeneric("from_unixtime", function(x, ...) { standardGeneric("from_unixtime") #' @export setGeneric("greatest", function(x, ...) { standardGeneric("greatest") }) -#' @rdname grouping_bit +#' @rdname column_aggregate_functions #' @export +#' @name NULL setGeneric("grouping_bit", function(x) { standardGeneric("grouping_bit") }) -#' @rdname grouping_id +#' @rdname column_aggregate_functions #' @export +#' @name NULL setGeneric("grouping_id", function(x, ...) 
{ standardGeneric("grouping_id") }) #' @rdname hex @@ -1109,8 +1114,9 @@ setGeneric("instr", function(y, x) { standardGeneric("instr") }) #' @export setGeneric("isnan", function(x) { standardGeneric("isnan") }) -#' @rdname kurtosis +#' @rdname column_aggregate_functions #' @export +#' @name NULL setGeneric("kurtosis", function(x) { standardGeneric("kurtosis") }) #' @rdname lag @@ -1203,8 +1209,9 @@ setGeneric("next_day", function(y, x) { standardGeneric("next_day") }) #' @export setGeneric("ntile", function(x) { standardGeneric("ntile") }) -#' @rdname countDistinct +#' @rdname column_aggregate_functions #' @export +#' @name NULL setGeneric("n_distinct", function(x, ...) { standardGeneric("n_distinct") }) #' @param x empty. Should be used with no argument. @@ -1274,8 +1281,9 @@ setGeneric("rpad", function(x, len, pad) { standardGeneric("rpad") }) #' @export setGeneric("rtrim", function(x) { standardGeneric("rtrim") }) -#' @rdname sd +#' @rdname column_aggregate_functions #' @export +#' @name NULL setGeneric("sd", function(x, na.rm = FALSE) { standardGeneric("sd") }) #' @rdname second @@ -1310,8 +1318,9 @@ setGeneric("signum", function(x) { standardGeneric("signum") }) #' @export setGeneric("size", function(x) { standardGeneric("size") }) -#' @rdname skewness +#' @rdname column_aggregate_functions #' @export +#' @name NULL setGeneric("skewness", function(x) { standardGeneric("skewness") }) #' @rdname sort_array @@ -1331,16 +1340,19 @@ setGeneric("soundex", function(x) { standardGeneric("soundex") }) #' @export setGeneric("spark_partition_id", function(x = "missing") { standardGeneric("spark_partition_id") }) -#' @rdname sd +#' @rdname column_aggregate_functions #' @export +#' @name NULL setGeneric("stddev", function(x) { standardGeneric("stddev") }) -#' @rdname stddev_pop +#' @rdname column_aggregate_functions #' @export +#' @name NULL setGeneric("stddev_pop", function(x) { standardGeneric("stddev_pop") }) -#' @rdname stddev_samp +#' @rdname 
column_aggregate_functions #' @export +#' @name NULL setGeneric("stddev_samp", function(x) { standardGeneric("stddev_samp") }) #' @rdname struct @@ -1351,8 +1363,9 @@ setGeneric("struct", function(x, ...) { standardGeneric("struct") }) #' @export setGeneric("substring_index", function(x, delim, count) { standardGeneric("substring_index") }) -#' @rdname sumDistinct +#' @rdname column_aggregate_functions #' @export +#' @name NULL setGeneric("sumDistinct", function(x) { standardGeneric("sumDistinct") }) #' @rdname toDegrees @@ -1403,20 +1416,25 @@ setGeneric("unix_timestamp", function(x, format) { standardGeneric("unix_timesta #' @export setGeneric("upper", function(x) { standardGeneric("upper") }) -#' @rdname var +#' @rdname column_aggregate_functions +#' @param y,na.rm,use currently not used. #' @export +#' @name NULL setGeneric("var", function(x, y = NULL, na.rm = FALSE, use) { standardGeneric("var") }) -#' @rdname var +#' @rdname column_aggregate_functions #' @export +#' @name NULL setGeneric("variance", function(x) { standardGeneric("variance") }) -#' @rdname var_pop +#' @rdname column_aggregate_functions #' @export +#' @name NULL setGeneric("var_pop", function(x) { standardGeneric("var_pop") }) -#' @rdname var_samp +#' @rdname column_aggregate_functions #' @export +#' @name NULL setGeneric("var_samp", function(x) { standardGeneric("var_samp") }) #' @rdname weekofyear From 0044b29853c949b0baac7c70ed35658ed6005943 Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Wed, 31 May 2017 23:45:27 -0700 Subject: [PATCH 02/11] address comments --- R/pkg/R/functions.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index a9a341c22715..3dc76a56a420 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -193,7 +193,7 @@ setMethod("atan", #' @examples #' #' \dontrun{ -#' head(select(df, avg(df$mpg), mean(df$mpg), min(df$wt), max(df$qsec))) +#' head(select(df, avg(df$mpg), mean(df$mpg), sum(df$mpg), 
min(df$wt), max(df$qsec))) #' #' # metrics by num of cylinders #' tmp <- agg(groupBy(df, "cyl"), avg(df$mpg), avg(df$hp), avg(df$wt), avg(df$qsec)) @@ -1099,7 +1099,7 @@ setMethod("md5", }) #' @section Details: -#' \code{mean}: Returns the average of the values in a group. Alias for avg. +#' \code{mean}: Returns the average of the values in a group. Alias for \code{avg}. #' #' @rdname column_aggregate_functions #' @aliases mean mean,Column-method From 014b9f3069a6e2075cb8be307c5d74081dabe15a Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Thu, 15 Jun 2017 20:19:31 -0700 Subject: [PATCH 03/11] address comments --- R/pkg/R/functions.R | 49 +++++++++++++++++++++++---------------------- R/pkg/R/generics.R | 1 - 2 files changed, 25 insertions(+), 25 deletions(-) diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index 3dc76a56a420..f6bec04c5757 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -23,7 +23,8 @@ NULL #' Aggregate functions defined for \code{Column}. #' #' @param x Column to compute on. -#' @param ... additional argument(s). +#' @param y,na.rm,use currently not used. +#' @param ... additional argument(s). For example, it could be used to pass additional Columns. #' @name column_aggregate_functions #' @rdname column_aggregate_functions #' @family aggregate functions @@ -100,7 +101,7 @@ setMethod("acos", column(jc) }) -#' @section Details: +#' @details #' \code{approxCountDistinct}: Returns the approximate number of distinct items in a group. 
#' #' @rdname column_aggregate_functions @@ -111,7 +112,7 @@ setMethod("acos", #' \dontrun{ #' head(select(df, approxCountDistinct(df$gear))) #' head(select(df, approxCountDistinct(df$gear, 0.02))) -#' head(select(df, countDistinct(df$gear))) +#' head(select(df, countDistinct(df$gear, df$cyl))) #' head(select(df, n_distinct(df$gear))) #' head(distinct(select(df, "gear")))} #' @note approxCountDistinct(Column) since 1.4.0 @@ -184,7 +185,7 @@ setMethod("atan", column(jc) }) -#' @section Details: +#' @details #' \code{avg}: Returns the average of the values in a group. #' #' @rdname column_aggregate_functions @@ -849,7 +850,7 @@ setMethod("isnan", column(jc) }) -#' @section Details: +#' @details #' \code{kurtosis}: Returns the kurtosis of the values in a group. #' #' @rdname column_aggregate_functions @@ -1064,7 +1065,7 @@ setMethod("ltrim", column(jc) }) -#' @section Details: +#' @details #' \code{max}: Returns the maximum value of the expression in a group. #' #' @rdname column_aggregate_functions @@ -1098,7 +1099,7 @@ setMethod("md5", column(jc) }) -#' @section Details: +#' @details #' \code{mean}: Returns the average of the values in a group. Alias for \code{avg}. #' #' @rdname column_aggregate_functions @@ -1112,7 +1113,7 @@ setMethod("mean", column(jc) }) -#' @section Details: +#' @details #' \code{min}: Returns the minimum value of the expression in a group. #' #' @rdname column_aggregate_functions @@ -1343,7 +1344,7 @@ setMethod("rtrim", }) -#' @section Details: +#' @details #' \code{sd}: Alias for \code{stddev_samp}. #' #' @rdname column_aggregate_functions @@ -1462,7 +1463,7 @@ setMethod("sinh", column(jc) }) -#' @section Details: +#' @details #' \code{skewness}: Returns the skewness of the values in a group. #' #' @rdname column_aggregate_functions @@ -1518,7 +1519,7 @@ setMethod("spark_partition_id", column(jc) }) -#' @section Details: +#' @details #' \code{stddev}: Alias for \code{std_dev}. 
#' #' @rdname column_aggregate_functions @@ -1531,7 +1532,7 @@ setMethod("stddev", column(jc) }) -#' @section Details: +#' @details #' \code{stddev_pop}: Returns the population standard deviation of the expression in a group. #' #' @rdname column_aggregate_functions @@ -1545,7 +1546,7 @@ setMethod("stddev_pop", column(jc) }) -#' @section Details: +#' @details #' \code{stddev_samp}: Returns the unbiased sample standard deviation of the expression in a group. #' #' @rdname column_aggregate_functions @@ -1609,7 +1610,7 @@ setMethod("sqrt", column(jc) }) -#' @section Details: +#' @details #' \code{sum}: Returns the sum of all values in the expression. #' #' @rdname column_aggregate_functions @@ -1623,7 +1624,7 @@ setMethod("sum", column(jc) }) -#' @section Details: +#' @details #' \code{sumDistinct}: Returns the sum of distinct values in the expression. #' #' @rdname column_aggregate_functions @@ -1924,7 +1925,7 @@ setMethod("upper", column(jc) }) -#' @section Details: +#' @details #' \code{var}: Alias for \code{var_samp}. #' #' @rdname column_aggregate_functions @@ -1953,7 +1954,7 @@ setMethod("variance", column(jc) }) -#' @section Details: +#' @details #' \code{var_pop}: Returns the population variance of the values in a group. #' #' @rdname column_aggregate_functions @@ -1967,7 +1968,7 @@ setMethod("var_pop", column(jc) }) -#' @section Details: +#' @details #' \code{var_samp}: Returns the unbiased variance of the values in a group. #' #' @rdname column_aggregate_functions @@ -2198,7 +2199,7 @@ setMethod("approxCountDistinct", column(jc) }) -#' @section Details: +#' @details #' \code{countDistinct}: Returns the number of distinct items in a group. #' #' @rdname column_aggregate_functions @@ -2322,7 +2323,7 @@ setMethod("sign", signature(x = "Column"), signum(x) }) -#' @section Details: +#' @details #' \code{n_distinct}: Returns the number of distinct items in a group. 
#' #' @rdname column_aggregate_functions @@ -3652,7 +3653,7 @@ setMethod("create_map", column(jc) }) -#' @section Details: +#' @details #' \code{collect_list}: Creates a list of objects with duplicates. #' #' @rdname column_aggregate_functions @@ -3672,7 +3673,7 @@ setMethod("collect_list", column(jc) }) -#' @section Details: +#' @details #' \code{collect_set}: Creates a list of objects with duplicate elements eliminated. #' #' @rdname column_aggregate_functions @@ -3837,7 +3838,7 @@ setMethod("not", column(jc) }) -#' @section Details: +#' @details #' \code{grouping_bit}: Indicates whether a specified column in a GROUP BY list is aggregated or not, #' returns 1 for aggregated or 0 for not aggregated in the result set. Same as \code{GROUPING} in SQL #' and \code{grouping} function in Scala. @@ -3869,7 +3870,7 @@ setMethod("grouping_bit", column(jc) }) -#' @section Details: +#' @details #' \code{grouping_id}: Returns the level of grouping. #' Equals to \code{ #' grouping_bit(c1) * 2^(n - 1) + grouping_bit(c2) * 2^(n - 2) + ... + grouping_bit(cn) diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index d8950d4c2d27..802e76c6b09a 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -1417,7 +1417,6 @@ setGeneric("unix_timestamp", function(x, format) { standardGeneric("unix_timesta setGeneric("upper", function(x) { standardGeneric("upper") }) #' @rdname column_aggregate_functions -#' @param y,na.rm,use currently not used. 
#' @export #' @name NULL setGeneric("var", function(x, y = NULL, na.rm = FALSE, use) { standardGeneric("var") }) From 36203dfa4a5be1603754130585adeb6c3f233b01 Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Thu, 15 Jun 2017 20:43:24 -0700 Subject: [PATCH 04/11] pull avg into its own doc --- R/pkg/R/functions.R | 32 +++++++++++++++++--------------- R/pkg/R/generics.R | 4 ++-- 2 files changed, 19 insertions(+), 17 deletions(-) diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index f6bec04c5757..22f22f965d77 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -185,24 +185,14 @@ setMethod("atan", column(jc) }) -#' @details -#' \code{avg}: Returns the average of the values in a group. +#' avg #' -#' @rdname column_aggregate_functions +#' Returns the average of the values in a group. +#' +#' @rdname avg #' @export #' @aliases avg avg,Column-method -#' @examples -#' -#' \dontrun{ -#' head(select(df, avg(df$mpg), mean(df$mpg), sum(df$mpg), min(df$wt), max(df$qsec))) -#' -#' # metrics by num of cylinders -#' tmp <- agg(groupBy(df, "cyl"), avg(df$mpg), avg(df$hp), avg(df$wt), avg(df$qsec)) -#' head(orderBy(tmp, "cyl")) -#' -#' # car with the max mpg -#' mpg_max <- as.numeric(collect(agg(df, max(df$mpg)))) -#' head(where(df, df$mpg == mpg_max))} +#' @family aggregate functions #' @note avg since 1.4.0 setMethod("avg", signature(x = "Column"), @@ -1105,6 +1095,18 @@ setMethod("md5", #' @rdname column_aggregate_functions #' @aliases mean mean,Column-method #' @export +#' @examples +#' +#' \dontrun{ +#' head(select(df, avg(df$mpg), mean(df$mpg), sum(df$mpg), min(df$wt), max(df$qsec))) +#' +#' # metrics by num of cylinders +#' tmp <- agg(groupBy(df, "cyl"), avg(df$mpg), avg(df$hp), avg(df$wt), avg(df$qsec)) +#' head(orderBy(tmp, "cyl")) +#' +#' # car with the max mpg +#' mpg_max <- as.numeric(collect(agg(df, max(df$mpg)))) +#' head(where(df, df$mpg == mpg_max))} #' @note mean since 1.5.0 setMethod("mean", signature(x = "Column"), diff --git a/R/pkg/R/generics.R 
b/R/pkg/R/generics.R index 802e76c6b09a..b17f9a119c9a 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -920,9 +920,9 @@ setGeneric("array_contains", function(x, value) { standardGeneric("array_contain #' @export setGeneric("ascii", function(x) { standardGeneric("ascii") }) -#' @rdname column_aggregate_functions +#' @param x Column to compute on or a GroupedData object. +#' @rdname avg #' @export -#' @name NULL setGeneric("avg", function(x, ...) { standardGeneric("avg") }) #' @rdname base64 From 0a7f5fcac2e0295d92b82d8909c4f1b11c82f016 Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Thu, 15 Jun 2017 20:45:31 -0700 Subject: [PATCH 05/11] add back avg example --- R/pkg/R/functions.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index 22f22f965d77..e3c36493f01f 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -193,6 +193,7 @@ setMethod("atan", #' @export #' @aliases avg avg,Column-method #' @family aggregate functions +#' @examples \dontrun{avg(df$c)} #' @note avg since 1.4.0 setMethod("avg", signature(x = "Column"), From 19d063c6995fa6bd780830a941f6b1f7c45c1bac Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Thu, 15 Jun 2017 20:48:35 -0700 Subject: [PATCH 06/11] revert avg --- R/pkg/R/functions.R | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index e3c36493f01f..0681b3c84684 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -187,12 +187,13 @@ setMethod("atan", #' avg #' -#' Returns the average of the values in a group. +#' Aggregate function: returns the average of the values in a group. 
#' #' @rdname avg -#' @export -#' @aliases avg avg,Column-method +#' @name avg #' @family aggregate functions +#' @export +#' @aliases avg,Column-method #' @examples \dontrun{avg(df$c)} #' @note avg since 1.4.0 setMethod("avg", From 978e13b498b492495a2fa21e915c120791b59b9f Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Fri, 16 Jun 2017 13:00:31 -0700 Subject: [PATCH 07/11] fix issue in avg doc --- R/pkg/R/generics.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index b17f9a119c9a..a1774e73c4d1 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -921,6 +921,7 @@ setGeneric("array_contains", function(x, value) { standardGeneric("array_contain setGeneric("ascii", function(x) { standardGeneric("ascii") }) #' @param x Column to compute on or a GroupedData object. +#' @param ... additional argument(s) when \code{x} is a GroupedData object. #' @rdname avg #' @export setGeneric("avg", function(x, ...) { standardGeneric("avg") }) From 875db0dc02e03fab1df57ba105033a6597d45249 Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Sat, 17 Jun 2017 11:58:52 -0700 Subject: [PATCH 08/11] update doc for corr cov functions --- R/pkg/R/functions.R | 42 ++++++++++++++++++++---------------------- R/pkg/R/stats.R | 20 +++++++------------- 2 files changed, 27 insertions(+), 35 deletions(-) diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index 0681b3c84684..ba671e503a39 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -361,10 +361,13 @@ setMethod("column", #' #' @rdname corr #' @name corr -#' @family math functions +#' @family aggregate functions #' @export #' @aliases corr,Column-method -#' @examples \dontrun{corr(df$c, df$d)} +#' @examples +#' \dontrun{ +#' df <- createDataFrame(cbind(model = rownames(mtcars), mtcars)) +#' head(select(df, corr(df$mpg, df$hp)))} #' @note corr since 1.6.0 setMethod("corr", signature(x = "Column"), function(x, col2) { @@ -375,20 +378,22 @@ setMethod("corr", signature(x = "Column"), #' 
cov #' -#' Compute the sample covariance between two expressions. +#' Compute the covariance between two expressions. +#' +#' @details +#' \code{cov}: Compute the sample covariance between two expressions. #' #' @rdname cov #' @name cov -#' @family math functions +#' @family aggregate functions #' @export #' @aliases cov,characterOrColumn-method #' @examples #' \dontrun{ -#' cov(df$c, df$d) -#' cov("c", "d") -#' covar_samp(df$c, df$d) -#' covar_samp("c", "d") -#' } +#' df <- createDataFrame(cbind(model = rownames(mtcars), mtcars)) +#' head(select(df, cov(df$mpg, df$hp), cov("mpg", "hp"), +#' covar_samp(df$mpg, df$hp), covar_samp("mpg", "hp"), +#' covar_pop(df$mpg, df$hp), covar_pop("mpg", "hp")))} #' @note cov since 1.6.0 setMethod("cov", signature(x = "characterOrColumn"), function(x, col2) { @@ -396,6 +401,9 @@ setMethod("cov", signature(x = "characterOrColumn"), covar_samp(x, col2) }) +#' @details +#' \code{covar_samp}: Alias for \code{cov}. +#' #' @rdname cov #' #' @param col1 the first Column. @@ -414,23 +422,13 @@ setMethod("covar_samp", signature(col1 = "characterOrColumn", col2 = "characterO column(jc) }) -#' covar_pop -#' -#' Compute the population covariance between two expressions. -#' -#' @param col1 First column to compute cov_pop. -#' @param col2 Second column to compute cov_pop. +#' @details +#' \code{covar_pop}: Computes the population covariance between two expressions.
#' -#' @rdname covar_pop +#' @rdname cov #' @name covar_pop -#' @family math functions #' @export #' @aliases covar_pop,characterOrColumn,characterOrColumn-method -#' @examples -#' \dontrun{ -#' covar_pop(df$c, df$d) -#' covar_pop("c", "d") -#' } #' @note covar_pop since 2.0.0 setMethod("covar_pop", signature(col1 = "characterOrColumn", col2 = "characterOrColumn"), function(col1, col2) { diff --git a/R/pkg/R/stats.R b/R/pkg/R/stats.R index d78a10893f92..04cbc74fda28 100644 --- a/R/pkg/R/stats.R +++ b/R/pkg/R/stats.R @@ -52,22 +52,17 @@ setMethod("crosstab", collect(dataFrame(sct)) }) -#' Calculate the sample covariance of two numerical columns of a SparkDataFrame. -#' #' @param colName1 the name of the first column #' @param colName2 the name of the second column -#' @return The covariance of the two columns. #' #' @rdname cov -#' @name cov #' @aliases cov,SparkDataFrame-method #' @family stat functions #' @export #' @examples -#'\dontrun{ -#' df <- read.json("/path/to/file.json") -#' cov <- cov(df, "title", "gender") -#' } +#' +#' \dontrun{ +#' cov(df, "mpg", "hp")} #' @note cov since 1.6.0 setMethod("cov", signature(x = "SparkDataFrame"), @@ -93,11 +88,10 @@ setMethod("cov", #' @family stat functions #' @export #' @examples -#'\dontrun{ -#' df <- read.json("/path/to/file.json") -#' corr <- corr(df, "title", "gender") -#' corr <- corr(df, "title", "gender", method = "pearson") -#' } +#' +#' \dontrun{ +#' corr(df, "mpg", "hp") +#' corr(df, "mpg", "hp", method = "pearson")} #' @note corr since 1.6.0 setMethod("corr", signature(x = "SparkDataFrame"), From 79d9fdf424cc24277673f30ec673ed6ae3eafeee Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Sat, 17 Jun 2017 12:06:09 -0700 Subject: [PATCH 09/11] fix issue with covar_pop --- R/pkg/R/generics.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index a1774e73c4d1..0331483a3e83 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -479,7 +479,7 @@ 
setGeneric("corr", function(x, ...) {standardGeneric("corr") }) #' @export setGeneric("covar_samp", function(col1, col2) {standardGeneric("covar_samp") }) -#' @rdname covar_pop +#' @rdname cov #' @export setGeneric("covar_pop", function(col1, col2) {standardGeneric("covar_pop") }) From 6eae126398e4229aa84130728792f407c67a75e6 Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Sun, 18 Jun 2017 20:54:58 -0700 Subject: [PATCH 10/11] add back return value in cov stat --- R/pkg/R/stats.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/pkg/R/stats.R b/R/pkg/R/stats.R index 04cbc74fda28..21422bfc8390 100644 --- a/R/pkg/R/stats.R +++ b/R/pkg/R/stats.R @@ -54,6 +54,7 @@ setMethod("crosstab", #' @param colName1 the name of the first column #' @param colName2 the name of the second column +#' @return The covariance of the two columns. #' #' @rdname cov #' @aliases cov,SparkDataFrame-method From 4cf5ab98771f19924e483ac716bd8a0618ba3f2e Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Mon, 19 Jun 2017 11:00:49 -0700 Subject: [PATCH 11/11] update cov method for SparkDataFrame --- R/pkg/R/stats.R | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/R/pkg/R/stats.R b/R/pkg/R/stats.R index 21422bfc8390..9a9fa84044ce 100644 --- a/R/pkg/R/stats.R +++ b/R/pkg/R/stats.R @@ -52,6 +52,10 @@ setMethod("crosstab", collect(dataFrame(sct)) }) +#' @details +#' \code{cov}: When applied to SparkDataFrame, this calculates the sample covariance of two numerical +#' columns of \emph{one} SparkDataFrame. +#' #' @param colName1 the name of the first column #' @param colName2 the name of the second column #' @return The covariance of the two columns. @@ -63,7 +67,8 @@ setMethod("crosstab", #' @examples #' #' \dontrun{ -#' cov(df, "mpg", "hp")} +#' cov(df, "mpg", "hp") +#' cov(df, df$mpg, df$hp)} #' @note cov since 1.6.0 setMethod("cov", signature(x = "SparkDataFrame"),