Commit afa1313: merge with master
2 parents: 8559e4e + d7e43b6

459 files changed: 15487 additions & 6136 deletions


R/pkg/NAMESPACE

Lines changed: 1 addition & 0 deletions
@@ -82,6 +82,7 @@ exportMethods("arrange",
               "as.data.frame",
               "attach",
               "cache",
+              "coalesce",
               "collect",
               "colnames",
               "colnames<-",

R/pkg/R/DataFrame.R

Lines changed: 51 additions & 3 deletions
@@ -678,14 +678,53 @@ setMethod("storageLevel",
             storageLevelToString(callJMethod(x@sdf, "storageLevel"))
           })
 
+#' Coalesce
+#'
+#' Returns a new SparkDataFrame that has exactly \code{numPartitions} partitions.
+#' This operation results in a narrow dependency, e.g. if you go from 1000 partitions to 100
+#' partitions, there will not be a shuffle, instead each of the 100 new partitions will claim 10 of
+#' the current partitions. If a larger number of partitions is requested, it will stay at the
+#' current number of partitions.
+#'
+#' However, if you're doing a drastic coalesce on a SparkDataFrame, e.g. to numPartitions = 1,
+#' this may result in your computation taking place on fewer nodes than
+#' you like (e.g. one node in the case of numPartitions = 1). To avoid this,
+#' call \code{repartition}. This will add a shuffle step, but means the
+#' current upstream partitions will be executed in parallel (per whatever
+#' the current partitioning is).
+#'
+#' @param numPartitions the number of partitions to use.
+#'
+#' @family SparkDataFrame functions
+#' @rdname coalesce
+#' @name coalesce
+#' @aliases coalesce,SparkDataFrame-method
+#' @seealso \link{repartition}
+#' @export
+#' @examples
+#'\dontrun{
+#' sparkR.session()
+#' path <- "path/to/file.json"
+#' df <- read.json(path)
+#' newDF <- coalesce(df, 1L)
+#'}
+#' @note coalesce(SparkDataFrame) since 2.1.1
+setMethod("coalesce",
+          signature(x = "SparkDataFrame"),
+          function(x, numPartitions) {
+            stopifnot(is.numeric(numPartitions))
+            sdf <- callJMethod(x@sdf, "coalesce", numToInt(numPartitions))
+            dataFrame(sdf)
+          })
+
 #' Repartition
 #'
 #' The following options for repartition are possible:
 #' \itemize{
-#'  \item{1.} {Return a new SparkDataFrame partitioned by
+#'  \item{1.} {Return a new SparkDataFrame that has exactly \code{numPartitions}.}
+#'  \item{2.} {Return a new SparkDataFrame hash partitioned by
 #'                      the given columns into \code{numPartitions}.}
-#'  \item{2.} {Return a new SparkDataFrame that has exactly \code{numPartitions}.}
-#'  \item{3.} {Return a new SparkDataFrame partitioned by the given column(s),
+#'  \item{3.} {Return a new SparkDataFrame hash partitioned by the given column(s),
 #'                      using \code{spark.sql.shuffle.partitions} as number of partitions.}
 #'}
 #' @param x a SparkDataFrame.
@@ -697,6 +736,7 @@ setMethod("storageLevel",
 #' @rdname repartition
 #' @name repartition
 #' @aliases repartition,SparkDataFrame-method
+#' @seealso \link{coalesce}
 #' @export
 #' @examples
 #'\dontrun{
@@ -1764,6 +1804,10 @@ setClassUnion("numericOrcharacter", c("numeric", "character"))
 #' @note [[ since 1.4.0
 setMethod("[[", signature(x = "SparkDataFrame", i = "numericOrcharacter"),
           function(x, i) {
+            if (length(i) > 1) {
+              warning("Subset index has length > 1. Only the first index is used.")
+              i <- i[1]
+            }
             if (is.numeric(i)) {
               cols <- columns(x)
               i <- cols[[i]]
@@ -1777,6 +1821,10 @@ setMethod("[[", signature(x = "SparkDataFrame", i = "numericOrcharacter"),
 #' @note [[<- since 2.1.1
 setMethod("[[<-", signature(x = "SparkDataFrame", i = "numericOrcharacter"),
           function(x, i, value) {
+            if (length(i) > 1) {
+              warning("Subset index has length > 1. Only the first index is used.")
+              i <- i[1]
+            }
             if (is.numeric(i)) {
               cols <- columns(x)
               i <- cols[[i]]
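
Not part of the diff: a minimal SparkR usage sketch of the new SparkDataFrame coalesce next to repartition, assuming an active session; the file path and variable names are illustrative only.

library(SparkR)
sparkR.session()

df <- read.json("path/to/file.json")  # illustrative input path

# Narrow the data to one partition without a shuffle (narrow dependency).
df1 <- coalesce(df, 1L)

# repartition() adds a shuffle step but keeps the upstream partitions
# executing in parallel, which is usually preferable for drastic reductions.
df10 <- repartition(df, 10L)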

R/pkg/R/RDD.R

Lines changed: 2 additions & 2 deletions
@@ -1028,7 +1028,7 @@ setMethod("repartitionRDD",
           signature(x = "RDD"),
           function(x, numPartitions) {
             if (!is.null(numPartitions) && is.numeric(numPartitions)) {
-              coalesce(x, numPartitions, TRUE)
+              coalesceRDD(x, numPartitions, TRUE)
             } else {
               stop("Please, specify the number of partitions")
             }
@@ -1049,7 +1049,7 @@ setMethod("repartitionRDD",
 #' @rdname coalesce
 #' @aliases coalesce,RDD
 #' @noRd
-setMethod("coalesce",
+setMethod("coalesceRDD",
           signature(x = "RDD", numPartitions = "numeric"),
           function(x, numPartitions, shuffle = FALSE) {
             numPartitions <- numToInt(numPartitions)

R/pkg/R/functions.R

Lines changed: 24 additions & 2 deletions
@@ -286,6 +286,28 @@ setMethod("ceil",
             column(jc)
           })
 
+#' Returns the first column that is not NA
+#'
+#' Returns the first column that is not NA, or NA if all inputs are.
+#'
+#' @rdname coalesce
+#' @name coalesce
+#' @family normal_funcs
+#' @export
+#' @aliases coalesce,Column-method
+#' @examples \dontrun{coalesce(df$c, df$d, df$e)}
+#' @note coalesce(Column) since 2.1.1
+setMethod("coalesce",
+          signature(x = "Column"),
+          function(x, ...) {
+            jcols <- lapply(list(x, ...), function (x) {
+              stopifnot(class(x) == "Column")
+              x@jc
+            })
+            jc <- callJStatic("org.apache.spark.sql.functions", "coalesce", jcols)
+            column(jc)
+          })
+
 #' Though scala functions has "col" function, we don't expose it in SparkR
 #' because we don't want to conflict with the "col" function in the R base
 #' package and we also have "column" function exported which is an alias of "col".
@@ -297,15 +319,15 @@ col <- function(x) {
 #' Returns a Column based on the given column name
 #'
 #' Returns a Column based on the given column name.
-#
+#'
 #' @param x Character column name.
 #'
 #' @rdname column
 #' @name column
 #' @family normal_funcs
 #' @export
 #' @aliases column,character-method
-#' @examples \dontrun{column(df)}
+#' @examples \dontrun{column("name")}
 #' @note column since 1.6.0
 setMethod("column",
           signature(x = "character"),
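
Not part of the diff: a small sketch of the Column-level coalesce, assuming a hypothetical SparkDataFrame with NA values; the column names and the lit(0) fallback are illustrative.

library(SparkR)
sparkR.session()

df <- createDataFrame(data.frame(a = c(NA, 2, NA), b = c(1, NA, NA)))

# For each row, take the first non-NA value among a, b, and a literal 0 fallback.
head(select(df, coalesce(df$a, df$b, lit(0))))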

R/pkg/R/generics.R

Lines changed: 10 additions & 3 deletions
@@ -28,7 +28,7 @@ setGeneric("cacheRDD", function(x) { standardGeneric("cacheRDD") })
 # @rdname coalesce
 # @seealso repartition
 # @export
-setGeneric("coalesce", function(x, numPartitions, ...) { standardGeneric("coalesce") })
+setGeneric("coalesceRDD", function(x, numPartitions, ...) { standardGeneric("coalesceRDD") })
 
 # @rdname checkpoint-methods
 # @export
@@ -66,7 +66,7 @@ setGeneric("freqItems", function(x, cols, support = 0.01) { standardGeneric("fre
 # @rdname approxQuantile
 # @export
 setGeneric("approxQuantile",
-           function(x, col, probabilities, relativeError) {
+           function(x, cols, probabilities, relativeError) {
              standardGeneric("approxQuantile")
            })
 
@@ -406,6 +406,13 @@ setGeneric("attach")
 #' @export
 setGeneric("cache", function(x) { standardGeneric("cache") })
 
+#' @rdname coalesce
+#' @param x a Column or a SparkDataFrame.
+#' @param ... additional argument(s). If \code{x} is a Column, additional Columns can be optionally
+#'        provided.
+#' @export
+setGeneric("coalesce", function(x, ...) { standardGeneric("coalesce") })
+
 #' @rdname collect
 #' @export
 setGeneric("collect", function(x, ...) { standardGeneric("collect") })
@@ -1399,7 +1406,7 @@ setGeneric("spark.randomForest",
 
 #' @rdname spark.survreg
 #' @export
-setGeneric("spark.survreg", function(data, formula) { standardGeneric("spark.survreg") })
+setGeneric("spark.survreg", function(data, formula, ...) { standardGeneric("spark.survreg") })
 
 #' @rdname spark.svmLinear
 #' @export

R/pkg/R/mllib_classification.R

Lines changed: 9 additions & 4 deletions
@@ -207,6 +207,9 @@ function(object, path, overwrite = FALSE) {
 #' excepting that at most one value may be 0. The class with largest value p/t is predicted, where p
 #' is the original probability of that class and t is the class's threshold.
 #' @param weightCol The weight column name.
+#' @param aggregationDepth The depth for treeAggregate (greater than or equal to 2). If the dimensions of features
+#'                         or the number of partitions are large, this param could be adjusted to a larger size.
+#'                         This is an expert parameter. Default value should be good for most cases.
 #' @param ... additional arguments passed to the method.
 #' @return \code{spark.logit} returns a fitted logistic regression model.
 #' @rdname spark.logit
@@ -245,19 +248,21 @@ function(object, path, overwrite = FALSE) {
 setMethod("spark.logit", signature(data = "SparkDataFrame", formula = "formula"),
           function(data, formula, regParam = 0.0, elasticNetParam = 0.0, maxIter = 100,
                    tol = 1E-6, family = "auto", standardization = TRUE,
-                   thresholds = 0.5, weightCol = NULL) {
+                   thresholds = 0.5, weightCol = NULL, aggregationDepth = 2) {
             formula <- paste(deparse(formula), collapse = "")
 
-            if (is.null(weightCol)) {
-              weightCol <- ""
+            if (!is.null(weightCol) && weightCol == "") {
+              weightCol <- NULL
+            } else if (!is.null(weightCol)) {
+              weightCol <- as.character(weightCol)
             }
 
             jobj <- callJStatic("org.apache.spark.ml.r.LogisticRegressionWrapper", "fit",
                                 data@sdf, formula, as.numeric(regParam),
                                 as.numeric(elasticNetParam), as.integer(maxIter),
                                 as.numeric(tol), as.character(family),
                                 as.logical(standardization), as.array(thresholds),
-                                as.character(weightCol))
+                                weightCol, as.integer(aggregationDepth))
             new("LogisticRegressionModel", jobj = jobj)
           })
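
Not part of the diff: a sketch of spark.logit with the pass-through weightCol and the new aggregationDepth argument, loosely mirroring the test added later in this commit; the label/feature/weight values are illustrative.

label <- c(0.0, 0.0, 0.0, 1.0, 1.0)    # illustrative data
feature <- c(1.1, 2.2, 3.3, 4.4, 5.5)
weight <- c(2.0, 2.0, 2.0, 1.0, 1.0)
df <- createDataFrame(as.data.frame(cbind(label, feature, weight)))

# weightCol is now forwarded as-is (NULL when unset); aggregationDepth tunes
# treeAggregate on the JVM side and defaults to 2.
model <- spark.logit(df, label ~ feature, weightCol = "weight", aggregationDepth = 2)
head(select(predict(model, df), "prediction"))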

R/pkg/R/mllib_regression.R

Lines changed: 16 additions & 8 deletions
@@ -102,14 +102,16 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
             }
 
             formula <- paste(deparse(formula), collapse = "")
-            if (is.null(weightCol)) {
-              weightCol <- ""
+            if (!is.null(weightCol) && weightCol == "") {
+              weightCol <- NULL
+            } else if (!is.null(weightCol)) {
+              weightCol <- as.character(weightCol)
             }
 
             # For known families, Gamma is upper-cased
             jobj <- callJStatic("org.apache.spark.ml.r.GeneralizedLinearRegressionWrapper",
                                 "fit", formula, data@sdf, tolower(family$family), family$link,
-                                tol, as.integer(maxIter), as.character(weightCol), regParam)
+                                tol, as.integer(maxIter), weightCol, regParam)
             new("GeneralizedLinearRegressionModel", jobj = jobj)
           })
 
@@ -305,13 +307,15 @@ setMethod("spark.isoreg", signature(data = "SparkDataFrame", formula = "formula"
           function(data, formula, isotonic = TRUE, featureIndex = 0, weightCol = NULL) {
             formula <- paste(deparse(formula), collapse = "")
 
-            if (is.null(weightCol)) {
-              weightCol <- ""
+            if (!is.null(weightCol) && weightCol == "") {
+              weightCol <- NULL
+            } else if (!is.null(weightCol)) {
+              weightCol <- as.character(weightCol)
             }
 
             jobj <- callJStatic("org.apache.spark.ml.r.IsotonicRegressionWrapper", "fit",
                                 data@sdf, formula, as.logical(isotonic), as.integer(featureIndex),
-                                as.character(weightCol))
+                                weightCol)
             new("IsotonicRegressionModel", jobj = jobj)
           })
 
@@ -372,6 +376,10 @@ setMethod("write.ml", signature(object = "IsotonicRegressionModel", path = "char
 #' @param formula a symbolic description of the model to be fitted. Currently only a few formula
 #'                operators are supported, including '~', ':', '+', and '-'.
 #'                Note that operator '.' is not supported currently.
+#' @param aggregationDepth The depth for treeAggregate (greater than or equal to 2). If the dimensions of features
+#'                         or the number of partitions are large, this param could be adjusted to a larger size.
+#'                         This is an expert parameter. Default value should be good for most cases.
+#' @param ... additional arguments passed to the method.
 #' @return \code{spark.survreg} returns a fitted AFT survival regression model.
 #' @rdname spark.survreg
 #' @seealso survival: \url{https://cran.r-project.org/package=survival}
@@ -396,10 +404,10 @@ setMethod("write.ml", signature(object = "IsotonicRegressionModel", path = "char
 #' }
 #' @note spark.survreg since 2.0.0
 setMethod("spark.survreg", signature(data = "SparkDataFrame", formula = "formula"),
-          function(data, formula) {
+          function(data, formula, aggregationDepth = 2) {
             formula <- paste(deparse(formula), collapse = "")
             jobj <- callJStatic("org.apache.spark.ml.r.AFTSurvivalRegressionWrapper",
-                                "fit", formula, data@sdf)
+                                "fit", formula, data@sdf, as.integer(aggregationDepth))
             new("AFTSurvivalRegressionModel", jobj = jobj)
           })
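
Not part of the diff: a sketch of spark.survreg with the new aggregationDepth argument; the survival data below (time, event status, two covariates) and column names are illustrative.

data <- list(list(4, 1, 0, 0), list(3, 1, 2, 0), list(1, 1, 1, 0),
             list(1, 0, 1, 0), list(2, 1, 1, 1), list(2, 1, 0, 1), list(3, 0, 0, 1))
df <- createDataFrame(data, c("time", "status", "x", "sex"))

# aggregationDepth (>= 2) is coerced to integer and forwarded to the AFT wrapper; 2 is the default.
model <- spark.survreg(df, Surv(time, status) ~ x + sex, aggregationDepth = 2)
summary(model)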

R/pkg/R/stats.R

Lines changed: 17 additions & 8 deletions
@@ -138,9 +138,9 @@ setMethod("freqItems", signature(x = "SparkDataFrame", cols = "character"),
             collect(dataFrame(sct))
           })
 
-#' Calculates the approximate quantiles of a numerical column of a SparkDataFrame
+#' Calculates the approximate quantiles of numerical columns of a SparkDataFrame
 #'
-#' Calculates the approximate quantiles of a numerical column of a SparkDataFrame.
+#' Calculates the approximate quantiles of numerical columns of a SparkDataFrame.
 #' The result of this algorithm has the following deterministic bound:
 #' If the SparkDataFrame has N elements and if we request the quantile at probability p up to
 #' error err, then the algorithm will return a sample x from the SparkDataFrame so that the
@@ -149,15 +149,19 @@ setMethod("freqItems", signature(x = "SparkDataFrame", cols = "character"),
 #' This method implements a variation of the Greenwald-Khanna algorithm (with some speed
 #' optimizations). The algorithm was first present in [[http://dx.doi.org/10.1145/375663.375670
 #' Space-efficient Online Computation of Quantile Summaries]] by Greenwald and Khanna.
+#' Note that rows containing any NA values will be removed before calculation.
 #'
 #' @param x A SparkDataFrame.
-#' @param col The name of the numerical column.
+#' @param cols A single column name, or a list of names for multiple columns.
 #' @param probabilities A list of quantile probabilities. Each number must belong to [0, 1].
 #'                      For example 0 is the minimum, 0.5 is the median, 1 is the maximum.
 #' @param relativeError The relative target precision to achieve (>= 0). If set to zero,
 #'                      the exact quantiles are computed, which could be very expensive.
 #'                      Note that values greater than 1 are accepted but give the same result as 1.
-#' @return The approximate quantiles at the given probabilities.
+#' @return The approximate quantiles at the given probabilities. If the input is a single column name,
+#'         the output is a list of approximate quantiles in that column; If the input is
+#'         multiple column names, the output should be a list, and each element in it is a list of
+#'         numeric values which represents the approximate quantiles in corresponding column.
 #'
 #' @rdname approxQuantile
 #' @name approxQuantile
@@ -171,12 +175,17 @@ setMethod("freqItems", signature(x = "SparkDataFrame", cols = "character"),
 #' }
 #' @note approxQuantile since 2.0.0
 setMethod("approxQuantile",
-          signature(x = "SparkDataFrame", col = "character",
+          signature(x = "SparkDataFrame", cols = "character",
                     probabilities = "numeric", relativeError = "numeric"),
-          function(x, col, probabilities, relativeError) {
+          function(x, cols, probabilities, relativeError) {
            statFunctions <- callJMethod(x@sdf, "stat")
-            callJMethod(statFunctions, "approxQuantile", col,
-                        as.list(probabilities), relativeError)
+            quantiles <- callJMethod(statFunctions, "approxQuantile", as.list(cols),
+                                     as.list(probabilities), relativeError)
+            if (length(cols) == 1) {
+              quantiles[[1]]
+            } else {
+              quantiles
+            }
           })
 
 #' Returns a stratified sample without replacement
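
Not part of the diff: a sketch of the single-column vs. multi-column return shapes of approxQuantile, using a hypothetical two-column SparkDataFrame.

df <- createDataFrame(data.frame(a = c(1, 2, 3, 4), b = c(10, 20, 30, 40)))

# One column name: a plain list of quantiles for that column.
approxQuantile(df, "a", c(0.5), 0.0)

# Several column names: one list of quantiles per column, in the same order.
approxQuantile(df, c("a", "b"), c(0.25, 0.75), 0.0)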

R/pkg/inst/tests/testthat/test_mllib_classification.R

Lines changed: 9 additions & 1 deletion
@@ -211,7 +211,15 @@ test_that("spark.logit", {
   df <- createDataFrame(data)
   model <- spark.logit(df, label ~ feature)
   prediction <- collect(select(predict(model, df), "prediction"))
-  expect_equal(prediction$prediction, c("0.0", "0.0", "1.0", "1.0", "0.0"))
+  expect_equal(sort(prediction$prediction), c("0.0", "0.0", "0.0", "1.0", "1.0"))
+
+  # Test prediction with weightCol
+  weight <- c(2.0, 2.0, 2.0, 1.0, 1.0)
+  data2 <- as.data.frame(cbind(label, feature, weight))
+  df2 <- createDataFrame(data2)
+  model2 <- spark.logit(df2, label ~ feature, weightCol = "weight")
+  prediction2 <- collect(select(predict(model2, df2), "prediction"))
+  expect_equal(sort(prediction2$prediction), c("0.0", "0.0", "0.0", "0.0", "0.0"))
 })
 
 test_that("spark.mlp", {

R/pkg/inst/tests/testthat/test_rdd.R

Lines changed: 1 addition & 1 deletion
@@ -315,7 +315,7 @@ test_that("repartition/coalesce on RDDs", {
   expect_true(count >= 0 && count <= 4)
 
   # coalesce
-  r3 <- coalesce(rdd, 1)
+  r3 <- coalesceRDD(rdd, 1)
   expect_equal(getNumPartitionsRDD(r3), 1L)
   count <- length(collectPartition(r3, 0L))
   expect_equal(count, 20)
