Changes from all commits
133 commits
c47d079
initial set of changes for local[4] in core
Oct 30, 2016
90d3b91
[SPARK-18103][SQL] Rename *FileCatalog to *FileIndex
ericl Oct 30, 2016
522359b
added mllib changes to local[4]
Oct 30, 2016
8ae2da0
[SPARK-18106][SQL] ANALYZE TABLE should raise a ParseException for in…
dongjoon-hyun Oct 30, 2016
2881a2d
[SPARK-17919] Make timeout to RBackend configurable in SparkR
falaki Oct 30, 2016
b6879b8
[SPARK-16137][SPARKR] randomForest for R
felixcheung Oct 30, 2016
7c37869
[SPARK-18110][PYTHON][ML] add missing parameter in Python for RandomF…
felixcheung Oct 30, 2016
d2923f1
[SPARK-18143][SQL] Ignore Structured Streaming event logs to avoid br…
zsxwing Oct 31, 2016
26b07f1
[BUILD] Close stale Pull Requests.
srowen Oct 31, 2016
8bfc3b7
[SPARK-17972][SQL] Add Dataset.checkpoint() to truncate large query p…
liancheng Oct 31, 2016
de3f87f
[SPARK-18030][TESTS] Fix flaky FileStreamSourceSuite by not deleting …
zsxwing Oct 31, 2016
6633b97
[SPARK-18167][SQL] Also log all partitions when the SQLQuerySuite tes…
ericl Oct 31, 2016
efc254a
[SPARK-18087][SQL] Optimize insert to not require REPAIR TABLE
ericl Nov 1, 2016
7d6c871
[SPARK-18167][SQL] Retry when the SQLQuerySuite test flakes
ericl Nov 1, 2016
d9d1465
[SPARK-18024][SQL] Introduce an internal commit protocol API
rxin Nov 1, 2016
dd85eb5
[SPARK-18107][SQL] Insert overwrite statement runs much slower in spa…
viirya Nov 1, 2016
623fc7f
[MINOR][DOC] Remove spaces following slashs
dongjoon-hyun Nov 1, 2016
cb80edc
[SPARK-18111][SQL] Wrong ApproximatePercentile answer when multiple r…
Nov 1, 2016
e34b4e1
[SPARK-15994][MESOS] Allow enabling Mesos fetch cache in coarse execu…
drcrallen Nov 1, 2016
ec6f479
[SPARK-16881][MESOS] Migrate Mesos configs to use ConfigEntry
techaddict Nov 1, 2016
9b377aa
[SPARK-18114][MESOS] Fix mesos cluster scheduler generage command opt…
Nov 1, 2016
f7c145d
[SPARK-17996][SQL] Fix unqualified catalog.getFunction(...)
hvanhovell Nov 1, 2016
5441a62
[SPARK-16839][SQL] redundant aliases after cleanupAliases
Nov 1, 2016
0cba535
Revert "[SPARK-16839][SQL] redundant aliases after cleanupAliases"
hvanhovell Nov 1, 2016
8ac0910
[SPARK-17848][ML] Move LabelCol datatype cast into Predictor.fit
zhengruifeng Nov 1, 2016
8cdf143
[SPARK-18103][FOLLOW-UP][SQL][MINOR] Rename `MetadataLogFileCatalog` …
lw-lin Nov 1, 2016
8a538c9
[SPARK-18189][SQL] Fix serialization issue in KeyValueGroupedDataset
seyfe Nov 1, 2016
d0272b4
[SPARK-18148][SQL] Misleading Error Message for Aggregation Without W…
jiangxb1987 Nov 1, 2016
cfac17e
[SPARK-18167] Disable flaky SQLQuerySuite test
ericl Nov 1, 2016
01dd008
[SPARK-17764][SQL] Add `to_json` supporting to convert nested struct …
HyukjinKwon Nov 1, 2016
6e62981
[SPARK-17350][SQL] Disable default use of KryoSerializer in Thrift Se…
JoshRosen Nov 1, 2016
b929537
[SPARK-18182] Expose ReplayListenerBus.read() overload which takes st…
JoshRosen Nov 1, 2016
91c33a0
[SPARK-18088][ML] Various ChiSqSelector cleanups
jkbradley Nov 2, 2016
77a9816
[SPARK-18025] Use commit protocol API in structured streaming
rxin Nov 2, 2016
ad4832a
[SPARK-18216][SQL] Make Column.expr public
rxin Nov 2, 2016
1ecfafa
[SPARK-17838][SPARKR] Check named arguments for options and use forma…
HyukjinKwon Nov 2, 2016
1bbf9ff
[SPARK-17992][SQL] Return all partitions from HiveShim when Hive thro…
Nov 2, 2016
620da3b
[SPARK-17475][STREAMING] Delete CRC files if the filesystem doesn't u…
frreiss Nov 2, 2016
abefe2e
[SPARK-18183][SPARK-18184] Fix INSERT [INTO|OVERWRITE] TABLE ... PART…
ericl Nov 2, 2016
a36653c
[SPARK-18192] Support all file formats in structured streaming
rxin Nov 2, 2016
85c5424
[SPARK-18144][SQL] logging StreamingQueryListener$QueryStartedEvent
CodingCat Nov 2, 2016
2dc0480
[SPARK-17532] Add lock debugging info to thread dumps.
rdblue Nov 2, 2016
bcbe444
[MINOR] Use <= for clarity in Pi examples' Monte Carlo process
mrydzy Nov 2, 2016
98ede49
[SPARK-18198][DOC][STREAMING] Highlight code snippets
lw-lin Nov 2, 2016
70a5db7
[SPARK-18204][WEBUI] Remove SparkUI.appUIAddress
jaceklaskowski Nov 2, 2016
9c8deef
[SPARK-18076][CORE][SQL] Fix default Locale used in DateFormat, Numbe…
srowen Nov 2, 2016
f151bd1
[SPARK-16839][SQL] Simplify Struct creation code path
Nov 2, 2016
4af0ce2
[SPARK-17683][SQL] Support ArrayType in Literal.apply
maropu Nov 2, 2016
742e0fe
[SPARK-17895] Improve doc for rangeBetween and rowsBetween
david-weiluo-ren Nov 2, 2016
02f2031
[SPARK-14393][SQL] values generated by non-deterministic functions sh…
mengxr Nov 2, 2016
3c24299
[SPARK-18160][CORE][YARN] spark.files & spark.jars should not be pass…
zjffdu Nov 2, 2016
37d9522
[SPARK-17058][BUILD] Add maven snapshots-and-staging profile to build…
steveloughran Nov 2, 2016
fd90541
[SPARK-18214][SQL] Simplify RuntimeReplaceable type coercion
rxin Nov 2, 2016
3a1bc6f
[SPARK-17470][SQL] unify path for data source table and locationUri f…
cloud-fan Nov 3, 2016
7eb2ca8
[SPARK-17963][SQL][DOCUMENTATION] Add examples (extend) in each expre…
HyukjinKwon Nov 3, 2016
9ddec86
[SPARK-18175][SQL] Improve the test case coverage of implicit type ca…
gatorsmile Nov 3, 2016
d24e736
[SPARK-18200][GRAPHX] Support zero as an initial capacity in OpenHashSet
dongjoon-hyun Nov 3, 2016
96cc1b5
[SPARK-17122][SQL] support drop current database
adrian-wang Nov 3, 2016
937af59
[SPARK-18219] Move commit protocol API (internal) from sql/core to co…
rxin Nov 3, 2016
0ea5d5b
[SQL] minor - internal doc improvement for InsertIntoTable.
rxin Nov 3, 2016
9dc9f9a
[SPARK-18177][ML][PYSPARK] Add missing 'subsamplingRate' of pyspark G…
zhengruifeng Nov 3, 2016
66a99f4
[SPARK-17981][SPARK-17957][SQL] Fix Incorrect Nullability Setting to …
gatorsmile Nov 3, 2016
27daf6b
[SPARK-17949][SQL] A JVM object based aggregate operator
liancheng Nov 3, 2016
b17057c
[SPARK-18244][SQL] Rename partitionProviderIsHive -> tracksPartitions…
rxin Nov 3, 2016
1629331
[SPARK-18237][HIVE] hive.exec.stagingdir have no effect
Nov 3, 2016
098e4ca
[SPARK-18099][YARN] Fail if same files added to distributed cache for…
kishorvpatil Nov 3, 2016
cf36e3a
added local[4] to repl,sparksql,streaming, all tests pass
Nov 3, 2016
67659c9
[SPARK-18212][SS][KAFKA] increase executor poll timeout
koeninger Nov 3, 2016
e892025
[SPARKR][TEST] remove unnecessary suppressWarnings
wangmiao1981 Nov 3, 2016
f22954a
[SPARK-18257][SS] Improve error reporting for FileStressSuite
rxin Nov 3, 2016
dc4c600
[SPARK-18138][DOCS] Document that Java 7, Python 2.6, Scala 2.10, Had…
srowen Nov 4, 2016
aa412c5
[SPARK-18259][SQL] Do not capture Throwable in QueryExecution
hvanhovell Nov 4, 2016
a08463b
[SPARK-14393][SQL][DOC] update doc for python and R
felixcheung Nov 4, 2016
27602c3
[SPARK-18200][GRAPHX][FOLLOW-UP] Support zero as an initial capacity …
dongjoon-hyun Nov 4, 2016
14f235d
Closing some stale/invalid pull requests
rxin Nov 4, 2016
a42d738
[SPARK-18197][CORE] Optimise AppendOnlyMap implementation
a-roberts Nov 4, 2016
550cd56
[SPARK-17337][SQL] Do not pushdown predicates through filters with p…
hvanhovell Nov 4, 2016
4cee2ce
[SPARK-18167] Re-enable the non-flaky parts of SQLQuerySuite
ericl Nov 4, 2016
0e3312e
[SPARK-18256] Improve the performance of event log replay in HistoryS…
JoshRosen Nov 5, 2016
0f7c9e8
[SPARK-18189] [SQL] [Followup] Move test from ReplSuite to prevent ja…
rxin Nov 5, 2016
8a9ca19
[SPARK-17710][FOLLOW UP] Add comments to state why 'Utils.classForNam…
weiqingy Nov 5, 2016
6e27018
[SPARK-18260] Make from_json null safe
brkyvz Nov 5, 2016
95ec4e2
[SPARK-17183][SPARK-17983][SPARK-18101][SQL] put hive serde table sch…
cloud-fan Nov 5, 2016
e2648d3
[SPARK-18287][SQL] Move hash expressions from misc.scala into hash.scala
rxin Nov 5, 2016
a87471c
[SPARK-18192][MINOR][FOLLOWUP] Missed json test in FileStreamSinkSuite
HyukjinKwon Nov 5, 2016
fb0d608
[SPARK-17849][SQL] Fix NPE problem when using grouping sets
Nov 5, 2016
9a87c31
[SPARK-17964][SPARKR] Enable SparkR with Mesos client mode and cluste…
susanxhuynh Nov 5, 2016
15d3926
[MINOR][DOCUMENTATION] Fix some minor descriptions in functions consi…
HyukjinKwon Nov 6, 2016
23ce0d1
[SPARK-18276][ML] ML models should copy the training summary and set …
sethah Nov 6, 2016
340f09d
[SPARK-17854][SQL] rand/randn allows null/long as input seed
HyukjinKwon Nov 6, 2016
b89d055
[SPARK-18210][ML] Pipeline.copy does not create an instance with the …
wojtek-szymanski Nov 6, 2016
556a3b7
[SPARK-18269][SQL] CSV datasource should read null properly when sche…
HyukjinKwon Nov 7, 2016
46b2e49
[SPARK-18173][SQL] data source tables should support truncating parti…
cloud-fan Nov 7, 2016
07ac3f0
[SPARK-18167][SQL] Disable flaky hive partition pruning test.
rxin Nov 7, 2016
9db06c4
[SPARK-18296][SQL] Use consistent naming for expression test suites
rxin Nov 7, 2016
57626a5
[SPARK-16904][SQL] Removal of Hive Built-in Hash Functions and TestHi…
gatorsmile Nov 7, 2016
a814eea
[SPARK-18125][SQL] Fix a compilation error in codegen due to splitExp…
viirya Nov 7, 2016
daa975f
[SPARK-18291][SPARKR][ML] SparkR glm predict should output original l…
yanboliang Nov 7, 2016
b06c23d
[SPARK-18283][STRUCTURED STREAMING][KAFKA] Added test to check whethe…
tdas Nov 7, 2016
0d95662
[SPARK-17108][SQL] Fix BIGINT and INT comparison failure in spark sql
weiqingy Nov 7, 2016
8f0ea01
[SPARK-14914][CORE] Fix Resource not closed after using, mostly for u…
HyukjinKwon Nov 7, 2016
19cf208
[SPARK-17490][SQL] Optimize SerializeFromObject() for a primitive array
kiszk Nov 7, 2016
3a710b9
[SPARK-18236] Reduce duplicate objects in Spark UI and HistoryServer
JoshRosen Nov 8, 2016
3eda057
[SPARK-18295][SQL] Make to_json function null safe (matching it to fr…
HyukjinKwon Nov 8, 2016
9b0593d
[SPARK-18086] Add support for Hive session vars.
rdblue Nov 8, 2016
c1a0c66
[SPARK-18261][STRUCTURED STREAMING] Add statistics to MemorySink for …
lw-lin Nov 8, 2016
1da64e1
[SPARK-18217][SQL] Disallow creating permanent views based on tempora…
gatorsmile Nov 8, 2016
6f36971
[SPARK-16575][CORE] partition calculation mismatch with sc.binaryFiles
fidato13 Nov 8, 2016
47731e1
[SPARK-18207][SQL] Fix a compilation error due to HashExpression.doGe…
kiszk Nov 8, 2016
c291bd2
[SPARK-18137][SQL] Fix RewriteDistinctAggregates UnresolvedException …
Nov 8, 2016
ee2e741
[SPARK-13770][DOCUMENTATION][ML] Document the ML feature Interaction
Nov 8, 2016
b1033fb
[MINOR][DOC] Unify example marks
zhengruifeng Nov 8, 2016
344dcad
[SPARK-17868][SQL] Do not use bitmasks during parsing and analysis of…
jiangxb1987 Nov 8, 2016
73feaa3
[SPARK-18346][SQL] TRUNCATE TABLE should fail if no partition is matc…
cloud-fan Nov 8, 2016
9c41969
[SPARK-18191][CORE] Port RDD API to use commit protocol
jiangxb1987 Nov 8, 2016
245e5a2
[SPARK-18357] Fix yarn files/archive broken issue andd unit tests
kishorvpatil Nov 8, 2016
26e1c53
[SPARK-17748][ML] Minor cleanups to one-pass linear regression with e…
jkbradley Nov 8, 2016
b6de0c9
[SPARK-18280][CORE] Fix potential deadlock in `StandaloneSchedulerBac…
zsxwing Nov 8, 2016
6f7ecb0
[SPARK-18342] Make rename failures fatal in HDFSBackedStateStore
brkyvz Nov 8, 2016
55964c1
[SPARK-18239][SPARKR] Gradient Boosted Tree for R
felixcheung Nov 9, 2016
4afa39e
[SPARK-18333][SQL] Revert hacks in parquet and orc reader to support …
ericl Nov 9, 2016
b9192bb
[SPARK-18368] Fix regexp_replace with task serialization.
rdblue Nov 9, 2016
e256392
[SPARK-17659][SQL] Partitioned View is Not Supported By SHOW CREATE T…
gatorsmile Nov 9, 2016
02c5325
[SPARK-18292][SQL] LogicalPlanToSQLSuite should not use resource depe…
dongjoon-hyun Nov 9, 2016
205e6d5
[SPARK-18338][SQL][TEST-MAVEN] Fix test case initialization order und…
liancheng Nov 9, 2016
06a13ec
[SPARK-16808][CORE] History Server main page does not honor APPLICATI…
vijoshi Nov 9, 2016
4763661
Revert "[SPARK-18368] Fix regexp_replace with task serialization."
yhuai Nov 9, 2016
d4028de
[SPARK-18368][SQL] Fix regexp replace when serialized
rdblue Nov 9, 2016
ec0b2a8
added streaming, repl to unit test runs, changed to local[4]
Nov 9, 2016
d8b81f7
[SPARK-18370][SQL] Add table information to InsertIntoHadoopFsRelatio…
hvanhovell Nov 9, 2016
64fbdf1
[SPARK-18191][CORE][FOLLOWUP] Call `setConf` if `OutputFormat` is `Co…
jiangxb1987 Nov 9, 2016
3f62e1b
[SPARK-17829][SQL] Stable format for offset log
Nov 9, 2016
39912d9
fixed merge conflicts
Nov 10, 2016
14 changes: 12 additions & 2 deletions R/pkg/NAMESPACE
@@ -44,7 +44,9 @@ exportMethods("glm",
"spark.gaussianMixture",
"spark.als",
"spark.kstest",
"spark.logit")
"spark.logit",
"spark.randomForest",
"spark.gbt")

# Job group lifecycle management methods
export("setJobGroup",
@@ -350,7 +352,11 @@ export("as.DataFrame",
"uncacheTable",
"print.summary.GeneralizedLinearRegressionModel",
"read.ml",
"print.summary.KSTest")
"print.summary.KSTest",
"print.summary.RandomForestRegressionModel",
"print.summary.RandomForestClassificationModel",
"print.summary.GBTRegressionModel",
"print.summary.GBTClassificationModel")

export("structField",
"structField.jobj",
@@ -375,6 +381,10 @@ S3method(print, structField)
S3method(print, structType)
S3method(print, summary.GeneralizedLinearRegressionModel)
S3method(print, summary.KSTest)
S3method(print, summary.RandomForestRegressionModel)
S3method(print, summary.RandomForestClassificationModel)
S3method(print, summary.GBTRegressionModel)
S3method(print, summary.GBTClassificationModel)
S3method(structField, character)
S3method(structField, jobj)
S3method(structType, jobj)
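
For context, the new exports above back the spark.randomForest and spark.gbt wrappers added by SPARK-16137 and SPARK-18239. A minimal usage sketch follows; it is illustrative only, modelled on the SparkR examples rather than taken from this diff:

# Illustrative sketch: fit, summarize, and predict with the newly exported wrappers.
df <- createDataFrame(longley)
rfModel <- spark.randomForest(df, Employed ~ ., type = "regression", maxDepth = 5, maxBins = 16)
summary(rfModel)            # printed via print.summary.RandomForestRegressionModel
head(predict(rfModel, df))

gbtModel <- spark.gbt(df, Employed ~ ., type = "regression", maxDepth = 5, maxBins = 16)
summary(gbtModel)           # printed via print.summary.GBTRegressionModel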
10 changes: 5 additions & 5 deletions R/pkg/R/DataFrame.R
@@ -788,7 +788,7 @@ setMethod("write.json",
function(x, path, mode = "error", ...) {
write <- callJMethod(x@sdf, "write")
write <- setWriteOptions(write, mode = mode, ...)
invisible(callJMethod(write, "json", path))
invisible(handledCallJMethod(write, "json", path))
})

#' Save the contents of SparkDataFrame as an ORC file, preserving the schema.
@@ -819,7 +819,7 @@ setMethod("write.orc",
function(x, path, mode = "error", ...) {
write <- callJMethod(x@sdf, "write")
write <- setWriteOptions(write, mode = mode, ...)
invisible(callJMethod(write, "orc", path))
invisible(handledCallJMethod(write, "orc", path))
})

#' Save the contents of SparkDataFrame as a Parquet file, preserving the schema.
@@ -851,7 +851,7 @@ setMethod("write.parquet",
function(x, path, mode = "error", ...) {
write <- callJMethod(x@sdf, "write")
write <- setWriteOptions(write, mode = mode, ...)
invisible(callJMethod(write, "parquet", path))
invisible(handledCallJMethod(write, "parquet", path))
})

#' @rdname write.parquet
@@ -895,7 +895,7 @@ setMethod("write.text",
function(x, path, mode = "error", ...) {
write <- callJMethod(x@sdf, "write")
write <- setWriteOptions(write, mode = mode, ...)
invisible(callJMethod(write, "text", path))
invisible(handledCallJMethod(write, "text", path))
})

#' Distinct
@@ -3342,7 +3342,7 @@ setMethod("write.jdbc",
jprops <- varargsToJProperties(...)
write <- callJMethod(x@sdf, "write")
write <- callJMethod(write, "mode", jmode)
invisible(callJMethod(write, "jdbc", url, tableName, jprops))
invisible(handledCallJMethod(write, "jdbc", url, tableName, jprops))
})

#' randomSplit
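
The only change in this file is swapping callJMethod for handledCallJMethod on the write paths, so JVM-side failures reach the R user as readable error messages instead of raw invokeJava failures. The public writer API is untouched; a hedged sketch of the unchanged call pattern (path and error text are illustrative):

# Illustrative sketch: the writer API is unchanged; only error reporting differs.
df <- createDataFrame(mtcars)
write.json(df, "/tmp/cars_json")              # first write succeeds
tryCatch(write.json(df, "/tmp/cars_json"),    # default mode = "error": path already exists
         error = function(e) message(conditionMessage(e)))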
17 changes: 9 additions & 8 deletions R/pkg/R/SQLContext.R
@@ -350,7 +350,7 @@ read.json.default <- function(path, ...) {
paths <- as.list(suppressWarnings(normalizePath(path)))
read <- callJMethod(sparkSession, "read")
read <- callJMethod(read, "options", options)
sdf <- callJMethod(read, "json", paths)
sdf <- handledCallJMethod(read, "json", paths)
dataFrame(sdf)
}

@@ -422,7 +422,7 @@ read.orc <- function(path, ...) {
path <- suppressWarnings(normalizePath(path))
read <- callJMethod(sparkSession, "read")
read <- callJMethod(read, "options", options)
sdf <- callJMethod(read, "orc", path)
sdf <- handledCallJMethod(read, "orc", path)
dataFrame(sdf)
}

@@ -444,7 +444,7 @@ read.parquet.default <- function(path, ...) {
paths <- as.list(suppressWarnings(normalizePath(path)))
read <- callJMethod(sparkSession, "read")
read <- callJMethod(read, "options", options)
sdf <- callJMethod(read, "parquet", paths)
sdf <- handledCallJMethod(read, "parquet", paths)
dataFrame(sdf)
}

@@ -496,7 +496,7 @@ read.text.default <- function(path, ...) {
paths <- as.list(suppressWarnings(normalizePath(path)))
read <- callJMethod(sparkSession, "read")
read <- callJMethod(read, "options", options)
sdf <- callJMethod(read, "text", paths)
sdf <- handledCallJMethod(read, "text", paths)
dataFrame(sdf)
}

@@ -914,12 +914,13 @@ read.jdbc <- function(url, tableName,
} else {
numPartitions <- numToInt(numPartitions)
}
sdf <- callJMethod(read, "jdbc", url, tableName, as.character(partitionColumn),
numToInt(lowerBound), numToInt(upperBound), numPartitions, jprops)
sdf <- handledCallJMethod(read, "jdbc", url, tableName, as.character(partitionColumn),
numToInt(lowerBound), numToInt(upperBound), numPartitions, jprops)
} else if (length(predicates) > 0) {
sdf <- callJMethod(read, "jdbc", url, tableName, as.list(as.character(predicates)), jprops)
sdf <- handledCallJMethod(read, "jdbc", url, tableName, as.list(as.character(predicates)),
jprops)
} else {
sdf <- callJMethod(read, "jdbc", url, tableName, jprops)
sdf <- handledCallJMethod(read, "jdbc", url, tableName, jprops)
}
dataFrame(sdf)
}
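
All three read.jdbc branches above (partitioned read, predicate read, plain read) now go through handledCallJMethod as well. An illustrative call for each branch, with a hypothetical URL, table, and credentials:

# Illustrative sketch only: url, table, and credentials are hypothetical.
url <- "jdbc:postgresql://db.example.com/sales"
byRange <- read.jdbc(url, "orders", partitionColumn = "order_id",
                     lowerBound = 0, upperBound = 1000000, numPartitions = 8,
                     user = "reader", password = "secret")
byPredicate <- read.jdbc(url, "orders", predicates = list("region = 'EU'"),
                         user = "reader", password = "secret")
wholeTable <- read.jdbc(url, "orders", user = "reader", password = "secret")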
20 changes: 17 additions & 3 deletions R/pkg/R/backend.R
@@ -108,13 +108,27 @@ invokeJava <- function(isStatic, objId, methodName, ...) {
conn <- get(".sparkRCon", .sparkREnv)
writeBin(requestMessage, conn)

# TODO: check the status code to output error information
returnStatus <- readInt(conn)
handleErrors(returnStatus, conn)

# Backend will send +1 as keep alive value to prevent various connection timeouts
# on very long running jobs. See spark.r.heartBeatInterval
while (returnStatus == 1) {
returnStatus <- readInt(conn)
handleErrors(returnStatus, conn)
}

readObject(conn)
}

# Helper function to check for returned errors and print appropriate error message to user
handleErrors <- function(returnStatus, conn) {
if (length(returnStatus) == 0) {
stop("No status is returned. Java SparkR backend might have failed.")
}
if (returnStatus != 0) {

# 0 is success and +1 is reserved for heartbeats. Other negative values indicate errors.
if (returnStatus < 0) {
stop(readString(conn))
}
readObject(conn)
}
2 changes: 1 addition & 1 deletion R/pkg/R/client.R
@@ -19,7 +19,7 @@

# Creates a SparkR client connection object
# if one doesn't already exist
connectBackend <- function(hostname, port, timeout = 6000) {
connectBackend <- function(hostname, port, timeout) {
if (exists(".sparkRcon", envir = .sparkREnv)) {
if (isOpen(.sparkREnv[[".sparkRCon"]])) {
cat("SparkRBackend client connection already exists\n")
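
Taken together, the backend.R and client.R changes make the connection timeout a caller-supplied value and add the +1 heartbeat handling, both driven by configuration introduced in SPARK-17919. A hedged sketch of setting those options; key names and defaults are as documented for SPARK-17919 and should be verified against the Spark configuration docs:

# Illustrative sketch: values shown are the documented defaults (in seconds).
sparkR.session(sparkConfig = list(
  spark.r.backendConnectionTimeout = "6000",  # idle-connection timeout used by connectBackend
  spark.r.heartBeatInterval = "100"           # interval between the +1 keep-alive beats above
))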
24 changes: 14 additions & 10 deletions R/pkg/R/functions.R
@@ -1485,7 +1485,7 @@ setMethod("soundex",

#' Return the partition ID as a column
#'
#' Return the partition ID of the Spark task as a SparkDataFrame column.
#' Return the partition ID as a SparkDataFrame column.
#' Note that this is nondeterministic because it depends on data partitioning and
#' task scheduling.
#'
@@ -2317,7 +2317,8 @@ setMethod("date_format", signature(y = "Column", x = "character"),

#' from_utc_timestamp
#'
#' Assumes given timestamp is UTC and converts to given timezone.
#' Given a timestamp, which corresponds to a certain time of day in UTC, returns another timestamp
#' that corresponds to the same time of day in the given timezone.
#'
#' @param y Column to compute on.
#' @param x time zone to use.
@@ -2340,7 +2341,7 @@ setMethod("from_utc_timestamp", signature(y = "Column", x = "character"),
#' Locate the position of the first occurrence of substr column in the given string.
#' Returns null if either of the arguments are null.
#'
#' NOTE: The position is not zero based, but 1 based index, returns 0 if substr
#' NOTE: The position is not zero based, but 1 based index. Returns 0 if substr
#' could not be found in str.
#'
#' @param y column to check
@@ -2391,7 +2392,8 @@ setMethod("next_day", signature(y = "Column", x = "character"),

#' to_utc_timestamp
#'
#' Assumes given timestamp is in given timezone and converts to UTC.
#' Given a timestamp, which corresponds to a certain time of day in the given timezone, returns
#' another timestamp that corresponds to the same time of day in UTC.
#'
#' @param y Column to compute on
#' @param x timezone to use
@@ -2539,7 +2541,7 @@ setMethod("shiftLeft", signature(y = "Column", x = "numeric"),

#' shiftRight
#'
#' Shift the given value numBits right. If the given value is a long value, it will return
#' (Signed) shift the given value numBits right. If the given value is a long value, it will return
#' a long value else it will return an integer value.
#'
#' @param y column to compute on.
@@ -2777,7 +2779,7 @@ setMethod("window", signature(x = "Column"),
#' locate
#'
#' Locate the position of the first occurrence of substr.
#' NOTE: The position is not zero based, but 1 based index, returns 0 if substr
#' NOTE: The position is not zero based, but 1 based index. Returns 0 if substr
#' could not be found in str.
#'
#' @param substr a character string to be matched.
@@ -2823,7 +2825,8 @@ setMethod("lpad", signature(x = "Column", len = "numeric", pad = "character"),

#' rand
#'
#' Generate a random column with i.i.d. samples from U[0.0, 1.0].
#' Generate a random column with independent and identically distributed (i.i.d.) samples
#' from U[0.0, 1.0].
#'
#' @param seed a random seed. Can be missing.
#' @family normal_funcs
@@ -2852,7 +2855,8 @@ setMethod("rand", signature(seed = "numeric"),

#' randn
#'
#' Generate a column with i.i.d. samples from the standard normal distribution.
#' Generate a column with independent and identically distributed (i.i.d.) samples from
#' the standard normal distribution.
#'
#' @param seed a random seed. Can be missing.
#' @family normal_funcs
@@ -3442,8 +3446,8 @@ setMethod("size",

#' sort_array
#'
#' Sorts the input array for the given column in ascending order,
#' according to the natural ordering of the array elements.
#' Sorts the input array in ascending or descending order according
#' to the natural ordering of the array elements.
#'
#' @param x A Column to sort
#' @param asc A logical flag indicating the sorting order.
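
The doc rewrites above mostly tighten wording; from_utc_timestamp and to_utc_timestamp get the largest clarification. A small illustrative example of the clarified semantics (how the result is rendered depends on the session time zone):

# Illustrative sketch: same instant, re-expressed as wall-clock time in another zone.
df <- createDataFrame(data.frame(t = as.POSIXct("2016-11-09 12:00:00", tz = "UTC")))
head(select(df,
            from_utc_timestamp(df$t, "America/Los_Angeles"),  # treat t as UTC, show LA wall-clock time
            to_utc_timestamp(df$t, "America/Los_Angeles")))   # treat t as LA wall-clock time, show UTC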
70 changes: 40 additions & 30 deletions R/pkg/R/generics.R
@@ -1310,9 +1310,11 @@ setGeneric("window", function(x, ...) { standardGeneric("window") })
#' @export
setGeneric("year", function(x) { standardGeneric("year") })

#' @rdname spark.glm
###################### Spark.ML Methods ##########################

#' @rdname fitted
#' @export
setGeneric("spark.glm", function(data, formula, ...) { standardGeneric("spark.glm") })
setGeneric("fitted")

#' @param x,y For \code{glm}: logical values indicating whether the response vector
#' and model matrix used in the fitting process should be returned as
@@ -1332,13 +1334,42 @@ setGeneric("predict", function(object, ...) { standardGeneric("predict") })
#' @export
setGeneric("rbind", signature = "...")

#' @rdname spark.als
#' @export
setGeneric("spark.als", function(data, ...) { standardGeneric("spark.als") })

#' @rdname spark.gaussianMixture
#' @export
setGeneric("spark.gaussianMixture",
function(data, formula, ...) { standardGeneric("spark.gaussianMixture") })

#' @rdname spark.gbt
#' @export
setGeneric("spark.gbt", function(data, formula, ...) { standardGeneric("spark.gbt") })

#' @rdname spark.glm
#' @export
setGeneric("spark.glm", function(data, formula, ...) { standardGeneric("spark.glm") })

#' @rdname spark.isoreg
#' @export
setGeneric("spark.isoreg", function(data, formula, ...) { standardGeneric("spark.isoreg") })

#' @rdname spark.kmeans
#' @export
setGeneric("spark.kmeans", function(data, formula, ...) { standardGeneric("spark.kmeans") })

#' @rdname fitted
#' @rdname spark.kstest
#' @export
setGeneric("fitted")
setGeneric("spark.kstest", function(data, ...) { standardGeneric("spark.kstest") })

#' @rdname spark.lda
#' @export
setGeneric("spark.lda", function(data, ...) { standardGeneric("spark.lda") })

#' @rdname spark.logit
#' @export
setGeneric("spark.logit", function(data, formula, ...) { standardGeneric("spark.logit") })

#' @rdname spark.mlp
#' @export
@@ -1348,13 +1379,14 @@ setGeneric("spark.mlp", function(data, ...) { standardGeneric("spark.mlp") })
#' @export
setGeneric("spark.naiveBayes", function(data, formula, ...) { standardGeneric("spark.naiveBayes") })

#' @rdname spark.survreg
#' @rdname spark.randomForest
#' @export
setGeneric("spark.survreg", function(data, formula) { standardGeneric("spark.survreg") })
setGeneric("spark.randomForest",
function(data, formula, ...) { standardGeneric("spark.randomForest") })

#' @rdname spark.lda
#' @rdname spark.survreg
#' @export
setGeneric("spark.lda", function(data, ...) { standardGeneric("spark.lda") })
setGeneric("spark.survreg", function(data, formula) { standardGeneric("spark.survreg") })

#' @rdname spark.lda
#' @export
@@ -1364,32 +1396,10 @@ setGeneric("spark.posterior", function(object, newData) { standardGeneric("spark
#' @export
setGeneric("spark.perplexity", function(object, data) { standardGeneric("spark.perplexity") })

#' @rdname spark.isoreg
#' @export
setGeneric("spark.isoreg", function(data, formula, ...) { standardGeneric("spark.isoreg") })

#' @rdname spark.gaussianMixture
#' @export
setGeneric("spark.gaussianMixture",
function(data, formula, ...) {
standardGeneric("spark.gaussianMixture")
})

#' @rdname spark.logit
#' @export
setGeneric("spark.logit", function(data, formula, ...) { standardGeneric("spark.logit") })

#' @param object a fitted ML model object.
#' @param path the directory where the model is saved.
#' @param ... additional argument(s) passed to the method.
#' @rdname write.ml
#' @export
setGeneric("write.ml", function(object, path, ...) { standardGeneric("write.ml") })

#' @rdname spark.als
#' @export
setGeneric("spark.als", function(data, ...) { standardGeneric("spark.als") })

#' @rdname spark.kstest
#' @export
setGeneric("spark.kstest", function(data, ...) { standardGeneric("spark.kstest") })