Closed
Changes from all commits
107 commits
ecf5605
[SPARK-20354][CORE][REST-API] When I request access to the 'http: //i…
Apr 18, 2017
7dbc0a9
[SPARK-20360][PYTHON] reprs for interpreters
rgbkrk Apr 18, 2017
6a25d39
[SPARK-20377][SS] Fix JavaStructuredSessionization example
tdas Apr 18, 2017
a33d448
[SPARK-20254][SQL] Remove unnecessary data conversion for Dataset wit…
kiszk Apr 19, 2017
ef6923f
[SPARK-20208][R][DOCS] Document R fpGrowth support
zero323 Apr 19, 2017
274a3e2
[SPARK-20359][SQL] Avoid unnecessary execution in EliminateOuterJoin …
koertkuipers Apr 19, 2017
a87e21d
[SPARK-20356][SQL] Pruned InMemoryTableScanExec should have correct o…
viirya Apr 19, 2017
8baa970
[SPARK-20343][BUILD] Avoid Unidoc build only if Hadoop 2.6 is explici…
HyukjinKwon Apr 19, 2017
80a60da
[SPARK-20036][DOC] Note incompatible dependencies on org.apache.kafka…
koeninger Apr 19, 2017
d649787
[SPARK-20397][SPARKR][SS] Fix flaky test: test_streaming.R.Terminated…
zsxwing Apr 19, 2017
371af96
[SPARK-20350] Add optimization rules to apply Complementation Laws.
ptkool Apr 20, 2017
af9f18c
[MINOR][SS] Fix a missing space in UnsupportedOperationChecker error …
zsxwing Apr 20, 2017
e6bbdb0
[SPARK-20398][SQL] range() operator should include cancellation reaso…
ericl Apr 20, 2017
8d658b9
Fixed typos in docs
Apr 20, 2017
d01122d
[SPARK-20156][SQL][FOLLOW-UP] Java String toLowerCase "Turkish locale…
gatorsmile Apr 20, 2017
9fd25fb
[SPARK-20405][SQL] Dataset.withNewExecutionId should be private
rxin Apr 20, 2017
9904526
[SPARK-20409][SQL] fail early if aggregate function in GROUP BY
cloud-fan Apr 20, 2017
32c5a10
[SPARK-20407][TESTS] ParquetQuerySuite 'Enabling/disabling ignoreCorr…
bogdanrdc Apr 20, 2017
e929cd7
[SPARK-20358][CORE] Executors failing stage on interrupted exception …
ericl Apr 20, 2017
01f6262
[SPARK-20410][SQL] Make sparkConf a def in SharedSQLContext
hvanhovell Apr 20, 2017
7e9eba0
[SPARK-20172][CORE] Add file permission check when listing files in F…
jerryshao Apr 20, 2017
d17dea8
[SPARK-20367] Properly unescape column names of partitioning columns …
juliuszsompolski Apr 21, 2017
5ce7680
[SPARK-20329][SQL] Make timezone aware expression without timezone un…
hvanhovell Apr 21, 2017
6cd2f16
[SPARK-20281][SQL] Print the identical Range parameters of SparkConte…
maropu Apr 21, 2017
cddb4b7
[SPARK-20420][SQL] Add events to the external catalog
hvanhovell Apr 21, 2017
eb4d097
Small rewording about history server use case
dud225 Apr 21, 2017
aaeca8b
[SPARK-20412] Throw ParseException from visitNonOptionalPartitionSpec…
juliuszsompolski Apr 21, 2017
adaa3f7
[SPARK-20341][SQL] Support BigInt's value that does not fit in long v…
kiszk Apr 21, 2017
ff1f989
[SPARK-20423][ML] fix MLOR coeffs centering when reg == 0
WeichenXu123 Apr 21, 2017
6c2489c
[SPARK-20401][DOC] In the spark official configuration document, the …
Apr 21, 2017
d68e0a3
[SPARK-20386][SPARK CORE] modify the log info if the block exists on …
eatoncys Apr 22, 2017
807c718
[SPARK-20430][SQL] Initialise RangeExec parameters in a driver side
maropu Apr 22, 2017
cad33a7
[SPARK-20385][WEB-UI] Submitted Time' field, the date format needs to…
Apr 23, 2017
2bef01f
[SPARK-20439][SQL] Fix Catalog API listTables and getTable when faile…
gatorsmile Apr 24, 2017
cf16c32
[SPARK-18901][ML] Require in LR LogisticAggregator is redundant
wangmiao1981 Apr 24, 2017
30149d5
[SPARK-20239][CORE] Improve HistoryServer's ACL mechanism
jerryshao Apr 25, 2017
fb59a19
[SPARK-20451] Filter out nested mapType datatypes from sort order in …
sameeragarwal Apr 25, 2017
c18de9c
[SPARK-20455][DOCS] Fix Broken Docker IT Docs
original-brownbear Apr 25, 2017
b62ebd9
[SPARK-20404][CORE] Using Option(name) instead of Some(name)
szhem Apr 25, 2017
e2591c6
[SPARK-18901][FOLLOWUP][ML] Require in LR LogisticAggregator is redun…
wangmiao1981 Apr 25, 2017
55834a8
[SPARK-20449][ML] Upgrade breeze version to 0.13.1
yanboliang Apr 25, 2017
f971ce5
[SPARK-5484][GRAPHX] Periodically do checkpoint in Pregel
Apr 25, 2017
f0de600
[SPARK-18127] Add hooks and extension points to Spark
sameeragarwal Apr 26, 2017
c8803c0
[SPARK-16548][SQL] Inconsistent error handling in JSON parsing SQL fu…
Apr 26, 2017
a2f5ced
[SPARK-20400][DOCS] Remove References to 3rd Party Vendor Tools
Apr 26, 2017
6129522
[SPARK-19812] YARN shuffle service fails to relocate recovery DB acro…
tgravescs Apr 26, 2017
34dec68
[MINOR][ML] Fix some PySpark & SparkR flaky tests
yanboliang Apr 26, 2017
b65858b
[SPARK-20391][CORE] Rename memory related fields in ExecutorSummay
jerryshao Apr 26, 2017
6709bcf
[SPARK-20473] Enabling missing types in ColumnVector.Array
michal-databricks Apr 26, 2017
e278876
[SPARK-20474] Fixing OnHeapColumnVector reallocation
michal-databricks Apr 26, 2017
b48bb3a
[SPARK-12868][SQL] Allow adding jars from hdfs
weiqingy Apr 26, 2017
d6efda5
[SPARK-20435][CORE] More thorough redaction of sensitive information
markgrover Apr 27, 2017
8ccb4a5
Preparing Spark release v2.2.0-rc1
pwendell Apr 27, 2017
75544c0
Preparing development version 2.2.0-SNAPSHOT
pwendell Apr 27, 2017
c86c078
[SPARK-20483] Mesos Coarse mode may starve other Mesos frameworks
dgshep Apr 27, 2017
87d27e5
[SPARK-20421][CORE] Mark internal listeners as deprecated.
Apr 27, 2017
090b337
[SPARK-20482][SQL] Resolving Casts is too strict on having time zone set
rednaxelafx Apr 27, 2017
92b61f0
[SPARK-20487][SQL] `HiveTableScan` node is quite verbose in explained…
tejasapatil Apr 27, 2017
c69d862
[SPARK-20426] Lazy initialization of FileSegmentManagedBuffer for shu…
Apr 27, 2017
c29c6de
[SPARK-20483][MINOR] Test for Mesos Coarse mode may starve other Meso…
dgshep Apr 27, 2017
4512e2a
[SPARK-20047][ML] Constrained Logistic Regression
yanboliang Apr 27, 2017
753e129
[SPARK-20461][CORE][SS] Use UninterruptibleThread for Executor and fi…
zsxwing Apr 27, 2017
3d53d82
[SPARK-20452][SS][KAFKA] Fix a potential ConcurrentModificationExcept…
zsxwing Apr 27, 2017
e02b6eb
[SPARK-12837][CORE] Do not send the name of internal accumulator to e…
cloud-fan Apr 28, 2017
f60ed0c
[SPARKR][DOC] Document LinearSVC in R programming guide
wangmiao1981 Apr 28, 2017
26a9e29
[SPARK-20476][SQL] Block users to create a table that use commas in t…
gatorsmile Apr 28, 2017
af3a141
[SPARK-14471][SQL] Aliases in SELECT could be used in GROUP BY
maropu Apr 28, 2017
ea5b114
[SPARK-20465][CORE] Throws a proper exception when any temp directory…
HyukjinKwon Apr 28, 2017
ec712d7
[SPARK-20496][SS] Bug in KafkaWriter Looks at Unanalyzed Plans
Apr 28, 2017
f66aabd
[SPARK-20514][CORE] Upgrade Jetty to 9.3.11.v20160721
markgrover Apr 28, 2017
5547002
[SPARK-20471] Remove AggregateBenchmark testsuite warning: Two level …
heary-cao Apr 28, 2017
1405862
[SPARK-19525][CORE] Add RDD checkpoint compression support
Apr 28, 2017
ca6c59e
[SPARK-20487][SQL] Display `serde` for `HiveTableScan` node in explai…
tejasapatil Apr 29, 2017
4a86d8d
[SPARK-20477][SPARKR][DOC] Document R bisecting k-means in R programm…
wangmiao1981 Apr 29, 2017
9789d5c
[SPARK-19791][ML] Add doc and example for fpgrowth
YY-OnCall Apr 29, 2017
c5f5593
[SPARK-20521][DOC][CORE] The default of 'spark.worker.cleanup.appData…
Apr 30, 2017
c5beabc
[SPARK-20492][SQL] Do not print empty parentheses for invalid primiti…
HyukjinKwon Apr 30, 2017
994d9da
[MINOR][DOCS][PYTHON] Adding missing boolean type for replacement val…
May 1, 2017
c890e93
[SPARK-20541][SPARKR][SS] support awaitTermination without timeout
felixcheung May 1, 2017
813abd2
[SPARK-20534][SQL] Make outer generate exec return empty rows
hvanhovell May 1, 2017
38edb92
[SPARK-20517][UI] Fix broken history UI download link
jerryshao May 1, 2017
6f0d296
[SPARK-20464][SS] Add a job group and description for streaming queri…
kunalkhamar May 1, 2017
cfa6bcb
[SPARK-20540][CORE] Fix unstable executor requests.
rdblue May 1, 2017
5a0a8b0
[SPARK-20459][SQL] JdbcUtils throws IllegalStateException: Cause alre…
srowen May 2, 2017
b7c1c2f
[SPARK-20192][SPARKR][DOC] SparkR migration guide to 2.2.0
felixcheung May 2, 2017
b146481
[SPARK-20537][CORE] Fixing OffHeapColumnVector reallocation
kiszk May 2, 2017
ef5e2a0
[SPARK-20549] java.io.CharConversionException: Invalid UTF-32' in Jso…
brkyvz May 2, 2017
01f3be7
[SPARK-20300][ML][PYSPARK] Python API for ALSModel.recommendForAllUse…
May 2, 2017
4f4083b
[SPARK-19235][SQL][TEST][FOLLOW-UP] Enable Test Cases in DDLSuite wit…
gatorsmile May 2, 2017
871b073
[SPARK-20421][CORE] Add a missing deprecation tag.
May 2, 2017
c199764
[SPARK-20558][CORE] clear InheritableThreadLocal variables in SparkCo…
cloud-fan May 3, 2017
c80242a
[SPARK-20567] Lazily bind in GenerateExec
marmbrus May 3, 2017
4f647ab
[SPARK-6227][MLLIB][PYSPARK] Implement PySpark wrappers for SVD and P…
MechCoder May 3, 2017
b5947f5
[SPARK-20523][BUILD] Clean up build warnings for 2.2.0 release
srowen May 3, 2017
b1a732f
[SPARK-20441][SPARK-20432][SS] Within the same streaming query, one S…
lw-lin May 3, 2017
f0e80aa
[SPARK-20576][SQL] Support generic hint function in Dataset/DataFrame
rxin May 3, 2017
36d8079
[SPARK-19965][SS] DataFrame batch reader may fail to infer partitions…
lw-lin May 3, 2017
2629e7c
[MINOR][SQL] Fix the test title from =!= to <=>, remove a duplicated …
HyukjinKwon May 3, 2017
1d4017b
Preparing Spark release v2.2.0-rc2
pwendell May 3, 2017
a3a5fcf
Preparing development version 2.2.1-SNAPSHOT
pwendell May 3, 2017
d8bd213
[SPARK-20584][PYSPARK][SQL] Python generic hint support
zero323 May 4, 2017
5fe9313
[SPARK-20544][SPARKR] skip tests when running on CRAN
felixcheung May 4, 2017
6c5c594
[SPARK-20015][SPARKR][SS][DOC][EXAMPLE] Document R Structured Streami…
felixcheung May 4, 2017
3f5c548
[SPARK-20585][SPARKR] R generic hint support
zero323 May 4, 2017
b672779
[SPARK-20571][SPARKR][SS] Flaky Structured Streaming tests
felixcheung May 4, 2017
425ed26
[SPARK-20047][FOLLOWUP][ML] Constrained Logistic Regression follow up
yanboliang May 4, 2017
c875628
[SPARK-20574][ML] Allow Bucketizer to handle non-Double numeric column
May 5, 2017
1 change: 1 addition & 0 deletions LICENSE
@@ -297,3 +297,4 @@ The text of each license is also included at licenses/LICENSE-[project].txt.
(MIT License) RowsGroup (http://datatables.net/license/mit)
(MIT License) jsonFormatter (http://www.jqueryscript.net/other/jQuery-Plugin-For-Pretty-JSON-Formatting-jsonFormatter.html)
(MIT License) modernizr (https://github.com/Modernizr/Modernizr/blob/master/LICENSE)
(MIT License) machinist (https://github.com/typelevel/machinist)
2 changes: 1 addition & 1 deletion R/pkg/DESCRIPTION
@@ -1,6 +1,6 @@
Package: SparkR
Type: Package
Version: 2.2.0
Version: 2.2.1
Title: R Frontend for Apache Spark
Description: The SparkR package provides an R Frontend for Apache Spark.
Authors@R: c(person("Shivaram", "Venkataraman", role = c("aut", "cre"),
1 change: 1 addition & 0 deletions R/pkg/NAMESPACE
@@ -122,6 +122,7 @@ exportMethods("arrange",
"group_by",
"groupBy",
"head",
"hint",
"insertInto",
"intersect",
"isLocal",
30 changes: 30 additions & 0 deletions R/pkg/R/DataFrame.R
@@ -3642,3 +3642,33 @@ df <- callJMethod(x@sdf, "checkpoint", as.logical(eager))
df <- callJMethod(x@sdf, "checkpoint", as.logical(eager))
dataFrame(df)
})

#' hint
#'
#' Specifies an execution plan hint and returns a new SparkDataFrame.
#'
#' @param x a SparkDataFrame.
#' @param name a name of the hint.
#' @param ... optional parameters for the hint.
#' @return A SparkDataFrame.
#' @family SparkDataFrame functions
#' @aliases hint,SparkDataFrame,character-method
#' @rdname hint
#' @name hint
#' @export
#' @examples
#' \dontrun{
#' df <- createDataFrame(mtcars)
#' avg_mpg <- mean(groupBy(createDataFrame(mtcars), "cyl"), "mpg")
#'
#' head(join(df, hint(avg_mpg, "broadcast"), df$cyl == avg_mpg$cyl))
#' }
#' @note hint since 2.2.0
setMethod("hint",
signature(x = "SparkDataFrame", name = "character"),
function(x, name, ...) {
parameters <- list(...)
stopifnot(all(sapply(parameters, is.character)))
jdf <- callJMethod(x@sdf, "hint", name, parameters)
dataFrame(jdf)
})
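
A minimal usage sketch of the new hint method, reusing the broadcast-join example from the roxygen block above (assumes SparkR is attached and a SparkSession has already been started):

# Sketch only: assumes a running SparkR session.
library(SparkR)
sparkR.session()

df <- createDataFrame(mtcars)
avg_mpg <- mean(groupBy(createDataFrame(mtcars), "cyl"), "mpg")

# "broadcast" asks the optimizer to broadcast the small aggregated side of the join.
head(join(df, hint(avg_mpg, "broadcast"), df$cyl == avg_mpg$cyl))

Any extra hint parameters passed through ... must be character values; the stopifnot() check in the method above enforces this.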
6 changes: 5 additions & 1 deletion R/pkg/R/generics.R
@@ -572,6 +572,10 @@ setGeneric("group_by", function(x, ...) { standardGeneric("group_by") })
#' @export
setGeneric("groupBy", function(x, ...) { standardGeneric("groupBy") })

#' @rdname hint
#' @export
setGeneric("hint", function(x, name, ...) { standardGeneric("hint") })

#' @rdname insertInto
#' @export
setGeneric("insertInto", function(x, tableName, ...) { standardGeneric("insertInto") })
@@ -1469,7 +1473,7 @@ setGeneric("write.ml", function(object, path, ...) { standardGeneric("write.ml")

#' @rdname awaitTermination
#' @export
setGeneric("awaitTermination", function(x, timeout) { standardGeneric("awaitTermination") })
setGeneric("awaitTermination", function(x, timeout = NULL) { standardGeneric("awaitTermination") })

#' @rdname isActive
#' @export
14 changes: 10 additions & 4 deletions R/pkg/R/streaming.R
@@ -169,8 +169,10 @@ setMethod("isActive",
#' immediately.
#'
#' @param x a StreamingQuery.
#' @param timeout time to wait in milliseconds
#' @return TRUE if query has terminated within the timeout period.
#' @param timeout time to wait in milliseconds, if omitted, wait indefinitely until \code{stopQuery}
#' is called or an error has occurred.
#' @return TRUE if query has terminated within the timeout period; nothing if timeout is not
#' specified.
#' @rdname awaitTermination
#' @name awaitTermination
#' @aliases awaitTermination,StreamingQuery-method
@@ -182,8 +184,12 @@ setMethod("isActive",
#' @note experimental
setMethod("awaitTermination",
signature(x = "StreamingQuery"),
function(x, timeout) {
handledCallJMethod(x@ssq, "awaitTermination", as.integer(timeout))
function(x, timeout = NULL) {
if (is.null(timeout)) {
invisible(handledCallJMethod(x@ssq, "awaitTermination"))
} else {
handledCallJMethod(x@ssq, "awaitTermination", as.integer(timeout))
}
})

#' stopQuery
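A rough sketch of the two call forms this hunk supports (the query object is assumed to be a StreamingQuery returned by write.stream(); its setup is not shown):

# Sketch only: `q` is assumed to be an active StreamingQuery.
# With a timeout, the call returns TRUE or FALSE depending on whether the
# query terminated within the given number of milliseconds.
terminated <- awaitTermination(q, 10000)

# Without a timeout, the call now blocks (returning invisibly) until
# stopQuery(q) is called or the query fails with an error.
awaitTermination(q)
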
6 changes: 6 additions & 0 deletions R/pkg/inst/tests/testthat/test_Serde.R
@@ -20,6 +20,8 @@ context("SerDe functionality")
sparkSession <- sparkR.session(enableHiveSupport = FALSE)

test_that("SerDe of primitive types", {
skip_on_cran()

x <- callJStatic("SparkRHandler", "echo", 1L)
expect_equal(x, 1L)
expect_equal(class(x), "integer")
@@ -38,6 +40,8 @@ test_that("SerDe of primitive types", {
})

test_that("SerDe of list of primitive types", {
skip_on_cran()

x <- list(1L, 2L, 3L)
y <- callJStatic("SparkRHandler", "echo", x)
expect_equal(x, y)
@@ -65,6 +69,8 @@ test_that("SerDe of list of primitive types", {
})

test_that("SerDe of list of lists", {
skip_on_cran()

x <- list(list(1L, 2L, 3L), list(1, 2, 3),
list(TRUE, FALSE), list("a", "b", "c"))
y <- callJStatic("SparkRHandler", "echo", x)
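The same guard recurs throughout the test files below: testthat's skip_on_cran() turns a test into a skip when the suite runs as part of a CRAN check and leaves it untouched elsewhere. A stripped-down sketch of the pattern, with a hypothetical test name and body:

# Pattern sketch only; the test name and expectation are hypothetical.
test_that("some JVM-backed behaviour", {
  skip_on_cran()  # skipped on CRAN check machines, executed in regular test runs

  expect_equal(callJStatic("SparkRHandler", "echo", 1L), 1L)
})
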
2 changes: 2 additions & 0 deletions R/pkg/inst/tests/testthat/test_Windows.R
@@ -17,6 +17,8 @@
context("Windows-specific tests")

test_that("sparkJars tag in SparkContext", {
skip_on_cran()

if (.Platform$OS.type != "windows") {
skip("This test is only for Windows, skipped")
}
8 changes: 8 additions & 0 deletions R/pkg/inst/tests/testthat/test_binaryFile.R
@@ -24,6 +24,8 @@ sc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext",
mockFile <- c("Spark is pretty.", "Spark is awesome.")

test_that("saveAsObjectFile()/objectFile() following textFile() works", {
skip_on_cran()

fileName1 <- tempfile(pattern = "spark-test", fileext = ".tmp")
fileName2 <- tempfile(pattern = "spark-test", fileext = ".tmp")
writeLines(mockFile, fileName1)
@@ -38,6 +40,8 @@ test_that("saveAsObjectFile()/objectFile() following textFile() works", {
})

test_that("saveAsObjectFile()/objectFile() works on a parallelized list", {
skip_on_cran()

fileName <- tempfile(pattern = "spark-test", fileext = ".tmp")

l <- list(1, 2, 3)
@@ -50,6 +54,8 @@ test_that("saveAsObjectFile()/objectFile() works on a parallelized list", {
})

test_that("saveAsObjectFile()/objectFile() following RDD transformations works", {
skip_on_cran()

fileName1 <- tempfile(pattern = "spark-test", fileext = ".tmp")
fileName2 <- tempfile(pattern = "spark-test", fileext = ".tmp")
writeLines(mockFile, fileName1)
@@ -74,6 +80,8 @@ test_that("saveAsObjectFile()/objectFile() following RDD transformations works",
})

test_that("saveAsObjectFile()/objectFile() works with multiple paths", {
skip_on_cran()

fileName1 <- tempfile(pattern = "spark-test", fileext = ".tmp")
fileName2 <- tempfile(pattern = "spark-test", fileext = ".tmp")

6 changes: 6 additions & 0 deletions R/pkg/inst/tests/testthat/test_binary_function.R
@@ -29,6 +29,8 @@ rdd <- parallelize(sc, nums, 2L)
mockFile <- c("Spark is pretty.", "Spark is awesome.")

test_that("union on two RDDs", {
skip_on_cran()

actual <- collectRDD(unionRDD(rdd, rdd))
expect_equal(actual, as.list(rep(nums, 2)))

@@ -51,6 +53,8 @@ test_that("union on two RDDs", {
})

test_that("cogroup on two RDDs", {
skip_on_cran()

rdd1 <- parallelize(sc, list(list(1, 1), list(2, 4)))
rdd2 <- parallelize(sc, list(list(1, 2), list(1, 3)))
cogroup.rdd <- cogroup(rdd1, rdd2, numPartitions = 2L)
@@ -69,6 +73,8 @@ test_that("cogroup on two RDDs", {
})

test_that("zipPartitions() on RDDs", {
skip_on_cran()

rdd1 <- parallelize(sc, 1:2, 2L) # 1, 2
rdd2 <- parallelize(sc, 1:4, 2L) # 1:2, 3:4
rdd3 <- parallelize(sc, 1:6, 2L) # 1:3, 4:6
4 changes: 4 additions & 0 deletions R/pkg/inst/tests/testthat/test_broadcast.R
@@ -26,6 +26,8 @@ nums <- 1:2
rrdd <- parallelize(sc, nums, 2L)

test_that("using broadcast variable", {
skip_on_cran()

randomMat <- matrix(nrow = 10, ncol = 10, data = rnorm(100))
randomMatBr <- broadcast(sc, randomMat)

@@ -38,6 +40,8 @@ test_that("using broadcast variable", {
})

test_that("without using broadcast variable", {
skip_on_cran()

randomMat <- matrix(nrow = 10, ncol = 10, data = rnorm(100))

useBroadcast <- function(x) {
8 changes: 8 additions & 0 deletions R/pkg/inst/tests/testthat/test_client.R
@@ -18,6 +18,8 @@
context("functions in client.R")

test_that("adding spark-testing-base as a package works", {
skip_on_cran()

args <- generateSparkSubmitArgs("", "", "", "",
"holdenk:spark-testing-base:1.3.0_0.0.5")
expect_equal(gsub("[[:space:]]", "", args),
@@ -26,16 +28,22 @@ test_that("adding spark-testing-base as a package works", {
})

test_that("no package specified doesn't add packages flag", {
skip_on_cran()

args <- generateSparkSubmitArgs("", "", "", "", "")
expect_equal(gsub("[[:space:]]", "", args),
"")
})

test_that("multiple packages don't produce a warning", {
skip_on_cran()

expect_warning(generateSparkSubmitArgs("", "", "", "", c("A", "B")), NA)
})

test_that("sparkJars sparkPackages as character vectors", {
skip_on_cran()

args <- generateSparkSubmitArgs("", "", c("one.jar", "two.jar", "three.jar"), "",
c("com.databricks:spark-avro_2.10:2.0.1"))
expect_match(args, "--jars one.jar,two.jar,three.jar")
16 changes: 16 additions & 0 deletions R/pkg/inst/tests/testthat/test_context.R
@@ -18,6 +18,8 @@
context("test functions in sparkR.R")

test_that("Check masked functions", {
skip_on_cran()

# Check that we are not masking any new function from base, stats, testthat unexpectedly
# NOTE: We should avoid adding entries to *namesOfMaskedCompletely* as masked functions make it
# hard for users to use base R functions. Please check when in doubt.
@@ -55,6 +57,8 @@ test_that("Check masked functions", {
})

test_that("repeatedly starting and stopping SparkR", {
skip_on_cran()

for (i in 1:4) {
sc <- suppressWarnings(sparkR.init())
rdd <- parallelize(sc, 1:20, 2L)
@@ -73,6 +77,8 @@ test_that("repeatedly starting and stopping SparkSession", {
})

test_that("rdd GC across sparkR.stop", {
skip_on_cran()

sc <- sparkR.sparkContext() # sc should get id 0
rdd1 <- parallelize(sc, 1:20, 2L) # rdd1 should get id 1
rdd2 <- parallelize(sc, 1:10, 2L) # rdd2 should get id 2
Expand All @@ -96,6 +102,8 @@ test_that("rdd GC across sparkR.stop", {
})

test_that("job group functions can be called", {
skip_on_cran()

sc <- sparkR.sparkContext()
setJobGroup("groupId", "job description", TRUE)
cancelJobGroup("groupId")
Expand All @@ -108,12 +116,16 @@ test_that("job group functions can be called", {
})

test_that("utility function can be called", {
skip_on_cran()

sparkR.sparkContext()
setLogLevel("ERROR")
sparkR.session.stop()
})

test_that("getClientModeSparkSubmitOpts() returns spark-submit args from whitelist", {
skip_on_cran()

e <- new.env()
e[["spark.driver.memory"]] <- "512m"
ops <- getClientModeSparkSubmitOpts("sparkrmain", e)
@@ -141,6 +153,8 @@ test_that("getClientModeSparkSubmitOpts() returns spark-submit args from whiteli
})

test_that("sparkJars sparkPackages as comma-separated strings", {
skip_on_cran()

expect_warning(processSparkJars(" a, b "))
jars <- suppressWarnings(processSparkJars(" a, b "))
expect_equal(lapply(jars, basename), list("a", "b"))
@@ -168,6 +182,8 @@ test_that("spark.lapply should perform simple transforms", {
})

test_that("add and get file to be downloaded with Spark job on every node", {
skip_on_cran()

sparkR.sparkContext()
# Test add file.
path <- tempfile(pattern = "hello", fileext = ".txt")
4 changes: 4 additions & 0 deletions R/pkg/inst/tests/testthat/test_includePackage.R
@@ -26,6 +26,8 @@ nums <- 1:2
rdd <- parallelize(sc, nums, 2L)

test_that("include inside function", {
skip_on_cran()

# Only run the test if plyr is installed.
if ("plyr" %in% rownames(installed.packages())) {
suppressPackageStartupMessages(library(plyr))
@@ -42,6 +44,8 @@ test_that("include inside function", {
})

test_that("use include package", {
skip_on_cran()

# Only run the test if plyr is installed.
if ("plyr" %in% rownames(installed.packages())) {
suppressPackageStartupMessages(library(plyr))
17 changes: 2 additions & 15 deletions R/pkg/inst/tests/testthat/test_mllib_classification.R
@@ -284,22 +284,11 @@
c("1.0", "1.0", "1.0", "1.0", "0.0", "1.0", "2.0", "2.0", "1.0", "0.0"))

# test initialWeights
model <- spark.mlp(df, label ~ features, layers = c(4, 3), maxIter = 2, initialWeights =
model <- spark.mlp(df, label ~ features, layers = c(4, 3), initialWeights =
c(0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 9, 9, 9, 9, 9))
mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction"))
expect_equal(head(mlpPredictions$prediction, 10),
c("1.0", "1.0", "1.0", "1.0", "2.0", "1.0", "2.0", "2.0", "1.0", "0.0"))

model <- spark.mlp(df, label ~ features, layers = c(4, 3), maxIter = 2, initialWeights =
c(0.0, 0.0, 0.0, 0.0, 0.0, 5.0, 5.0, 5.0, 5.0, 5.0, 9.0, 9.0, 9.0, 9.0, 9.0))
mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction"))
expect_equal(head(mlpPredictions$prediction, 10),
c("1.0", "1.0", "1.0", "1.0", "2.0", "1.0", "2.0", "2.0", "1.0", "0.0"))

model <- spark.mlp(df, label ~ features, layers = c(4, 3), maxIter = 2)
mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction"))
expect_equal(head(mlpPredictions$prediction, 10),
c("1.0", "1.0", "1.0", "1.0", "0.0", "1.0", "0.0", "2.0", "1.0", "0.0"))
c("1.0", "1.0", "1.0", "1.0", "0.0", "1.0", "2.0", "2.0", "1.0", "0.0"))

# Test formula works well
df <- suppressWarnings(createDataFrame(iris))
@@ -310,8 +299,6 @@ test_that("spark.mlp", {
expect_equal(summary$numOfOutputs, 3)
expect_equal(summary$layers, c(4, 3))
expect_equal(length(summary$weights), 15)
expect_equal(head(summary$weights, 5), list(-1.1957257, -5.2693685, 7.4489734, -6.3751413,
-10.2376130), tolerance = 1e-6)
})

test_that("spark.naiveBayes", {
4 changes: 4 additions & 0 deletions R/pkg/inst/tests/testthat/test_mllib_clustering.R
@@ -255,6 +255,8 @@ test_that("spark.lda with libsvm", {
})

test_that("spark.lda with text input", {
skip_on_cran()

text <- read.text(absoluteSparkPath("data/mllib/sample_lda_data.txt"))
model <- spark.lda(text, optimizer = "online", features = "value")

@@ -297,6 +299,8 @@ test_that("spark.lda with text input", {
})

test_that("spark.posterior and spark.perplexity", {
skip_on_cran()

text <- read.text(absoluteSparkPath("data/mllib/sample_lda_data.txt"))
model <- spark.lda(text, features = "value", k = 3)
