From 928c2f15db207704843eaa4573c647fccd417ede Mon Sep 17 00:00:00 2001 From: Felix Cheung Date: Sun, 18 Dec 2016 14:44:51 -0800 Subject: [PATCH 1/7] set derby dir --- R/pkg/R/sparkR.R | 4 ++ R/pkg/inst/tests/testthat/test_aFirstTest.R | 39 +++++++++++++++++++ R/pkg/inst/tests/testthat/test_sparkSQL.R | 1 + .../scala/org/apache/spark/api/r/RRDD.scala | 7 ++++ .../apache/spark/deploy/SparkHadoopUtil.scala | 6 +++ 5 files changed, 57 insertions(+) create mode 100644 R/pkg/inst/tests/testthat/test_aFirstTest.R diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R index 61773ed3ee8c..d8b60739642e 100644 --- a/R/pkg/R/sparkR.R +++ b/R/pkg/R/sparkR.R @@ -381,6 +381,10 @@ sparkR.session <- function( deployMode <- sparkConfigMap[["spark.submit.deployMode"]] } + if (!exists("spark.sql.default.derby.dir", envir = sparkConfigMap)) { + sparkConfigMap[["spark.sql.default.derby.dir"]] <- tempdir() + } + if (!exists(".sparkRjsc", envir = .sparkREnv)) { retHome <- sparkCheckInstall(sparkHome, master, deployMode) if (!is.null(retHome)) sparkHome <- retHome diff --git a/R/pkg/inst/tests/testthat/test_aFirstTest.R b/R/pkg/inst/tests/testthat/test_aFirstTest.R new file mode 100644 index 000000000000..95a601e668be --- /dev/null +++ b/R/pkg/inst/tests/testthat/test_aFirstTest.R @@ -0,0 +1,39 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# With testthat, tests are run in alphabetic order on the test filenames. +# Name this test starting with an `a` to make sure it is run first +context("a first test for package") + +filesBefore <- list.files(path = file.path(Sys.getenv("SPARK_HOME"), "R"), all.files = TRUE) + +sparkSession <- sparkR.session(enableHiveSupport = FALSE) + +test_that("No extra files are created in SPARK_HOME", { + # Check that it is not creating any extra file. + # Does not check the tempdir which would be cleaned up after. 
+ filesAfter <- list.files(path = file.path(Sys.getenv("SPARK_HOME"), "R"), all.files = TRUE) + + # get testthat to show the diff by first making the 2 lists equal in length + expect_equal(length(filesBefore), length(filesAfter)) + l <- max(length(filesBefore), length(filesAfter)) + length(filesBefore) <- l + length(filesAfter) <- l + expect_equal(sort(filesBefore, na.last = TRUE), sort(filesAfter, na.last = TRUE)) +}) + +sparkR.session.stop() diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index f7081cb1d4e5..84187f796efc 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -60,6 +60,7 @@ unsetHiveContext <- function() { # Tests for SparkSQL functions in SparkR +filesBefore <- list.files(path = file.path(Sys.getenv("SPARK_HOME"), "R"), all.files = TRUE) sparkSession <- sparkR.session() sc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext", sparkSession) diff --git a/core/src/main/scala/org/apache/spark/api/r/RRDD.scala b/core/src/main/scala/org/apache/spark/api/r/RRDD.scala index a1a5eb8cf55e..47a94c2f273a 100644 --- a/core/src/main/scala/org/apache/spark/api/r/RRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/r/RRDD.scala @@ -127,6 +127,13 @@ private[r] object RRDD { sparkConf.setExecutorEnv(name.toString, value.toString) } + if (sparkEnvirMap.containsKey("spark.sql.default.derby.dir") && + System.getProperty("derby.system.home") == null) { + // This must be set before SparkContext is instantiated. + System.setProperty("derby.system.home", + sparkEnvirMap.get("spark.sql.default.derby.dir").toString) + } + val jsc = new JavaSparkContext(sparkConf) jars.foreach { jar => jsc.addJar(jar) diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala b/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala index f475ce87540a..94152fb3a5b5 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala @@ -105,6 +105,12 @@ class SparkHadoopUtil extends Logging { } val bufferSize = conf.get("spark.buffer.size", "65536") hadoopConf.set("io.file.buffer.size", bufferSize) + + if (conf.contains("spark.sql.default.derby.dir")) { + val jdoDir = conf.get("spark.sql.default.derby.dir") + val jdoConf = s"jdbc:derby:;databaseName=${jdoDir}/metastore_db;create=true" + hadoopConf.setIfUnset("javax.jdo.option.ConnectionURL", jdoConf) + } } } From 8fc70338603953e8ddf92dbd2c844b237c12f463 Mon Sep 17 00:00:00 2001 From: Felix Cheung Date: Tue, 20 Dec 2016 15:29:12 -0800 Subject: [PATCH 2/7] move test --- R/pkg/inst/tests/testthat/test_aFirstTest.R | 43 +++++++++++++++++++-- R/pkg/inst/tests/testthat/test_context.R | 37 ------------------ 2 files changed, 40 insertions(+), 40 deletions(-) diff --git a/R/pkg/inst/tests/testthat/test_aFirstTest.R b/R/pkg/inst/tests/testthat/test_aFirstTest.R index 95a601e668be..edf9255cc6dc 100644 --- a/R/pkg/inst/tests/testthat/test_aFirstTest.R +++ b/R/pkg/inst/tests/testthat/test_aFirstTest.R @@ -16,14 +16,14 @@ # # With testthat, tests are run in alphabetic order on the test filenames. -# Name this test starting with an `a` to make sure it is run first -context("a first test for package") +# Name this test file starting with an `a` to make sure it is run first. 
+context("a first set of tests for the package") filesBefore <- list.files(path = file.path(Sys.getenv("SPARK_HOME"), "R"), all.files = TRUE) sparkSession <- sparkR.session(enableHiveSupport = FALSE) -test_that("No extra files are created in SPARK_HOME", { +test_that("No extra files are created in SPARK_HOME by starting the session", { # Check that it is not creating any extra file. # Does not check the tempdir which would be cleaned up after. filesAfter <- list.files(path = file.path(Sys.getenv("SPARK_HOME"), "R"), all.files = TRUE) @@ -36,4 +36,41 @@ test_that("No extra files are created in SPARK_HOME", { expect_equal(sort(filesBefore, na.last = TRUE), sort(filesAfter, na.last = TRUE)) }) +test_that("Check masked functions", { + # Check that we are not masking any new function from base, stats, testthat unexpectedly + # NOTE: We should avoid adding entries to *namesOfMaskedCompletely* as masked functions make it + # hard for users to use base R functions. Please check when in doubt. + namesOfMaskedCompletely <- c("cov", "filter", "sample") + namesOfMasked <- c("describe", "cov", "filter", "lag", "na.omit", "predict", "sd", "var", + "colnames", "colnames<-", "intersect", "rank", "rbind", "sample", "subset", + "summary", "transform", "drop", "window", "as.data.frame", "union") + if (as.numeric(R.version$major) >= 3 && as.numeric(R.version$minor) >= 3) { + namesOfMasked <- c("endsWith", "startsWith", namesOfMasked) + } + masked <- conflicts(detail = TRUE)$`package:SparkR` + expect_true("describe" %in% masked) # only when with testthat.. + func <- lapply(masked, function(x) { capture.output(showMethods(x))[[1]] }) + funcSparkROrEmpty <- grepl("\\(package SparkR\\)$|^$", func) + maskedBySparkR <- masked[funcSparkROrEmpty] + expect_equal(length(maskedBySparkR), length(namesOfMasked)) + # make the 2 lists the same length so expect_equal will print their content + l <- max(length(maskedBySparkR), length(namesOfMasked)) + length(maskedBySparkR) <- l + length(namesOfMasked) <- l + expect_equal(sort(maskedBySparkR, na.last = TRUE), sort(namesOfMasked, na.last = TRUE)) + # above are those reported as masked when `library(SparkR)` + # note that many of these methods are still callable without base:: or stats:: prefix + # there should be a test for each of these, except followings, which are currently "broken" + funcHasAny <- unlist(lapply(masked, function(x) { + any(grepl("=\"ANY\"", capture.output(showMethods(x)[-1]))) + })) + maskedCompletely <- masked[!funcHasAny] + expect_equal(length(maskedCompletely), length(namesOfMaskedCompletely)) + l <- max(length(maskedCompletely), length(namesOfMaskedCompletely)) + length(maskedCompletely) <- l + length(namesOfMaskedCompletely) <- l + expect_equal(sort(maskedCompletely, na.last = TRUE), + sort(namesOfMaskedCompletely, na.last = TRUE)) +}) + sparkR.session.stop() diff --git a/R/pkg/inst/tests/testthat/test_context.R b/R/pkg/inst/tests/testthat/test_context.R index caca06933952..c3453bbf2937 100644 --- a/R/pkg/inst/tests/testthat/test_context.R +++ b/R/pkg/inst/tests/testthat/test_context.R @@ -17,43 +17,6 @@ context("test functions in sparkR.R") -test_that("Check masked functions", { - # Check that we are not masking any new function from base, stats, testthat unexpectedly - # NOTE: We should avoid adding entries to *namesOfMaskedCompletely* as masked functions make it - # hard for users to use base R functions. Please check when in doubt. 
- namesOfMaskedCompletely <- c("cov", "filter", "sample") - namesOfMasked <- c("describe", "cov", "filter", "lag", "na.omit", "predict", "sd", "var", - "colnames", "colnames<-", "intersect", "rank", "rbind", "sample", "subset", - "summary", "transform", "drop", "window", "as.data.frame", "union") - if (as.numeric(R.version$major) >= 3 && as.numeric(R.version$minor) >= 3) { - namesOfMasked <- c("endsWith", "startsWith", namesOfMasked) - } - masked <- conflicts(detail = TRUE)$`package:SparkR` - expect_true("describe" %in% masked) # only when with testthat.. - func <- lapply(masked, function(x) { capture.output(showMethods(x))[[1]] }) - funcSparkROrEmpty <- grepl("\\(package SparkR\\)$|^$", func) - maskedBySparkR <- masked[funcSparkROrEmpty] - expect_equal(length(maskedBySparkR), length(namesOfMasked)) - # make the 2 lists the same length so expect_equal will print their content - l <- max(length(maskedBySparkR), length(namesOfMasked)) - length(maskedBySparkR) <- l - length(namesOfMasked) <- l - expect_equal(sort(maskedBySparkR, na.last = TRUE), sort(namesOfMasked, na.last = TRUE)) - # above are those reported as masked when `library(SparkR)` - # note that many of these methods are still callable without base:: or stats:: prefix - # there should be a test for each of these, except followings, which are currently "broken" - funcHasAny <- unlist(lapply(masked, function(x) { - any(grepl("=\"ANY\"", capture.output(showMethods(x)[-1]))) - })) - maskedCompletely <- masked[!funcHasAny] - expect_equal(length(maskedCompletely), length(namesOfMaskedCompletely)) - l <- max(length(maskedCompletely), length(namesOfMaskedCompletely)) - length(maskedCompletely) <- l - length(namesOfMaskedCompletely) <- l - expect_equal(sort(maskedCompletely, na.last = TRUE), - sort(namesOfMaskedCompletely, na.last = TRUE)) -}) - test_that("repeatedly starting and stopping SparkR", { for (i in 1:4) { sc <- suppressWarnings(sparkR.init()) From b00173094806eecc08893df89acceaed6820e6c1 Mon Sep 17 00:00:00 2001 From: Felix Cheung Date: Sat, 11 Mar 2017 10:55:24 -0800 Subject: [PATCH 3/7] change to only set derby home --- R/pkg/R/sparkR.R | 4 +- R/pkg/inst/tests/testthat/test_aFirstTest.R | 76 ------------------- R/pkg/inst/tests/testthat/test_context.R | 37 +++++++++ R/pkg/inst/tests/testthat/test_sparkSQL.R | 21 +++++ R/pkg/tests/run-all.R | 1 + .../scala/org/apache/spark/api/r/RRDD.scala | 4 +- .../apache/spark/deploy/SparkHadoopUtil.scala | 6 -- 7 files changed, 63 insertions(+), 86 deletions(-) delete mode 100644 R/pkg/inst/tests/testthat/test_aFirstTest.R diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R index d8b60739642e..d9b2bb2f57c7 100644 --- a/R/pkg/R/sparkR.R +++ b/R/pkg/R/sparkR.R @@ -381,8 +381,8 @@ sparkR.session <- function( deployMode <- sparkConfigMap[["spark.submit.deployMode"]] } - if (!exists("spark.sql.default.derby.dir", envir = sparkConfigMap)) { - sparkConfigMap[["spark.sql.default.derby.dir"]] <- tempdir() + if (!exists("spark.r.sql.default.derby.dir", envir = sparkConfigMap)) { + sparkConfigMap[["spark.r.sql.default.derby.dir"]] <- tempdir() } if (!exists(".sparkRjsc", envir = .sparkREnv)) { diff --git a/R/pkg/inst/tests/testthat/test_aFirstTest.R b/R/pkg/inst/tests/testthat/test_aFirstTest.R deleted file mode 100644 index edf9255cc6dc..000000000000 --- a/R/pkg/inst/tests/testthat/test_aFirstTest.R +++ /dev/null @@ -1,76 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. 
See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# With testthat, tests are run in alphabetic order on the test filenames. -# Name this test file starting with an `a` to make sure it is run first. -context("a first set of tests for the package") - -filesBefore <- list.files(path = file.path(Sys.getenv("SPARK_HOME"), "R"), all.files = TRUE) - -sparkSession <- sparkR.session(enableHiveSupport = FALSE) - -test_that("No extra files are created in SPARK_HOME by starting the session", { - # Check that it is not creating any extra file. - # Does not check the tempdir which would be cleaned up after. - filesAfter <- list.files(path = file.path(Sys.getenv("SPARK_HOME"), "R"), all.files = TRUE) - - # get testthat to show the diff by first making the 2 lists equal in length - expect_equal(length(filesBefore), length(filesAfter)) - l <- max(length(filesBefore), length(filesAfter)) - length(filesBefore) <- l - length(filesAfter) <- l - expect_equal(sort(filesBefore, na.last = TRUE), sort(filesAfter, na.last = TRUE)) -}) - -test_that("Check masked functions", { - # Check that we are not masking any new function from base, stats, testthat unexpectedly - # NOTE: We should avoid adding entries to *namesOfMaskedCompletely* as masked functions make it - # hard for users to use base R functions. Please check when in doubt. - namesOfMaskedCompletely <- c("cov", "filter", "sample") - namesOfMasked <- c("describe", "cov", "filter", "lag", "na.omit", "predict", "sd", "var", - "colnames", "colnames<-", "intersect", "rank", "rbind", "sample", "subset", - "summary", "transform", "drop", "window", "as.data.frame", "union") - if (as.numeric(R.version$major) >= 3 && as.numeric(R.version$minor) >= 3) { - namesOfMasked <- c("endsWith", "startsWith", namesOfMasked) - } - masked <- conflicts(detail = TRUE)$`package:SparkR` - expect_true("describe" %in% masked) # only when with testthat.. 
- func <- lapply(masked, function(x) { capture.output(showMethods(x))[[1]] }) - funcSparkROrEmpty <- grepl("\\(package SparkR\\)$|^$", func) - maskedBySparkR <- masked[funcSparkROrEmpty] - expect_equal(length(maskedBySparkR), length(namesOfMasked)) - # make the 2 lists the same length so expect_equal will print their content - l <- max(length(maskedBySparkR), length(namesOfMasked)) - length(maskedBySparkR) <- l - length(namesOfMasked) <- l - expect_equal(sort(maskedBySparkR, na.last = TRUE), sort(namesOfMasked, na.last = TRUE)) - # above are those reported as masked when `library(SparkR)` - # note that many of these methods are still callable without base:: or stats:: prefix - # there should be a test for each of these, except followings, which are currently "broken" - funcHasAny <- unlist(lapply(masked, function(x) { - any(grepl("=\"ANY\"", capture.output(showMethods(x)[-1]))) - })) - maskedCompletely <- masked[!funcHasAny] - expect_equal(length(maskedCompletely), length(namesOfMaskedCompletely)) - l <- max(length(maskedCompletely), length(namesOfMaskedCompletely)) - length(maskedCompletely) <- l - length(namesOfMaskedCompletely) <- l - expect_equal(sort(maskedCompletely, na.last = TRUE), - sort(namesOfMaskedCompletely, na.last = TRUE)) -}) - -sparkR.session.stop() diff --git a/R/pkg/inst/tests/testthat/test_context.R b/R/pkg/inst/tests/testthat/test_context.R index c3453bbf2937..caca06933952 100644 --- a/R/pkg/inst/tests/testthat/test_context.R +++ b/R/pkg/inst/tests/testthat/test_context.R @@ -17,6 +17,43 @@ context("test functions in sparkR.R") +test_that("Check masked functions", { + # Check that we are not masking any new function from base, stats, testthat unexpectedly + # NOTE: We should avoid adding entries to *namesOfMaskedCompletely* as masked functions make it + # hard for users to use base R functions. Please check when in doubt. + namesOfMaskedCompletely <- c("cov", "filter", "sample") + namesOfMasked <- c("describe", "cov", "filter", "lag", "na.omit", "predict", "sd", "var", + "colnames", "colnames<-", "intersect", "rank", "rbind", "sample", "subset", + "summary", "transform", "drop", "window", "as.data.frame", "union") + if (as.numeric(R.version$major) >= 3 && as.numeric(R.version$minor) >= 3) { + namesOfMasked <- c("endsWith", "startsWith", namesOfMasked) + } + masked <- conflicts(detail = TRUE)$`package:SparkR` + expect_true("describe" %in% masked) # only when with testthat.. 
+ func <- lapply(masked, function(x) { capture.output(showMethods(x))[[1]] }) + funcSparkROrEmpty <- grepl("\\(package SparkR\\)$|^$", func) + maskedBySparkR <- masked[funcSparkROrEmpty] + expect_equal(length(maskedBySparkR), length(namesOfMasked)) + # make the 2 lists the same length so expect_equal will print their content + l <- max(length(maskedBySparkR), length(namesOfMasked)) + length(maskedBySparkR) <- l + length(namesOfMasked) <- l + expect_equal(sort(maskedBySparkR, na.last = TRUE), sort(namesOfMasked, na.last = TRUE)) + # above are those reported as masked when `library(SparkR)` + # note that many of these methods are still callable without base:: or stats:: prefix + # there should be a test for each of these, except followings, which are currently "broken" + funcHasAny <- unlist(lapply(masked, function(x) { + any(grepl("=\"ANY\"", capture.output(showMethods(x)[-1]))) + })) + maskedCompletely <- masked[!funcHasAny] + expect_equal(length(maskedCompletely), length(namesOfMaskedCompletely)) + l <- max(length(maskedCompletely), length(namesOfMaskedCompletely)) + length(maskedCompletely) <- l + length(namesOfMaskedCompletely) <- l + expect_equal(sort(maskedCompletely, na.last = TRUE), + sort(namesOfMaskedCompletely, na.last = TRUE)) +}) + test_that("repeatedly starting and stopping SparkR", { for (i in 1:4) { sc <- suppressWarnings(sparkR.init()) diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index 84187f796efc..cab8f71b9449 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -2910,6 +2910,27 @@ test_that("Collect on DataFrame when NAs exists at the top of a timestamp column expect_equal(class(ldf3$col3), c("POSIXct", "POSIXt")) }) +compare_list <- function(list1, list2) { + # get testthat to show the diff by first making the 2 lists equal in length + expect_equal(length(list1), length(list2)) + l <- max(length(list1), length(list2)) + length(list1) <- l + length(list2) <- l + expect_equal(sort(list1, na.last = TRUE), sort(list2, na.last = TRUE)) +} + +# This should always be the last test in this test file. +test_that("No extra files are created in SPARK_HOME by starting session and making calls", { + # Check that it is not creating any extra file. + # Does not check the tempdir which would be cleaned up after. 
+ filesAfter <- list.files(path = file.path(Sys.getenv("SPARK_HOME"), "R"), all.files = TRUE) + + expect_true(length(sparkHomeFileBefore) > 0) + compare_list(sparkHomeFileBefore, filesBefore) + + compare_list(filesBefore, filesAfter) +}) + unlink(parquetPath) unlink(orcPath) unlink(jsonPath) diff --git a/R/pkg/tests/run-all.R b/R/pkg/tests/run-all.R index ab8d1ca01994..46c3ca3504b9 100644 --- a/R/pkg/tests/run-all.R +++ b/R/pkg/tests/run-all.R @@ -22,6 +22,7 @@ library(SparkR) options("warn" = 2) # Setup global test environment +sparkHomeFileBefore <- list.files(path = file.path(Sys.getenv("SPARK_HOME"), "R"), all.files = TRUE) install.spark() test_package("SparkR") diff --git a/core/src/main/scala/org/apache/spark/api/r/RRDD.scala b/core/src/main/scala/org/apache/spark/api/r/RRDD.scala index 47a94c2f273a..b8e4b5de776c 100644 --- a/core/src/main/scala/org/apache/spark/api/r/RRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/r/RRDD.scala @@ -127,11 +127,11 @@ private[r] object RRDD { sparkConf.setExecutorEnv(name.toString, value.toString) } - if (sparkEnvirMap.containsKey("spark.sql.default.derby.dir") && + if (sparkEnvirMap.containsKey("spark.r.sql.default.derby.dir") && System.getProperty("derby.system.home") == null) { // This must be set before SparkContext is instantiated. System.setProperty("derby.system.home", - sparkEnvirMap.get("spark.sql.default.derby.dir").toString) + sparkEnvirMap.get("spark.r.sql.default.derby.dir").toString) } val jsc = new JavaSparkContext(sparkConf) diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala b/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala index 94152fb3a5b5..f475ce87540a 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala @@ -105,12 +105,6 @@ class SparkHadoopUtil extends Logging { } val bufferSize = conf.get("spark.buffer.size", "65536") hadoopConf.set("io.file.buffer.size", bufferSize) - - if (conf.contains("spark.sql.default.derby.dir")) { - val jdoDir = conf.get("spark.sql.default.derby.dir") - val jdoConf = s"jdbc:derby:;databaseName=${jdoDir}/metastore_db;create=true" - hadoopConf.setIfUnset("javax.jdo.option.ConnectionURL", jdoConf) - } } } From ca4f08e3d8d574025f36d53eef7681a95912dd4d Mon Sep 17 00:00:00 2001 From: Felix Cheung Date: Sat, 11 Mar 2017 11:28:46 -0800 Subject: [PATCH 4/7] update api doc --- R/pkg/R/sparkR.R | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R index d9b2bb2f57c7..aab322e7b4cd 100644 --- a/R/pkg/R/sparkR.R +++ b/R/pkg/R/sparkR.R @@ -322,10 +322,19 @@ sparkRHive.init <- function(jsc = NULL) { #' SparkSession or initializes a new SparkSession. #' Additional Spark properties can be set in \code{...}, and these named parameters take priority #' over values in \code{master}, \code{appName}, named lists of \code{sparkConfig}. -#' When called in an interactive session, this checks for the Spark installation, and, if not +#' +#' When called in an interactive session, this method checks for the Spark installation, and, if not #' found, it will be downloaded and cached automatically. Alternatively, \code{install.spark} can #' be called manually. #' +#' A default warehouse is created automatically in the current directory when a managed table is +#' created via \code{sql} statement \code{CREATE TABLE}, for example. 
To change the location of the +#' warehouse, set the named parameter \code{spark.sql.warehouse.dir} to the SparkSession. Along with +#' the warehouse, an accompanied metastore may also be automatically created in the current +#' directory when a new SparkSession is initialized with \code{enableHiveSupport} set to +#' \code{TRUE}, which is the default. For more details, refer to Hive configuration at +#' \url{http://spark.apache.org/docs/latest/sql-programming-guide.html#hive-tables}. +#' #' For details on how to initialize and use SparkR, refer to SparkR programming guide at #' \url{http://spark.apache.org/docs/latest/sparkr.html#starting-up-sparksession}. #' From 4604a531c321dbd18ba5a5f9cac055137db49670 Mon Sep 17 00:00:00 2001 From: Felix Cheung Date: Sun, 12 Mar 2017 11:49:45 -0700 Subject: [PATCH 5/7] change property to use to move derby.log, clean start in test, update test --- R/pkg/R/sparkR.R | 4 ++-- R/pkg/inst/tests/testthat/test_sparkSQL.R | 17 ++++++++++------- R/pkg/tests/run-all.R | 7 ++++++- .../scala/org/apache/spark/api/r/RRDD.scala | 10 ++++++---- 4 files changed, 24 insertions(+), 14 deletions(-) diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R index aab322e7b4cd..d0a12b7ecec6 100644 --- a/R/pkg/R/sparkR.R +++ b/R/pkg/R/sparkR.R @@ -390,8 +390,8 @@ sparkR.session <- function( deployMode <- sparkConfigMap[["spark.submit.deployMode"]] } - if (!exists("spark.r.sql.default.derby.dir", envir = sparkConfigMap)) { - sparkConfigMap[["spark.r.sql.default.derby.dir"]] <- tempdir() + if (!exists("spark.r.sql.derby.temp.dir", envir = sparkConfigMap)) { + sparkConfigMap[["spark.r.sql.derby.temp.dir"]] <- tempdir() } if (!exists(".sparkRjsc", envir = .sparkREnv)) { diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index cab8f71b9449..920a81b92d5c 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -60,7 +60,7 @@ unsetHiveContext <- function() { # Tests for SparkSQL functions in SparkR -filesBefore <- list.files(path = file.path(Sys.getenv("SPARK_HOME"), "R"), all.files = TRUE) +filesBefore <- list.files(path = sparkRDir, all.files = TRUE) sparkSession <- sparkR.session() sc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext", sparkSession) @@ -2923,12 +2923,15 @@ compare_list <- function(list1, list2) { test_that("No extra files are created in SPARK_HOME by starting session and making calls", { # Check that it is not creating any extra file. # Does not check the tempdir which would be cleaned up after. 
- filesAfter <- list.files(path = file.path(Sys.getenv("SPARK_HOME"), "R"), all.files = TRUE) - - expect_true(length(sparkHomeFileBefore) > 0) - compare_list(sparkHomeFileBefore, filesBefore) - - compare_list(filesBefore, filesAfter) + filesAfter <- list.files(path = sparkRDir, all.files = TRUE) + + expect_true(length(sparkRFilesBefore) > 0) + # first, ensure derby.log is not there + expect_false("derby.log" %in% filesAfter) + # second, ensure only spark-warehouse is created when calling SparkSession, enableHiveSupport = F + compare_list(sparkRFilesBefore, setdiff(filesBefore, sparkRWhitelistSQLDirs[[1]])) + # third, ensure only spark-warehouse and metastore_db are created when enableHiveSupport = T + compare_list(sparkRFilesBefore, setdiff(filesAfter, sparkRWhitelistSQLDirs)) }) unlink(parquetPath) diff --git a/R/pkg/tests/run-all.R b/R/pkg/tests/run-all.R index 46c3ca3504b9..cefaadda6e21 100644 --- a/R/pkg/tests/run-all.R +++ b/R/pkg/tests/run-all.R @@ -22,7 +22,12 @@ library(SparkR) options("warn" = 2) # Setup global test environment -sparkHomeFileBefore <- list.files(path = file.path(Sys.getenv("SPARK_HOME"), "R"), all.files = TRUE) +sparkRDir <- file.path(Sys.getenv("SPARK_HOME"), "R") +sparkRFilesBefore <- list.files(path = sparkRDir, all.files = TRUE) +sparkRWhitelistSQLDirs <- c("spark-warehouse", "metastore_db") +invisible(lapply(sparkRWhitelistSQLDirs, + function(x) { unlink(file.path(sparkRDir, x), recursive = TRUE, force = TRUE)})) + install.spark() test_package("SparkR") diff --git a/core/src/main/scala/org/apache/spark/api/r/RRDD.scala b/core/src/main/scala/org/apache/spark/api/r/RRDD.scala index b8e4b5de776c..fd8d77d8cc72 100644 --- a/core/src/main/scala/org/apache/spark/api/r/RRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/r/RRDD.scala @@ -18,6 +18,7 @@ package org.apache.spark.api.r import java.util.{Map => JMap} +import java.io.File import scala.collection.JavaConverters._ import scala.reflect.ClassTag @@ -127,11 +128,12 @@ private[r] object RRDD { sparkConf.setExecutorEnv(name.toString, value.toString) } - if (sparkEnvirMap.containsKey("spark.r.sql.default.derby.dir") && - System.getProperty("derby.system.home") == null) { + if (sparkEnvirMap.containsKey("spark.r.sql.derby.temp.dir") && + System.getProperty("derby.stream.error.file") == null) { // This must be set before SparkContext is instantiated. 
- System.setProperty("derby.system.home", - sparkEnvirMap.get("spark.r.sql.default.derby.dir").toString) + System.setProperty("derby.stream.error.file", + Seq(sparkEnvirMap.get("spark.r.sql.derby.temp.dir").toString, "derby.log") + .mkString(File.separator)) } val jsc = new JavaSparkContext(sparkConf) From 2eb75f8d73ec3ce1eb85d7c501e4c072499b8f44 Mon Sep 17 00:00:00 2001 From: Felix Cheung Date: Sun, 12 Mar 2017 12:11:46 -0700 Subject: [PATCH 6/7] import order --- core/src/main/scala/org/apache/spark/api/r/RRDD.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/api/r/RRDD.scala b/core/src/main/scala/org/apache/spark/api/r/RRDD.scala index fd8d77d8cc72..72ae0340aa3d 100644 --- a/core/src/main/scala/org/apache/spark/api/r/RRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/r/RRDD.scala @@ -17,8 +17,8 @@ package org.apache.spark.api.r -import java.util.{Map => JMap} import java.io.File +import java.util.{Map => JMap} import scala.collection.JavaConverters._ import scala.reflect.ClassTag From ac9fbfc5d511877f7775c620ff8e1c672880ee50 Mon Sep 17 00:00:00 2001 From: Felix Cheung Date: Sat, 18 Mar 2017 16:53:23 -0700 Subject: [PATCH 7/7] add more comment on test --- R/pkg/inst/tests/testthat/test_sparkSQL.R | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index 920a81b92d5c..32856b399cdd 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -2919,7 +2919,7 @@ compare_list <- function(list1, list2) { expect_equal(sort(list1, na.last = TRUE), sort(list2, na.last = TRUE)) } -# This should always be the last test in this test file. +# This should always be the **very last test** in this test file. test_that("No extra files are created in SPARK_HOME by starting session and making calls", { # Check that it is not creating any extra file. # Does not check the tempdir which would be cleaned up after. @@ -2929,8 +2929,17 @@ test_that("No extra files are created in SPARK_HOME by starting session and maki # first, ensure derby.log is not there expect_false("derby.log" %in% filesAfter) # second, ensure only spark-warehouse is created when calling SparkSession, enableHiveSupport = F + # note: currently all other test files have enableHiveSupport = F, so we capture the list of files + # before creating a SparkSession with enableHiveSupport = T at the top of this test file + # (filesBefore). The test here is to compare that (filesBefore) against the list of files before + # any test is run in run-all.R (sparkRFilesBefore). + # sparkRWhitelistSQLDirs is also defined in run-all.R, and should contain only 2 whitelisted dirs, + # here allow the first value, spark-warehouse, in the diff, everything else should be exactly the + # same as before any test is run. compare_list(sparkRFilesBefore, setdiff(filesBefore, sparkRWhitelistSQLDirs[[1]])) # third, ensure only spark-warehouse and metastore_db are created when enableHiveSupport = T + # note: as the note above, after running all tests in this file while enableHiveSupport = T, we + # check the list of files again. This time we allow both whitelisted dirs to be in the diff. compare_list(sparkRFilesBefore, setdiff(filesAfter, sparkRWhitelistSQLDirs)) })