From 928c2f15db207704843eaa4573c647fccd417ede Mon Sep 17 00:00:00 2001 From: Felix Cheung Date: Sun, 18 Dec 2016 14:44:51 -0800 Subject: [PATCH 1/7] set derby dir --- R/pkg/R/sparkR.R | 4 ++ R/pkg/inst/tests/testthat/test_aFirstTest.R | 39 +++++++++++++++++++ R/pkg/inst/tests/testthat/test_sparkSQL.R | 1 + .../scala/org/apache/spark/api/r/RRDD.scala | 7 ++++ .../apache/spark/deploy/SparkHadoopUtil.scala | 6 +++ 5 files changed, 57 insertions(+) create mode 100644 R/pkg/inst/tests/testthat/test_aFirstTest.R diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R index 61773ed3ee8c..d8b60739642e 100644 --- a/R/pkg/R/sparkR.R +++ b/R/pkg/R/sparkR.R @@ -381,6 +381,10 @@ sparkR.session <- function( deployMode <- sparkConfigMap[["spark.submit.deployMode"]] } + if (!exists("spark.sql.default.derby.dir", envir = sparkConfigMap)) { + sparkConfigMap[["spark.sql.default.derby.dir"]] <- tempdir() + } + if (!exists(".sparkRjsc", envir = .sparkREnv)) { retHome <- sparkCheckInstall(sparkHome, master, deployMode) if (!is.null(retHome)) sparkHome <- retHome diff --git a/R/pkg/inst/tests/testthat/test_aFirstTest.R b/R/pkg/inst/tests/testthat/test_aFirstTest.R new file mode 100644 index 000000000000..95a601e668be --- /dev/null +++ b/R/pkg/inst/tests/testthat/test_aFirstTest.R @@ -0,0 +1,39 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# With testthat, tests are run in alphabetic order on the test filenames. +# Name this test starting with an `a` to make sure it is run first +context("a first test for package") + +filesBefore <- list.files(path = file.path(Sys.getenv("SPARK_HOME"), "R"), all.files = TRUE) + +sparkSession <- sparkR.session(enableHiveSupport = FALSE) + +test_that("No extra files are created in SPARK_HOME", { + # Check that it is not creating any extra file. + # Does not check the tempdir which would be cleaned up after. 
+ filesAfter <- list.files(path = file.path(Sys.getenv("SPARK_HOME"), "R"), all.files = TRUE) + + # get testthat to show the diff by first making the 2 lists equal in length + expect_equal(length(filesBefore), length(filesAfter)) + l <- max(length(filesBefore), length(filesAfter)) + length(filesBefore) <- l + length(filesAfter) <- l + expect_equal(sort(filesBefore, na.last = TRUE), sort(filesAfter, na.last = TRUE)) +}) + +sparkR.session.stop() diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index f7081cb1d4e5..84187f796efc 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -60,6 +60,7 @@ unsetHiveContext <- function() { # Tests for SparkSQL functions in SparkR +filesBefore <- list.files(path = file.path(Sys.getenv("SPARK_HOME"), "R"), all.files = TRUE) sparkSession <- sparkR.session() sc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext", sparkSession) diff --git a/core/src/main/scala/org/apache/spark/api/r/RRDD.scala b/core/src/main/scala/org/apache/spark/api/r/RRDD.scala index a1a5eb8cf55e..47a94c2f273a 100644 --- a/core/src/main/scala/org/apache/spark/api/r/RRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/r/RRDD.scala @@ -127,6 +127,13 @@ private[r] object RRDD { sparkConf.setExecutorEnv(name.toString, value.toString) } + if (sparkEnvirMap.containsKey("spark.sql.default.derby.dir") && + System.getProperty("derby.system.home") == null) { + // This must be set before SparkContext is instantiated. + System.setProperty("derby.system.home", + sparkEnvirMap.get("spark.sql.default.derby.dir").toString) + } + val jsc = new JavaSparkContext(sparkConf) jars.foreach { jar => jsc.addJar(jar) diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala b/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala index f475ce87540a..94152fb3a5b5 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala @@ -105,6 +105,12 @@ class SparkHadoopUtil extends Logging { } val bufferSize = conf.get("spark.buffer.size", "65536") hadoopConf.set("io.file.buffer.size", bufferSize) + + if (conf.contains("spark.sql.default.derby.dir")) { + val jdoDir = conf.get("spark.sql.default.derby.dir") + val jdoConf = s"jdbc:derby:;databaseName=${jdoDir}/metastore_db;create=true" + hadoopConf.setIfUnset("javax.jdo.option.ConnectionURL", jdoConf) + } } } From 8fc70338603953e8ddf92dbd2c844b237c12f463 Mon Sep 17 00:00:00 2001 From: Felix Cheung Date: Tue, 20 Dec 2016 15:29:12 -0800 Subject: [PATCH 2/7] move test --- R/pkg/inst/tests/testthat/test_aFirstTest.R | 43 +++++++++++++++++++-- R/pkg/inst/tests/testthat/test_context.R | 37 ------------------ 2 files changed, 40 insertions(+), 40 deletions(-) diff --git a/R/pkg/inst/tests/testthat/test_aFirstTest.R b/R/pkg/inst/tests/testthat/test_aFirstTest.R index 95a601e668be..edf9255cc6dc 100644 --- a/R/pkg/inst/tests/testthat/test_aFirstTest.R +++ b/R/pkg/inst/tests/testthat/test_aFirstTest.R @@ -16,14 +16,14 @@ # # With testthat, tests are run in alphabetic order on the test filenames. -# Name this test starting with an `a` to make sure it is run first -context("a first test for package") +# Name this test file starting with an `a` to make sure it is run first. 
+context("a first set of tests for the package") filesBefore <- list.files(path = file.path(Sys.getenv("SPARK_HOME"), "R"), all.files = TRUE) sparkSession <- sparkR.session(enableHiveSupport = FALSE) -test_that("No extra files are created in SPARK_HOME", { +test_that("No extra files are created in SPARK_HOME by starting the session", { # Check that it is not creating any extra file. # Does not check the tempdir which would be cleaned up after. filesAfter <- list.files(path = file.path(Sys.getenv("SPARK_HOME"), "R"), all.files = TRUE) @@ -36,4 +36,41 @@ test_that("No extra files are created in SPARK_HOME", { expect_equal(sort(filesBefore, na.last = TRUE), sort(filesAfter, na.last = TRUE)) }) +test_that("Check masked functions", { + # Check that we are not masking any new function from base, stats, testthat unexpectedly + # NOTE: We should avoid adding entries to *namesOfMaskedCompletely* as masked functions make it + # hard for users to use base R functions. Please check when in doubt. + namesOfMaskedCompletely <- c("cov", "filter", "sample") + namesOfMasked <- c("describe", "cov", "filter", "lag", "na.omit", "predict", "sd", "var", + "colnames", "colnames<-", "intersect", "rank", "rbind", "sample", "subset", + "summary", "transform", "drop", "window", "as.data.frame", "union") + if (as.numeric(R.version$major) >= 3 && as.numeric(R.version$minor) >= 3) { + namesOfMasked <- c("endsWith", "startsWith", namesOfMasked) + } + masked <- conflicts(detail = TRUE)$`package:SparkR` + expect_true("describe" %in% masked) # only when with testthat.. + func <- lapply(masked, function(x) { capture.output(showMethods(x))[[1]] }) + funcSparkROrEmpty <- grepl("\\(package SparkR\\)$|^$", func) + maskedBySparkR <- masked[funcSparkROrEmpty] + expect_equal(length(maskedBySparkR), length(namesOfMasked)) + # make the 2 lists the same length so expect_equal will print their content + l <- max(length(maskedBySparkR), length(namesOfMasked)) + length(maskedBySparkR) <- l + length(namesOfMasked) <- l + expect_equal(sort(maskedBySparkR, na.last = TRUE), sort(namesOfMasked, na.last = TRUE)) + # above are those reported as masked when `library(SparkR)` + # note that many of these methods are still callable without base:: or stats:: prefix + # there should be a test for each of these, except followings, which are currently "broken" + funcHasAny <- unlist(lapply(masked, function(x) { + any(grepl("=\"ANY\"", capture.output(showMethods(x)[-1]))) + })) + maskedCompletely <- masked[!funcHasAny] + expect_equal(length(maskedCompletely), length(namesOfMaskedCompletely)) + l <- max(length(maskedCompletely), length(namesOfMaskedCompletely)) + length(maskedCompletely) <- l + length(namesOfMaskedCompletely) <- l + expect_equal(sort(maskedCompletely, na.last = TRUE), + sort(namesOfMaskedCompletely, na.last = TRUE)) +}) + sparkR.session.stop() diff --git a/R/pkg/inst/tests/testthat/test_context.R b/R/pkg/inst/tests/testthat/test_context.R index caca06933952..c3453bbf2937 100644 --- a/R/pkg/inst/tests/testthat/test_context.R +++ b/R/pkg/inst/tests/testthat/test_context.R @@ -17,43 +17,6 @@ context("test functions in sparkR.R") -test_that("Check masked functions", { - # Check that we are not masking any new function from base, stats, testthat unexpectedly - # NOTE: We should avoid adding entries to *namesOfMaskedCompletely* as masked functions make it - # hard for users to use base R functions. Please check when in doubt. 
- namesOfMaskedCompletely <- c("cov", "filter", "sample") - namesOfMasked <- c("describe", "cov", "filter", "lag", "na.omit", "predict", "sd", "var", - "colnames", "colnames<-", "intersect", "rank", "rbind", "sample", "subset", - "summary", "transform", "drop", "window", "as.data.frame", "union") - if (as.numeric(R.version$major) >= 3 && as.numeric(R.version$minor) >= 3) { - namesOfMasked <- c("endsWith", "startsWith", namesOfMasked) - } - masked <- conflicts(detail = TRUE)$`package:SparkR` - expect_true("describe" %in% masked) # only when with testthat.. - func <- lapply(masked, function(x) { capture.output(showMethods(x))[[1]] }) - funcSparkROrEmpty <- grepl("\\(package SparkR\\)$|^$", func) - maskedBySparkR <- masked[funcSparkROrEmpty] - expect_equal(length(maskedBySparkR), length(namesOfMasked)) - # make the 2 lists the same length so expect_equal will print their content - l <- max(length(maskedBySparkR), length(namesOfMasked)) - length(maskedBySparkR) <- l - length(namesOfMasked) <- l - expect_equal(sort(maskedBySparkR, na.last = TRUE), sort(namesOfMasked, na.last = TRUE)) - # above are those reported as masked when `library(SparkR)` - # note that many of these methods are still callable without base:: or stats:: prefix - # there should be a test for each of these, except followings, which are currently "broken" - funcHasAny <- unlist(lapply(masked, function(x) { - any(grepl("=\"ANY\"", capture.output(showMethods(x)[-1]))) - })) - maskedCompletely <- masked[!funcHasAny] - expect_equal(length(maskedCompletely), length(namesOfMaskedCompletely)) - l <- max(length(maskedCompletely), length(namesOfMaskedCompletely)) - length(maskedCompletely) <- l - length(namesOfMaskedCompletely) <- l - expect_equal(sort(maskedCompletely, na.last = TRUE), - sort(namesOfMaskedCompletely, na.last = TRUE)) -}) - test_that("repeatedly starting and stopping SparkR", { for (i in 1:4) { sc <- suppressWarnings(sparkR.init()) From b00173094806eecc08893df89acceaed6820e6c1 Mon Sep 17 00:00:00 2001 From: Felix Cheung Date: Sat, 11 Mar 2017 10:55:24 -0800 Subject: [PATCH 3/7] change to only set derby home --- R/pkg/R/sparkR.R | 4 +- R/pkg/inst/tests/testthat/test_aFirstTest.R | 76 ------------------- R/pkg/inst/tests/testthat/test_context.R | 37 +++++++++ R/pkg/inst/tests/testthat/test_sparkSQL.R | 21 +++++ R/pkg/tests/run-all.R | 1 + .../scala/org/apache/spark/api/r/RRDD.scala | 4 +- .../apache/spark/deploy/SparkHadoopUtil.scala | 6 -- 7 files changed, 63 insertions(+), 86 deletions(-) delete mode 100644 R/pkg/inst/tests/testthat/test_aFirstTest.R diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R index d8b60739642e..d9b2bb2f57c7 100644 --- a/R/pkg/R/sparkR.R +++ b/R/pkg/R/sparkR.R @@ -381,8 +381,8 @@ sparkR.session <- function( deployMode <- sparkConfigMap[["spark.submit.deployMode"]] } - if (!exists("spark.sql.default.derby.dir", envir = sparkConfigMap)) { - sparkConfigMap[["spark.sql.default.derby.dir"]] <- tempdir() + if (!exists("spark.r.sql.default.derby.dir", envir = sparkConfigMap)) { + sparkConfigMap[["spark.r.sql.default.derby.dir"]] <- tempdir() } if (!exists(".sparkRjsc", envir = .sparkREnv)) { diff --git a/R/pkg/inst/tests/testthat/test_aFirstTest.R b/R/pkg/inst/tests/testthat/test_aFirstTest.R deleted file mode 100644 index edf9255cc6dc..000000000000 --- a/R/pkg/inst/tests/testthat/test_aFirstTest.R +++ /dev/null @@ -1,76 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. 
See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# With testthat, tests are run in alphabetic order on the test filenames. -# Name this test file starting with an `a` to make sure it is run first. -context("a first set of tests for the package") - -filesBefore <- list.files(path = file.path(Sys.getenv("SPARK_HOME"), "R"), all.files = TRUE) - -sparkSession <- sparkR.session(enableHiveSupport = FALSE) - -test_that("No extra files are created in SPARK_HOME by starting the session", { - # Check that it is not creating any extra file. - # Does not check the tempdir which would be cleaned up after. - filesAfter <- list.files(path = file.path(Sys.getenv("SPARK_HOME"), "R"), all.files = TRUE) - - # get testthat to show the diff by first making the 2 lists equal in length - expect_equal(length(filesBefore), length(filesAfter)) - l <- max(length(filesBefore), length(filesAfter)) - length(filesBefore) <- l - length(filesAfter) <- l - expect_equal(sort(filesBefore, na.last = TRUE), sort(filesAfter, na.last = TRUE)) -}) - -test_that("Check masked functions", { - # Check that we are not masking any new function from base, stats, testthat unexpectedly - # NOTE: We should avoid adding entries to *namesOfMaskedCompletely* as masked functions make it - # hard for users to use base R functions. Please check when in doubt. - namesOfMaskedCompletely <- c("cov", "filter", "sample") - namesOfMasked <- c("describe", "cov", "filter", "lag", "na.omit", "predict", "sd", "var", - "colnames", "colnames<-", "intersect", "rank", "rbind", "sample", "subset", - "summary", "transform", "drop", "window", "as.data.frame", "union") - if (as.numeric(R.version$major) >= 3 && as.numeric(R.version$minor) >= 3) { - namesOfMasked <- c("endsWith", "startsWith", namesOfMasked) - } - masked <- conflicts(detail = TRUE)$`package:SparkR` - expect_true("describe" %in% masked) # only when with testthat.. 
- func <- lapply(masked, function(x) { capture.output(showMethods(x))[[1]] }) - funcSparkROrEmpty <- grepl("\\(package SparkR\\)$|^$", func) - maskedBySparkR <- masked[funcSparkROrEmpty] - expect_equal(length(maskedBySparkR), length(namesOfMasked)) - # make the 2 lists the same length so expect_equal will print their content - l <- max(length(maskedBySparkR), length(namesOfMasked)) - length(maskedBySparkR) <- l - length(namesOfMasked) <- l - expect_equal(sort(maskedBySparkR, na.last = TRUE), sort(namesOfMasked, na.last = TRUE)) - # above are those reported as masked when `library(SparkR)` - # note that many of these methods are still callable without base:: or stats:: prefix - # there should be a test for each of these, except followings, which are currently "broken" - funcHasAny <- unlist(lapply(masked, function(x) { - any(grepl("=\"ANY\"", capture.output(showMethods(x)[-1]))) - })) - maskedCompletely <- masked[!funcHasAny] - expect_equal(length(maskedCompletely), length(namesOfMaskedCompletely)) - l <- max(length(maskedCompletely), length(namesOfMaskedCompletely)) - length(maskedCompletely) <- l - length(namesOfMaskedCompletely) <- l - expect_equal(sort(maskedCompletely, na.last = TRUE), - sort(namesOfMaskedCompletely, na.last = TRUE)) -}) - -sparkR.session.stop() diff --git a/R/pkg/inst/tests/testthat/test_context.R b/R/pkg/inst/tests/testthat/test_context.R index c3453bbf2937..caca06933952 100644 --- a/R/pkg/inst/tests/testthat/test_context.R +++ b/R/pkg/inst/tests/testthat/test_context.R @@ -17,6 +17,43 @@ context("test functions in sparkR.R") +test_that("Check masked functions", { + # Check that we are not masking any new function from base, stats, testthat unexpectedly + # NOTE: We should avoid adding entries to *namesOfMaskedCompletely* as masked functions make it + # hard for users to use base R functions. Please check when in doubt. + namesOfMaskedCompletely <- c("cov", "filter", "sample") + namesOfMasked <- c("describe", "cov", "filter", "lag", "na.omit", "predict", "sd", "var", + "colnames", "colnames<-", "intersect", "rank", "rbind", "sample", "subset", + "summary", "transform", "drop", "window", "as.data.frame", "union") + if (as.numeric(R.version$major) >= 3 && as.numeric(R.version$minor) >= 3) { + namesOfMasked <- c("endsWith", "startsWith", namesOfMasked) + } + masked <- conflicts(detail = TRUE)$`package:SparkR` + expect_true("describe" %in% masked) # only when with testthat.. 
+ func <- lapply(masked, function(x) { capture.output(showMethods(x))[[1]] }) + funcSparkROrEmpty <- grepl("\\(package SparkR\\)$|^$", func) + maskedBySparkR <- masked[funcSparkROrEmpty] + expect_equal(length(maskedBySparkR), length(namesOfMasked)) + # make the 2 lists the same length so expect_equal will print their content + l <- max(length(maskedBySparkR), length(namesOfMasked)) + length(maskedBySparkR) <- l + length(namesOfMasked) <- l + expect_equal(sort(maskedBySparkR, na.last = TRUE), sort(namesOfMasked, na.last = TRUE)) + # above are those reported as masked when `library(SparkR)` + # note that many of these methods are still callable without base:: or stats:: prefix + # there should be a test for each of these, except followings, which are currently "broken" + funcHasAny <- unlist(lapply(masked, function(x) { + any(grepl("=\"ANY\"", capture.output(showMethods(x)[-1]))) + })) + maskedCompletely <- masked[!funcHasAny] + expect_equal(length(maskedCompletely), length(namesOfMaskedCompletely)) + l <- max(length(maskedCompletely), length(namesOfMaskedCompletely)) + length(maskedCompletely) <- l + length(namesOfMaskedCompletely) <- l + expect_equal(sort(maskedCompletely, na.last = TRUE), + sort(namesOfMaskedCompletely, na.last = TRUE)) +}) + test_that("repeatedly starting and stopping SparkR", { for (i in 1:4) { sc <- suppressWarnings(sparkR.init()) diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index 84187f796efc..cab8f71b9449 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -2910,6 +2910,27 @@ test_that("Collect on DataFrame when NAs exists at the top of a timestamp column expect_equal(class(ldf3$col3), c("POSIXct", "POSIXt")) }) +compare_list <- function(list1, list2) { + # get testthat to show the diff by first making the 2 lists equal in length + expect_equal(length(list1), length(list2)) + l <- max(length(list1), length(list2)) + length(list1) <- l + length(list2) <- l + expect_equal(sort(list1, na.last = TRUE), sort(list2, na.last = TRUE)) +} + +# This should always be the last test in this test file. +test_that("No extra files are created in SPARK_HOME by starting session and making calls", { + # Check that it is not creating any extra file. + # Does not check the tempdir which would be cleaned up after. 
+ filesAfter <- list.files(path = file.path(Sys.getenv("SPARK_HOME"), "R"), all.files = TRUE) + + expect_true(length(sparkHomeFileBefore) > 0) + compare_list(sparkHomeFileBefore, filesBefore) + + compare_list(filesBefore, filesAfter) +}) + unlink(parquetPath) unlink(orcPath) unlink(jsonPath) diff --git a/R/pkg/tests/run-all.R b/R/pkg/tests/run-all.R index ab8d1ca01994..46c3ca3504b9 100644 --- a/R/pkg/tests/run-all.R +++ b/R/pkg/tests/run-all.R @@ -22,6 +22,7 @@ library(SparkR) options("warn" = 2) # Setup global test environment +sparkHomeFileBefore <- list.files(path = file.path(Sys.getenv("SPARK_HOME"), "R"), all.files = TRUE) install.spark() test_package("SparkR") diff --git a/core/src/main/scala/org/apache/spark/api/r/RRDD.scala b/core/src/main/scala/org/apache/spark/api/r/RRDD.scala index 47a94c2f273a..b8e4b5de776c 100644 --- a/core/src/main/scala/org/apache/spark/api/r/RRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/r/RRDD.scala @@ -127,11 +127,11 @@ private[r] object RRDD { sparkConf.setExecutorEnv(name.toString, value.toString) } - if (sparkEnvirMap.containsKey("spark.sql.default.derby.dir") && + if (sparkEnvirMap.containsKey("spark.r.sql.default.derby.dir") && System.getProperty("derby.system.home") == null) { // This must be set before SparkContext is instantiated. System.setProperty("derby.system.home", - sparkEnvirMap.get("spark.sql.default.derby.dir").toString) + sparkEnvirMap.get("spark.r.sql.default.derby.dir").toString) } val jsc = new JavaSparkContext(sparkConf) diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala b/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala index 94152fb3a5b5..f475ce87540a 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala @@ -105,12 +105,6 @@ class SparkHadoopUtil extends Logging { } val bufferSize = conf.get("spark.buffer.size", "65536") hadoopConf.set("io.file.buffer.size", bufferSize) - - if (conf.contains("spark.sql.default.derby.dir")) { - val jdoDir = conf.get("spark.sql.default.derby.dir") - val jdoConf = s"jdbc:derby:;databaseName=${jdoDir}/metastore_db;create=true" - hadoopConf.setIfUnset("javax.jdo.option.ConnectionURL", jdoConf) - } } } From ca4f08e3d8d574025f36d53eef7681a95912dd4d Mon Sep 17 00:00:00 2001 From: Felix Cheung Date: Sat, 11 Mar 2017 11:28:46 -0800 Subject: [PATCH 4/7] update api doc --- R/pkg/R/sparkR.R | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R index d9b2bb2f57c7..aab322e7b4cd 100644 --- a/R/pkg/R/sparkR.R +++ b/R/pkg/R/sparkR.R @@ -322,10 +322,19 @@ sparkRHive.init <- function(jsc = NULL) { #' SparkSession or initializes a new SparkSession. #' Additional Spark properties can be set in \code{...}, and these named parameters take priority #' over values in \code{master}, \code{appName}, named lists of \code{sparkConfig}. -#' When called in an interactive session, this checks for the Spark installation, and, if not +#' +#' When called in an interactive session, this method checks for the Spark installation, and, if not #' found, it will be downloaded and cached automatically. Alternatively, \code{install.spark} can #' be called manually. #' +#' A default warehouse is created automatically in the current directory when a managed table is +#' created via \code{sql} statement \code{CREATE TABLE}, for example. 
To change the location of the +#' warehouse, set the named parameter \code{spark.sql.warehouse.dir} to the SparkSession. Along with +#' the warehouse, an accompanied metastore may also be automatically created in the current +#' directory when a new SparkSession is initialized with \code{enableHiveSupport} set to +#' \code{TRUE}, which is the default. For more details, refer to Hive configuration at +#' \url{http://spark.apache.org/docs/latest/sql-programming-guide.html#hive-tables}. +#' #' For details on how to initialize and use SparkR, refer to SparkR programming guide at #' \url{http://spark.apache.org/docs/latest/sparkr.html#starting-up-sparksession}. #' From 4604a531c321dbd18ba5a5f9cac055137db49670 Mon Sep 17 00:00:00 2001 From: Felix Cheung Date: Sun, 12 Mar 2017 11:49:45 -0700 Subject: [PATCH 5/7] change property to use to move derby.log, clean start in test, update test --- R/pkg/R/sparkR.R | 4 ++-- R/pkg/inst/tests/testthat/test_sparkSQL.R | 17 ++++++++++------- R/pkg/tests/run-all.R | 7 ++++++- .../scala/org/apache/spark/api/r/RRDD.scala | 10 ++++++---- 4 files changed, 24 insertions(+), 14 deletions(-) diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R index aab322e7b4cd..d0a12b7ecec6 100644 --- a/R/pkg/R/sparkR.R +++ b/R/pkg/R/sparkR.R @@ -390,8 +390,8 @@ sparkR.session <- function( deployMode <- sparkConfigMap[["spark.submit.deployMode"]] } - if (!exists("spark.r.sql.default.derby.dir", envir = sparkConfigMap)) { - sparkConfigMap[["spark.r.sql.default.derby.dir"]] <- tempdir() + if (!exists("spark.r.sql.derby.temp.dir", envir = sparkConfigMap)) { + sparkConfigMap[["spark.r.sql.derby.temp.dir"]] <- tempdir() } if (!exists(".sparkRjsc", envir = .sparkREnv)) { diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index cab8f71b9449..920a81b92d5c 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -60,7 +60,7 @@ unsetHiveContext <- function() { # Tests for SparkSQL functions in SparkR -filesBefore <- list.files(path = file.path(Sys.getenv("SPARK_HOME"), "R"), all.files = TRUE) +filesBefore <- list.files(path = sparkRDir, all.files = TRUE) sparkSession <- sparkR.session() sc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext", sparkSession) @@ -2923,12 +2923,15 @@ compare_list <- function(list1, list2) { test_that("No extra files are created in SPARK_HOME by starting session and making calls", { # Check that it is not creating any extra file. # Does not check the tempdir which would be cleaned up after. 
- filesAfter <- list.files(path = file.path(Sys.getenv("SPARK_HOME"), "R"), all.files = TRUE) - - expect_true(length(sparkHomeFileBefore) > 0) - compare_list(sparkHomeFileBefore, filesBefore) - - compare_list(filesBefore, filesAfter) + filesAfter <- list.files(path = sparkRDir, all.files = TRUE) + + expect_true(length(sparkRFilesBefore) > 0) + # first, ensure derby.log is not there + expect_false("derby.log" %in% filesAfter) + # second, ensure only spark-warehouse is created when calling SparkSession, enableHiveSupport = F + compare_list(sparkRFilesBefore, setdiff(filesBefore, sparkRWhitelistSQLDirs[[1]])) + # third, ensure only spark-warehouse and metastore_db are created when enableHiveSupport = T + compare_list(sparkRFilesBefore, setdiff(filesAfter, sparkRWhitelistSQLDirs)) }) unlink(parquetPath) diff --git a/R/pkg/tests/run-all.R b/R/pkg/tests/run-all.R index 46c3ca3504b9..cefaadda6e21 100644 --- a/R/pkg/tests/run-all.R +++ b/R/pkg/tests/run-all.R @@ -22,7 +22,12 @@ library(SparkR) options("warn" = 2) # Setup global test environment -sparkHomeFileBefore <- list.files(path = file.path(Sys.getenv("SPARK_HOME"), "R"), all.files = TRUE) +sparkRDir <- file.path(Sys.getenv("SPARK_HOME"), "R") +sparkRFilesBefore <- list.files(path = sparkRDir, all.files = TRUE) +sparkRWhitelistSQLDirs <- c("spark-warehouse", "metastore_db") +invisible(lapply(sparkRWhitelistSQLDirs, + function(x) { unlink(file.path(sparkRDir, x), recursive = TRUE, force = TRUE)})) + install.spark() test_package("SparkR") diff --git a/core/src/main/scala/org/apache/spark/api/r/RRDD.scala b/core/src/main/scala/org/apache/spark/api/r/RRDD.scala index b8e4b5de776c..fd8d77d8cc72 100644 --- a/core/src/main/scala/org/apache/spark/api/r/RRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/r/RRDD.scala @@ -18,6 +18,7 @@ package org.apache.spark.api.r import java.util.{Map => JMap} +import java.io.File import scala.collection.JavaConverters._ import scala.reflect.ClassTag @@ -127,11 +128,12 @@ private[r] object RRDD { sparkConf.setExecutorEnv(name.toString, value.toString) } - if (sparkEnvirMap.containsKey("spark.r.sql.default.derby.dir") && - System.getProperty("derby.system.home") == null) { + if (sparkEnvirMap.containsKey("spark.r.sql.derby.temp.dir") && + System.getProperty("derby.stream.error.file") == null) { // This must be set before SparkContext is instantiated. 
- System.setProperty("derby.system.home", - sparkEnvirMap.get("spark.r.sql.default.derby.dir").toString) + System.setProperty("derby.stream.error.file", + Seq(sparkEnvirMap.get("spark.r.sql.derby.temp.dir").toString, "derby.log") + .mkString(File.separator)) } val jsc = new JavaSparkContext(sparkConf) From 2eb75f8d73ec3ce1eb85d7c501e4c072499b8f44 Mon Sep 17 00:00:00 2001 From: Felix Cheung Date: Sun, 12 Mar 2017 12:11:46 -0700 Subject: [PATCH 6/7] import order --- core/src/main/scala/org/apache/spark/api/r/RRDD.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/api/r/RRDD.scala b/core/src/main/scala/org/apache/spark/api/r/RRDD.scala index fd8d77d8cc72..72ae0340aa3d 100644 --- a/core/src/main/scala/org/apache/spark/api/r/RRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/r/RRDD.scala @@ -17,8 +17,8 @@ package org.apache.spark.api.r -import java.util.{Map => JMap} import java.io.File +import java.util.{Map => JMap} import scala.collection.JavaConverters._ import scala.reflect.ClassTag From ac9fbfc5d511877f7775c620ff8e1c672880ee50 Mon Sep 17 00:00:00 2001 From: Felix Cheung Date: Sat, 18 Mar 2017 16:53:23 -0700 Subject: [PATCH 7/7] add more comment on test --- R/pkg/inst/tests/testthat/test_sparkSQL.R | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index 920a81b92d5c..32856b399cdd 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -2919,7 +2919,7 @@ compare_list <- function(list1, list2) { expect_equal(sort(list1, na.last = TRUE), sort(list2, na.last = TRUE)) } -# This should always be the last test in this test file. +# This should always be the **very last test** in this test file. test_that("No extra files are created in SPARK_HOME by starting session and making calls", { # Check that it is not creating any extra file. # Does not check the tempdir which would be cleaned up after. @@ -2929,8 +2929,17 @@ test_that("No extra files are created in SPARK_HOME by starting session and maki # first, ensure derby.log is not there expect_false("derby.log" %in% filesAfter) # second, ensure only spark-warehouse is created when calling SparkSession, enableHiveSupport = F + # note: currently all other test files have enableHiveSupport = F, so we capture the list of files + # before creating a SparkSession with enableHiveSupport = T at the top of this test file + # (filesBefore). The test here is to compare that (filesBefore) against the list of files before + # any test is run in run-all.R (sparkRFilesBefore). + # sparkRWhitelistSQLDirs is also defined in run-all.R, and should contain only 2 whitelisted dirs, + # here allow the first value, spark-warehouse, in the diff, everything else should be exactly the + # same as before any test is run. compare_list(sparkRFilesBefore, setdiff(filesBefore, sparkRWhitelistSQLDirs[[1]])) # third, ensure only spark-warehouse and metastore_db are created when enableHiveSupport = T + # note: as the note above, after running all tests in this file while enableHiveSupport = T, we + # check the list of files again. This time we allow both whitelisted dirs to be in the diff. compare_list(sparkRFilesBefore, setdiff(filesAfter, sparkRWhitelistSQLDirs)) })