Closed

Changes from all commits (52 commits)
bb40ef2  Shrinking the buffer up to size of the long type (MaxGekk, Mar 10, 2018)
177afcc  Fix of sample tests: particular values of sampled dataset must not be… (MaxGekk, Mar 11, 2018)
738a220  Revert "Fix of sample tests: particular values of sampled dataset mus… (MaxGekk, Feb 8, 2019)
0d18fcd  Merge remote-tracking branch 'origin/master' into hash-buff-size (MaxGekk, Feb 8, 2019)
c00b8b8  Fix PairRDDFunctionsSuite (MaxGekk, Feb 8, 2019)
05ee808  Fix JavaAPISuite (MaxGekk, Feb 8, 2019)
970fe6c  Fix RandomSuite (MaxGekk, Feb 8, 2019)
f061151  Fix DataFrameStatSuite (MaxGekk, Feb 8, 2019)
a05df5c  Fix DatasetSuite (MaxGekk, Feb 8, 2019)
ef42abe  Fix CSVSuite (MaxGekk, Feb 8, 2019)
efcc385  Fix JsonSuite (MaxGekk, Feb 8, 2019)
c2b4a95  Regenerate results of SQLQueryTestSuite (MaxGekk, Feb 9, 2019)
438b014  Change seed to fix RandomSamplerSuite (MaxGekk, Feb 9, 2019)
51c2026  Merge remote-tracking branch 'origin/master' into hash-buff-size (MaxGekk, Feb 22, 2019)
f14b038  Fix numbers in Word2VecSuite (MaxGekk, Feb 22, 2019)
97395fb  Merge branch 'master' into hash-buff-size (MaxGekk, Mar 7, 2019)
6640082  Set another seed in org.apache.spark.ml.classification.GBTClassifierS… (MaxGekk, Mar 7, 2019)
6567b8e  Set another seed in GBTRegressorSuite (MaxGekk, Mar 7, 2019)
cad048a  Set another seed in KMeansSuite (MaxGekk, Mar 7, 2019)
5c9fc29  Set another seed in StreamingKMeansSuite (MaxGekk, Mar 8, 2019)
d3ba437  Merge branch 'hash-buff-size' of github.com:MaxGekk/spark into hash-b… (MaxGekk, Mar 8, 2019)
7d32804  Regenerate expected results (MaxGekk, Mar 8, 2019)
bcda3a6  Fix expected values in "binary logistic regression with intercept wit… (MaxGekk, Mar 8, 2019)
fd5f071  Regenerate reference data for the multinomial logistic regression tests (MaxGekk, Mar 12, 2019)
a5165b7  Re-generated expected results from R code in LogisticRegressionSuite (MaxGekk, Mar 13, 2019)
39aa618  Addressing Attila's review comment (MaxGekk, Mar 13, 2019)
f71a60d  Make Java style checker happy (MaxGekk, Mar 13, 2019)
661c382  Re-generate reference data for GeneralizedLinearRegressionSuite (MaxGekk, Mar 13, 2019)
db2443d  Fix PowerIterationClusteringSuite (MaxGekk, Mar 13, 2019)
229a4e5  Fix reference values in LogisticRegressionSuite (MaxGekk, Mar 14, 2019)
71fe2dc  Revert "Fix PowerIterationClusteringSuite" (MaxGekk, Mar 14, 2019)
e9eaa76  Ignore failed tests in PowerIterationClusteringSuite (MaxGekk, Mar 14, 2019)
39aebd6  fix test_sampleby (MaxGekk, Mar 14, 2019)
47151b1  fix dataframe and functions tests (MaxGekk, Mar 14, 2019)
5595b5e  Regen reference data for PySpark ML (MaxGekk, Mar 16, 2019)
0f3fb32  Changing the seed in mllib/recommendation.py (MaxGekk, Mar 16, 2019)
7318208  fix SparkR (MaxGekk, Mar 16, 2019)
a9f5dd4  Make Python style checker happy (MaxGekk, Mar 16, 2019)
695ff15  regen expected values in tuning.py (MaxGekk, Mar 17, 2019)
25260c6  Enable tests for PIC (MaxGekk, Mar 18, 2019)
6e9d2c8  Merge remote-tracking branch 'origin/master' into hash-buff-size (MaxGekk, Mar 18, 2019)
7de88ea  Assert exact values (MaxGekk, Mar 18, 2019)
5d6a596  assert(0.0 <= q && q <= 1.0) (MaxGekk, Mar 18, 2019)
f5728ac  assert(0.0 < q && q < 1.0) (MaxGekk, Mar 18, 2019)
86892a4  Bump number of iteration up to 60 (MaxGekk, Mar 21, 2019)
efda70c  Bump number of iterations up to 220 (MaxGekk, Mar 21, 2019)
c758f57  Revert r2 to 4.0, and set n1 and n2 to 80 (MaxGekk, Mar 21, 2019)
53faaef  Merge remote-tracking branch 'origin/master' into hash-buff-size (MaxGekk, Mar 21, 2019)
471841c  Remove spaces (MaxGekk, Mar 21, 2019)
0622e96  Regen reference values (MaxGekk, Mar 21, 2019)
3754ede  Revert r2 to 4.0, and set n1 and n2 to 80 in another test (MaxGekk, Mar 22, 2019)
5774ad6  tol = 0.00001 (MaxGekk, Mar 22, 2019)
6 changes: 3 additions & 3 deletions R/pkg/tests/fulltests/test_mllib_classification.R
@@ -299,21 +299,21 @@ test_that("spark.mlp", {
df <- read.df(absoluteSparkPath("data/mllib/sample_multiclass_classification_data.txt"),
source = "libsvm")
model <- spark.mlp(df, label ~ features, blockSize = 128, layers = c(4, 5, 4, 3),
- solver = "l-bfgs", maxIter = 100, tol = 0.5, stepSize = 1, seed = 1)
+ solver = "l-bfgs", maxIter = 100, tol = 0.00001, stepSize = 1, seed = 1)

# Test summary method
summary <- summary(model)
expect_equal(summary$numOfInputs, 4)
expect_equal(summary$numOfOutputs, 3)
expect_equal(summary$layers, c(4, 5, 4, 3))
expect_equal(length(summary$weights), 64)
- expect_equal(head(summary$weights, 5), list(-0.878743, 0.2154151, -1.16304, -0.6583214, 1.009825),
+ expect_equal(head(summary$weights, 5), list(-24.28415, 107.8701, 16.86376, 1.103736, 9.244488),
tolerance = 1e-6)

# Test predict method
mlpTestDF <- df
mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction"))
- expect_equal(head(mlpPredictions$prediction, 6), c("0.0", "1.0", "1.0", "1.0", "1.0", "1.0"))
+ expect_equal(head(mlpPredictions$prediction, 6), c("1.0", "1.0", "1.0", "1.0", "0.0", "1.0"))

# Test model save/load
if (windows_with_hadoop()) {
2 changes: 1 addition & 1 deletion R/pkg/tests/fulltests/test_mllib_clustering.R
@@ -153,7 +153,7 @@ test_that("spark.kmeans", {
model <- spark.kmeans(data = training, ~ ., k = 2, maxIter = 10, initMode = "random")
sample <- take(select(predict(model, training), "prediction"), 1)
expect_equal(typeof(sample$prediction), "integer")
- expect_equal(sample$prediction, 1)
+ expect_equal(sample$prediction, 0)

# Test stats::kmeans is working
statsModel <- kmeans(x = newIris, centers = 2)
4 changes: 2 additions & 2 deletions R/pkg/tests/fulltests/test_mllib_recommendation.R
@@ -27,13 +27,13 @@ test_that("spark.als", {
list(2, 1, 1.0), list(2, 2, 5.0))
df <- createDataFrame(data, c("user", "item", "score"))
model <- spark.als(df, ratingCol = "score", userCol = "user", itemCol = "item",
- rank = 10, maxIter = 5, seed = 0, regParam = 0.1)
+ rank = 10, maxIter = 15, seed = 0, regParam = 0.1)
stats <- summary(model)
expect_equal(stats$rank, 10)
test <- createDataFrame(list(list(0, 2), list(1, 0), list(2, 0)), c("user", "item"))
predictions <- collect(predict(model, test))

- expect_equal(predictions$prediction, c(-0.1380762, 2.6258414, -1.5018409),
+ expect_equal(predictions$prediction, c(0.6324540, 3.6218479, -0.4568263),
tolerance = 1e-4)

# Test model save/load
8 changes: 4 additions & 4 deletions R/pkg/tests/fulltests/test_mllib_tree.R
@@ -148,10 +148,10 @@ test_that("spark.randomForest", {
model <- spark.randomForest(data, Employed ~ ., "regression", maxDepth = 5, maxBins = 16,
numTrees = 20, seed = 123)
predictions <- collect(predict(model, data))
- expect_equal(predictions$prediction, c(60.32820, 61.22315, 60.69025, 62.11070,
-                                        63.53160, 64.05470, 65.12710, 64.30450,
-                                        66.70910, 67.86125, 68.08700, 67.21865,
-                                        68.89275, 69.53180, 69.39640, 69.68250),
+ expect_equal(predictions$prediction, c(60.32495, 61.06495, 60.52120, 61.98500,
+                                        63.64450, 64.21910, 65.00810, 64.30450,
+                                        66.70910, 67.96875, 68.22140, 67.21865,
+                                        68.89275, 69.55900, 69.30160, 69.93050),
tolerance = 1e-4)
stats <- summary(model)
expect_equal(stats$numTrees, 20)
30 changes: 15 additions & 15 deletions R/pkg/tests/fulltests/test_sparkSQL.R
@@ -1786,9 +1786,9 @@ test_that("column binary mathfunctions", {
expect_equal(collect(select(df, shiftRight(df$b, 1)))[4, 1], 4)
expect_equal(collect(select(df, shiftRightUnsigned(df$b, 1)))[4, 1], 4)
expect_equal(class(collect(select(df, rand()))[2, 1]), "numeric")
- expect_equal(collect(select(df, rand(1)))[1, 1], 0.134, tolerance = 0.01)
+ expect_equal(collect(select(df, rand(1)))[1, 1], 0.636, tolerance = 0.01)
expect_equal(class(collect(select(df, randn()))[2, 1]), "numeric")
- expect_equal(collect(select(df, randn(1)))[1, 1], -1.03, tolerance = 0.01)
+ expect_equal(collect(select(df, randn(1)))[1, 1], 1.68, tolerance = 0.01)
})

test_that("string operators", {
@@ -2360,7 +2360,7 @@ test_that("join(), crossJoin() and merge() on a DataFrame", {
expect_equal(names(joined3), c("age", "name", "name", "test"))
expect_equal(count(joined3), 4)
expect_true(is.na(collect(orderBy(joined3, joined3$age))$age[2]))

joined4 <- join(df, df2, df$name == df2$name, "right_outer")
expect_equal(names(joined4), c("age", "name", "name", "test"))
expect_equal(count(joined4), 4)
@@ -2377,19 +2377,19 @@ test_that("join(), crossJoin() and merge() on a DataFrame", {
expect_equal(names(joined6), c("newAge", "name", "test"))
expect_equal(count(joined6), 4)
expect_equal(collect(orderBy(joined6, joined6$name))$newAge[3], 24)

joined7 <- select(join(df, df2, df$name == df2$name, "full"),
alias(df$age + 5, "newAge"), df$name, df2$test)
expect_equal(names(joined7), c("newAge", "name", "test"))
expect_equal(count(joined7), 4)
expect_equal(collect(orderBy(joined7, joined7$name))$newAge[3], 24)

joined8 <- select(join(df, df2, df$name == df2$name, "fullouter"),
alias(df$age + 5, "newAge"), df$name, df2$test)
expect_equal(names(joined8), c("newAge", "name", "test"))
expect_equal(count(joined8), 4)
expect_equal(collect(orderBy(joined8, joined8$name))$newAge[3], 24)

joined9 <- select(join(df, df2, df$name == df2$name, "full_outer"),
alias(df$age + 5, "newAge"), df$name, df2$test)
expect_equal(names(joined9), c("newAge", "name", "test"))
@@ -2400,12 +2400,12 @@ test_that("join(), crossJoin() and merge() on a DataFrame", {
expect_equal(names(joined10), c("age", "name", "name", "test"))
expect_equal(count(joined10), 3)
expect_true(is.na(collect(orderBy(joined10, joined10$age))$age[1]))

joined11 <- join(df, df2, df$name == df2$name, "leftouter")
expect_equal(names(joined11), c("age", "name", "name", "test"))
expect_equal(count(joined11), 3)
expect_true(is.na(collect(orderBy(joined11, joined11$age))$age[1]))

joined12 <- join(df, df2, df$name == df2$name, "left_outer")
expect_equal(names(joined12), c("age", "name", "name", "test"))
expect_equal(count(joined12), 3)
@@ -2418,23 +2418,23 @@ test_that("join(), crossJoin() and merge() on a DataFrame", {
joined14 <- join(df, df2, df$name == df2$name, "semi")
expect_equal(names(joined14), c("age", "name"))
expect_equal(count(joined14), 3)

joined14 <- join(df, df2, df$name == df2$name, "leftsemi")
expect_equal(names(joined14), c("age", "name"))
expect_equal(count(joined14), 3)

joined15 <- join(df, df2, df$name == df2$name, "left_semi")
expect_equal(names(joined15), c("age", "name"))
expect_equal(count(joined15), 3)

joined16 <- join(df2, df, df2$name == df$name, "anti")
expect_equal(names(joined16), c("name", "test"))
expect_equal(count(joined16), 1)

joined17 <- join(df2, df, df2$name == df$name, "leftanti")
expect_equal(names(joined17), c("name", "test"))
expect_equal(count(joined17), 1)

joined18 <- join(df2, df, df2$name == df$name, "left_anti")
expect_equal(names(joined18), c("name", "test"))
expect_equal(count(joined18), 1)
@@ -2444,7 +2444,7 @@ test_that("join(), crossJoin() and merge() on a DataFrame", {
"'left', 'leftouter', 'left_outer', 'right', 'rightouter', 'right_outer',",
"'semi', 'leftsemi', 'left_semi', 'anti', 'leftanti' or 'left_anti'.")
expect_error(join(df2, df, df2$name == df$name, "invalid"), error_msg)

merged <- merge(df, df2, by.x = "name", by.y = "name", all.x = TRUE, all.y = TRUE)
expect_equal(count(merged), 4)
expect_equal(names(merged), c("age", "name_x", "name_y", "test"))
@@ -3026,7 +3026,7 @@ test_that("sampleBy() on a DataFrame", {
sample <- sampleBy(df, "key", fractions, 0)
result <- collect(orderBy(count(groupBy(sample, "key")), "key"))
expect_identical(as.list(result[1, ]), list(key = "0", count = 3))
- expect_identical(as.list(result[2, ]), list(key = "1", count = 7))
+ expect_identical(as.list(result[2, ]), list(key = "1", count = 8))
})

test_that("approxQuantile() on a DataFrame", {
core/src/main/scala/org/apache/spark/util/random/XORShiftRandom.scala
@@ -59,7 +59,7 @@ private[spark] object XORShiftRandom {

/** Hash seeds to have 0/1 bits throughout. */
private[random] def hashSeed(seed: Long): Long = {
- val bytes = ByteBuffer.allocate(java.lang.Long.SIZE).putLong(seed).array()
+ val bytes = ByteBuffer.allocate(java.lang.Long.BYTES).putLong(seed).array()
val lowBits = MurmurHash3.bytesHash(bytes)
val highBits = MurmurHash3.bytesHash(bytes, lowBits)
(highBits.toLong << 32) | (lowBits.toLong & 0xFFFFFFFFL)
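The one-token change above is the heart of the whole PR: java.lang.Long.SIZE is 64, the size of a long in bits, while java.lang.Long.BYTES is 8, so the old code fed MurmurHash3 the eight seed bytes followed by 56 zero bytes of padding. After the fix every seed hashes to a different value, so every seeded RNG built on hashSeed yields a different stream, which is why most of this PR is regenerated test expectations. A minimal standalone sketch of the two behaviors (the object and method names are mine, not Spark's):

```scala
import java.nio.ByteBuffer
import scala.util.hashing.MurmurHash3

object HashSeedSketch {
  private def hash(bytes: Array[Byte]): Long = {
    val lowBits = MurmurHash3.bytesHash(bytes)
    val highBits = MurmurHash3.bytesHash(bytes, lowBits)
    (highBits.toLong << 32) | (lowBits.toLong & 0xFFFFFFFFL)
  }

  // Old behavior: Long.SIZE is 64 (bits), so the buffer was 64 bytes,
  // i.e. the 8-byte big-endian seed plus 56 trailing zero bytes.
  def hashSeedOld(seed: Long): Long =
    hash(ByteBuffer.allocate(java.lang.Long.SIZE).putLong(seed).array())

  // New behavior: Long.BYTES is 8, so only the seed's own bytes are hashed.
  def hashSeedNew(seed: Long): Long =
    hash(ByteBuffer.allocate(java.lang.Long.BYTES).putLong(seed).array())

  def main(args: Array[String]): Unit = {
    val seed = 42L
    // Different hash inputs give different outputs, so the two variants
    // disagree for essentially every seed.
    println(s"old: ${hashSeedOld(seed)}")
    println(s"new: ${hashSeedNew(seed)}")
  }
}
```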
9 changes: 7 additions & 2 deletions core/src/test/java/test/org/apache/spark/JavaAPISuite.java
@@ -32,6 +32,8 @@
import java.util.List;
import java.util.Map;
import java.util.concurrent.*;
+ import java.util.stream.Collectors;
+ import java.util.stream.IntStream;

import org.apache.spark.Partitioner;
import org.apache.spark.SparkConf;
@@ -156,13 +158,16 @@ public void intersection() {

@Test
public void sample() {
- List<Integer> ints = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
+ List<Integer> ints = IntStream.iterate(1, x -> x + 1)
+     .limit(20)
+     .boxed()
+     .collect(Collectors.toList());
JavaRDD<Integer> rdd = sc.parallelize(ints);
// the seeds here are "magic" to make this work out nicely
JavaRDD<Integer> sample20 = rdd.sample(true, 0.2, 8);
assertEquals(2, sample20.count());
JavaRDD<Integer> sample20WithoutReplacement = rdd.sample(false, 0.2, 2);
- assertEquals(2, sample20WithoutReplacement.count());
+ assertEquals(4, sample20WithoutReplacement.count());
}

@Test
core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala
@@ -739,7 +739,7 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext {
val dist = new BinomialDistribution(trials, p)
val q = dist.cumulativeProbability(actual)
withClue(s"p = $p: trials = $trials") {
- assert(q >= 0.001 && q <= 0.999)
+ assert(0.0 < q && q < 1.0)
}
}
}
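The relaxed assertion above (see commits 5d6a596 and f5728ac) trades strictness for determinism across seeds: the old bounds failed whenever the observed sample size landed in either 0.1% tail of the binomial distribution, which a perfectly valid seed can still produce, while the new bounds only reject outcomes whose cumulative probability is numerically 0 or 1. A rough sketch of the check with made-up inputs (the real suite derives trials, p, and the observed count from a sampled RDD):

```scala
import org.apache.commons.math3.distribution.BinomialDistribution

object BinomialCheckSketch extends App {
  // Illustrative values only, not taken from the suite.
  val trials = 1000   // elements eligible for sampling
  val p = 0.05        // sampling fraction
  val actual = 47     // hypothetical observed sample size

  val dist = new BinomialDistribution(trials, p)
  val q = dist.cumulativeProbability(actual)

  // Old check: assert(q >= 0.001 && q <= 0.999) rejected the two 0.1% tails.
  // New check: fail only for outcomes that are impossible under the model.
  assert(0.0 < q && q < 1.0, s"p = $p: trials = $trials, q = $q")
}
```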
core/src/test/scala/org/apache/spark/util/random/RandomSamplerSuite.scala
@@ -59,7 +59,7 @@ class RandomSamplerSuite extends SparkFunSuite with Matchers {
// will always fail with some nonzero probability, so I'll fix the seed to prevent these
// tests from generating random failure noise in CI testing, etc.
val rngSeed: Random = RandomSampler.newDefaultRNG
- rngSeed.setSeed(235711)
+ rngSeed.setSeed(235711345678901011L)

// Reference implementation of sampling without replacement (bernoulli)
def sample[T](data: Iterator[T], f: Double): Iterator[T] = {
mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala
@@ -345,7 +345,7 @@ class GBTClassifierSuite extends MLTest with DefaultReadWriteTest {
test("Tests of feature subset strategy") {
val numClasses = 2
val gbt = new GBTClassifier()
- .setSeed(123)
+ .setSeed(42)
.setMaxDepth(3)
.setMaxIter(5)
.setFeatureSubsetStrategy("all")