diff --git a/R/pkg/tests/fulltests/test_mllib_classification.R b/R/pkg/tests/fulltests/test_mllib_classification.R index 9fdb0cfd9b61..1f1b187aef56 100644 --- a/R/pkg/tests/fulltests/test_mllib_classification.R +++ b/R/pkg/tests/fulltests/test_mllib_classification.R @@ -299,7 +299,7 @@ test_that("spark.mlp", { df <- read.df(absoluteSparkPath("data/mllib/sample_multiclass_classification_data.txt"), source = "libsvm") model <- spark.mlp(df, label ~ features, blockSize = 128, layers = c(4, 5, 4, 3), - solver = "l-bfgs", maxIter = 100, tol = 0.5, stepSize = 1, seed = 1) + solver = "l-bfgs", maxIter = 100, tol = 0.00001, stepSize = 1, seed = 1) # Test summary method summary <- summary(model) @@ -307,13 +307,13 @@ test_that("spark.mlp", { expect_equal(summary$numOfOutputs, 3) expect_equal(summary$layers, c(4, 5, 4, 3)) expect_equal(length(summary$weights), 64) - expect_equal(head(summary$weights, 5), list(-0.878743, 0.2154151, -1.16304, -0.6583214, 1.009825), + expect_equal(head(summary$weights, 5), list(-24.28415, 107.8701, 16.86376, 1.103736, 9.244488), tolerance = 1e-6) # Test predict method mlpTestDF <- df mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction")) - expect_equal(head(mlpPredictions$prediction, 6), c("0.0", "1.0", "1.0", "1.0", "1.0", "1.0")) + expect_equal(head(mlpPredictions$prediction, 6), c("1.0", "1.0", "1.0", "1.0", "0.0", "1.0")) # Test model save/load if (windows_with_hadoop()) { diff --git a/R/pkg/tests/fulltests/test_mllib_clustering.R b/R/pkg/tests/fulltests/test_mllib_clustering.R index b78a476f1d05..028ad574b813 100644 --- a/R/pkg/tests/fulltests/test_mllib_clustering.R +++ b/R/pkg/tests/fulltests/test_mllib_clustering.R @@ -153,7 +153,7 @@ test_that("spark.kmeans", { model <- spark.kmeans(data = training, ~ ., k = 2, maxIter = 10, initMode = "random") sample <- take(select(predict(model, training), "prediction"), 1) expect_equal(typeof(sample$prediction), "integer") - expect_equal(sample$prediction, 1) + expect_equal(sample$prediction, 0) # Test stats::kmeans is working statsModel <- kmeans(x = newIris, centers = 2) diff --git a/R/pkg/tests/fulltests/test_mllib_recommendation.R b/R/pkg/tests/fulltests/test_mllib_recommendation.R index 4d919c9d746b..d50de4123aeb 100644 --- a/R/pkg/tests/fulltests/test_mllib_recommendation.R +++ b/R/pkg/tests/fulltests/test_mllib_recommendation.R @@ -27,13 +27,13 @@ test_that("spark.als", { list(2, 1, 1.0), list(2, 2, 5.0)) df <- createDataFrame(data, c("user", "item", "score")) model <- spark.als(df, ratingCol = "score", userCol = "user", itemCol = "item", - rank = 10, maxIter = 5, seed = 0, regParam = 0.1) + rank = 10, maxIter = 15, seed = 0, regParam = 0.1) stats <- summary(model) expect_equal(stats$rank, 10) test <- createDataFrame(list(list(0, 2), list(1, 0), list(2, 0)), c("user", "item")) predictions <- collect(predict(model, test)) - expect_equal(predictions$prediction, c(-0.1380762, 2.6258414, -1.5018409), + expect_equal(predictions$prediction, c(0.6324540, 3.6218479, -0.4568263), tolerance = 1e-4) # Test model save/load diff --git a/R/pkg/tests/fulltests/test_mllib_tree.R b/R/pkg/tests/fulltests/test_mllib_tree.R index facd3a941cf1..ad68700c7ff4 100644 --- a/R/pkg/tests/fulltests/test_mllib_tree.R +++ b/R/pkg/tests/fulltests/test_mllib_tree.R @@ -148,10 +148,10 @@ test_that("spark.randomForest", { model <- spark.randomForest(data, Employed ~ ., "regression", maxDepth = 5, maxBins = 16, numTrees = 20, seed = 123) predictions <- collect(predict(model, data)) - expect_equal(predictions$prediction, c(60.32820, 61.22315, 60.69025, 62.11070, - 63.53160, 64.05470, 65.12710, 64.30450, - 66.70910, 67.86125, 68.08700, 67.21865, - 68.89275, 69.53180, 69.39640, 69.68250), + expect_equal(predictions$prediction, c(60.32495, 61.06495, 60.52120, 61.98500, + 63.64450, 64.21910, 65.00810, 64.30450, + 66.70910, 67.96875, 68.22140, 67.21865, + 68.89275, 69.55900, 69.30160, 69.93050), tolerance = 1e-4) stats <- summary(model) expect_equal(stats$numTrees, 20) diff --git a/R/pkg/tests/fulltests/test_sparkSQL.R b/R/pkg/tests/fulltests/test_sparkSQL.R index cebd0f890ecf..2394f7414284 100644 --- a/R/pkg/tests/fulltests/test_sparkSQL.R +++ b/R/pkg/tests/fulltests/test_sparkSQL.R @@ -1786,9 +1786,9 @@ test_that("column binary mathfunctions", { expect_equal(collect(select(df, shiftRight(df$b, 1)))[4, 1], 4) expect_equal(collect(select(df, shiftRightUnsigned(df$b, 1)))[4, 1], 4) expect_equal(class(collect(select(df, rand()))[2, 1]), "numeric") - expect_equal(collect(select(df, rand(1)))[1, 1], 0.134, tolerance = 0.01) + expect_equal(collect(select(df, rand(1)))[1, 1], 0.636, tolerance = 0.01) expect_equal(class(collect(select(df, randn()))[2, 1]), "numeric") - expect_equal(collect(select(df, randn(1)))[1, 1], -1.03, tolerance = 0.01) + expect_equal(collect(select(df, randn(1)))[1, 1], 1.68, tolerance = 0.01) }) test_that("string operators", { @@ -2360,7 +2360,7 @@ test_that("join(), crossJoin() and merge() on a DataFrame", { expect_equal(names(joined3), c("age", "name", "name", "test")) expect_equal(count(joined3), 4) expect_true(is.na(collect(orderBy(joined3, joined3$age))$age[2])) - + joined4 <- join(df, df2, df$name == df2$name, "right_outer") expect_equal(names(joined4), c("age", "name", "name", "test")) expect_equal(count(joined4), 4) @@ -2377,19 +2377,19 @@ test_that("join(), crossJoin() and merge() on a DataFrame", { expect_equal(names(joined6), c("newAge", "name", "test")) expect_equal(count(joined6), 4) expect_equal(collect(orderBy(joined6, joined6$name))$newAge[3], 24) - + joined7 <- select(join(df, df2, df$name == df2$name, "full"), alias(df$age + 5, "newAge"), df$name, df2$test) expect_equal(names(joined7), c("newAge", "name", "test")) expect_equal(count(joined7), 4) expect_equal(collect(orderBy(joined7, joined7$name))$newAge[3], 24) - + joined8 <- select(join(df, df2, df$name == df2$name, "fullouter"), alias(df$age + 5, "newAge"), df$name, df2$test) expect_equal(names(joined8), c("newAge", "name", "test")) expect_equal(count(joined8), 4) expect_equal(collect(orderBy(joined8, joined8$name))$newAge[3], 24) - + joined9 <- select(join(df, df2, df$name == df2$name, "full_outer"), alias(df$age + 5, "newAge"), df$name, df2$test) expect_equal(names(joined9), c("newAge", "name", "test")) @@ -2400,12 +2400,12 @@ test_that("join(), crossJoin() and merge() on a DataFrame", { expect_equal(names(joined10), c("age", "name", "name", "test")) expect_equal(count(joined10), 3) expect_true(is.na(collect(orderBy(joined10, joined10$age))$age[1])) - + joined11 <- join(df, df2, df$name == df2$name, "leftouter") expect_equal(names(joined11), c("age", "name", "name", "test")) expect_equal(count(joined11), 3) expect_true(is.na(collect(orderBy(joined11, joined11$age))$age[1])) - + joined12 <- join(df, df2, df$name == df2$name, "left_outer") expect_equal(names(joined12), c("age", "name", "name", "test")) expect_equal(count(joined12), 3) @@ -2418,23 +2418,23 @@ test_that("join(), crossJoin() and merge() on a DataFrame", { joined14 <- join(df, df2, df$name == df2$name, "semi") expect_equal(names(joined14), c("age", "name")) expect_equal(count(joined14), 3) - + joined14 <- join(df, df2, df$name == df2$name, "leftsemi") expect_equal(names(joined14), c("age", "name")) expect_equal(count(joined14), 3) - + joined15 <- join(df, df2, df$name == df2$name, "left_semi") expect_equal(names(joined15), c("age", "name")) expect_equal(count(joined15), 3) - + joined16 <- join(df2, df, df2$name == df$name, "anti") expect_equal(names(joined16), c("name", "test")) expect_equal(count(joined16), 1) - + joined17 <- join(df2, df, df2$name == df$name, "leftanti") expect_equal(names(joined17), c("name", "test")) expect_equal(count(joined17), 1) - + joined18 <- join(df2, df, df2$name == df$name, "left_anti") expect_equal(names(joined18), c("name", "test")) expect_equal(count(joined18), 1) @@ -2444,7 +2444,7 @@ test_that("join(), crossJoin() and merge() on a DataFrame", { "'left', 'leftouter', 'left_outer', 'right', 'rightouter', 'right_outer',", "'semi', 'leftsemi', 'left_semi', 'anti', 'leftanti' or 'left_anti'.") expect_error(join(df2, df, df2$name == df$name, "invalid"), error_msg) - + merged <- merge(df, df2, by.x = "name", by.y = "name", all.x = TRUE, all.y = TRUE) expect_equal(count(merged), 4) expect_equal(names(merged), c("age", "name_x", "name_y", "test")) @@ -3026,7 +3026,7 @@ test_that("sampleBy() on a DataFrame", { sample <- sampleBy(df, "key", fractions, 0) result <- collect(orderBy(count(groupBy(sample, "key")), "key")) expect_identical(as.list(result[1, ]), list(key = "0", count = 3)) - expect_identical(as.list(result[2, ]), list(key = "1", count = 7)) + expect_identical(as.list(result[2, ]), list(key = "1", count = 8)) }) test_that("approxQuantile() on a DataFrame", { diff --git a/core/src/main/scala/org/apache/spark/util/random/XORShiftRandom.scala b/core/src/main/scala/org/apache/spark/util/random/XORShiftRandom.scala index e47275643267..af09e50a157a 100644 --- a/core/src/main/scala/org/apache/spark/util/random/XORShiftRandom.scala +++ b/core/src/main/scala/org/apache/spark/util/random/XORShiftRandom.scala @@ -59,7 +59,7 @@ private[spark] object XORShiftRandom { /** Hash seeds to have 0/1 bits throughout. */ private[random] def hashSeed(seed: Long): Long = { - val bytes = ByteBuffer.allocate(java.lang.Long.SIZE).putLong(seed).array() + val bytes = ByteBuffer.allocate(java.lang.Long.BYTES).putLong(seed).array() val lowBits = MurmurHash3.bytesHash(bytes) val highBits = MurmurHash3.bytesHash(bytes, lowBits) (highBits.toLong << 32) | (lowBits.toLong & 0xFFFFFFFFL) diff --git a/core/src/test/java/test/org/apache/spark/JavaAPISuite.java b/core/src/test/java/test/org/apache/spark/JavaAPISuite.java index f979f9e8bb95..a8252e03b5c1 100644 --- a/core/src/test/java/test/org/apache/spark/JavaAPISuite.java +++ b/core/src/test/java/test/org/apache/spark/JavaAPISuite.java @@ -32,6 +32,8 @@ import java.util.List; import java.util.Map; import java.util.concurrent.*; +import java.util.stream.Collectors; +import java.util.stream.IntStream; import org.apache.spark.Partitioner; import org.apache.spark.SparkConf; @@ -156,13 +158,16 @@ public void intersection() { @Test public void sample() { - List ints = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10); + List ints = IntStream.iterate(1, x -> x + 1) + .limit(20) + .boxed() + .collect(Collectors.toList()); JavaRDD rdd = sc.parallelize(ints); // the seeds here are "magic" to make this work out nicely JavaRDD sample20 = rdd.sample(true, 0.2, 8); assertEquals(2, sample20.count()); JavaRDD sample20WithoutReplacement = rdd.sample(false, 0.2, 2); - assertEquals(2, sample20WithoutReplacement.count()); + assertEquals(4, sample20WithoutReplacement.count()); } @Test diff --git a/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala b/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala index 945b09441ea9..1564435a0bba 100644 --- a/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala +++ b/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala @@ -739,7 +739,7 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { val dist = new BinomialDistribution(trials, p) val q = dist.cumulativeProbability(actual) withClue(s"p = $p: trials = $trials") { - assert(q >= 0.001 && q <= 0.999) + assert(0.0 < q && q < 1.0) } } } diff --git a/core/src/test/scala/org/apache/spark/util/random/RandomSamplerSuite.scala b/core/src/test/scala/org/apache/spark/util/random/RandomSamplerSuite.scala index 7eb2f56c2058..c2e3830d955c 100644 --- a/core/src/test/scala/org/apache/spark/util/random/RandomSamplerSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/random/RandomSamplerSuite.scala @@ -59,7 +59,7 @@ class RandomSamplerSuite extends SparkFunSuite with Matchers { // will always fail with some nonzero probability, so I'll fix the seed to prevent these // tests from generating random failure noise in CI testing, etc. val rngSeed: Random = RandomSampler.newDefaultRNG - rngSeed.setSeed(235711) + rngSeed.setSeed(235711345678901011L) // Reference implementation of sampling without replacement (bernoulli) def sample[T](data: Iterator[T], f: Double): Iterator[T] = { diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala index cd59900c521c..379e14fbc057 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala @@ -345,7 +345,7 @@ class GBTClassifierSuite extends MLTest with DefaultReadWriteTest { test("Tests of feature subset strategy") { val numClasses = 2 val gbt = new GBTClassifier() - .setSeed(123) + .setSeed(42) .setMaxDepth(3) .setMaxIter(5) .setFeatureSubsetStrategy("all") diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala index 24998926abd8..9af7fff2a6e3 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala @@ -664,18 +664,16 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { coefficients = coef(glmnet(features, label, weights=w, family="binomial", alpha = 0, lambda = 0)) coefficients - $`0` 5 x 1 sparse Matrix of class "dgCMatrix" s0 - (Intercept) 2.7355261 - data.V3 -0.5734389 - data.V4 0.8911736 - data.V5 -0.3878645 - data.V6 -0.8060570 - + (Intercept) 2.7114519 + data.V3 -0.5667801 + data.V4 0.8818754 + data.V5 -0.3882505 + data.V6 -0.7891183 */ - val coefficientsR = Vectors.dense(-0.5734389, 0.8911736, -0.3878645, -0.8060570) - val interceptR = 2.7355261 + val coefficientsR = Vectors.dense(-0.5667801, 0.8818754, -0.3882505, -0.7891183) + val interceptR = 2.7114519 assert(model1.intercept ~== interceptR relTol 1E-3) assert(model1.coefficients ~= coefficientsR relTol 1E-3) @@ -707,7 +705,8 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { val model2 = trainer2.fit(binaryDataset) // The solution is generated by https://github.com/yanboliang/bound-optimization. - val coefficientsExpected1 = Vectors.dense(0.06079437, 0.0, -0.26351059, -0.59102199) + val coefficientsExpected1 = Vectors.dense( + 0.05997387390575594, 0.0, -0.26536616889454984, -0.5793842425088045) val interceptExpected1 = 1.0 assert(model1.intercept ~== interceptExpected1 relTol 1E-3) @@ -742,8 +741,8 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { val model4 = trainer4.fit(binaryDataset) // The solution is generated by https://github.com/yanboliang/bound-optimization. - val coefficientsExpected3 = Vectors.dense(0.0, 0.0, 0.0, -0.71708632) - val interceptExpected3 = 0.58776113 + val coefficientsExpected3 = Vectors.dense(0.0, 0.0, 0.0, -0.7003382019888361) + val interceptExpected3 = 0.5673234605102715 assert(model3.intercept ~== interceptExpected3 relTol 1E-3) assert(model3.coefficients ~= coefficientsExpected3 relTol 1E-3) @@ -775,8 +774,9 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { // The solution is generated by https://github.com/yanboliang/bound-optimization. // It should be same as unbound constrained optimization with LBFGS. - val coefficientsExpected5 = Vectors.dense(-0.5734389, 0.8911736, -0.3878645, -0.8060570) - val interceptExpected5 = 2.7355261 + val coefficientsExpected5 = Vectors.dense( + -0.5667990118366208, 0.8819300812352234, -0.38825593561750166, -0.7891233856979563) + val interceptExpected5 = 2.711413425425 assert(model5.intercept ~== interceptExpected5 relTol 1E-3) assert(model5.coefficients ~= coefficientsExpected5 relTol 1E-3) @@ -810,13 +810,13 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { 5 x 1 sparse Matrix of class "dgCMatrix" s0 (Intercept) . - data.V3 -0.3448461 - data.V4 1.2776453 - data.V5 -0.3539178 - data.V6 -0.7469384 + data.V3 -0.3451301 + data.V4 1.2721785 + data.V5 -0.3537743 + data.V6 -0.7315618 */ - val coefficientsR = Vectors.dense(-0.3448461, 1.2776453, -0.3539178, -0.7469384) + val coefficientsR = Vectors.dense(-0.3451301, 1.2721785, -0.3537743, -0.7315618) assert(model1.intercept ~== 0.0 relTol 1E-3) assert(model1.coefficients ~= coefficientsR relTol 1E-2) @@ -844,7 +844,8 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { val model2 = trainer2.fit(binaryDataset) // The solution is generated by https://github.com/yanboliang/bound-optimization. - val coefficientsExpected = Vectors.dense(0.20847553, 0.0, -0.24240289, -0.55568071) + val coefficientsExpected = Vectors.dense( + 0.20721074484293306, 0.0, -0.24389739190279183, -0.5446655961212726) assert(model1.intercept ~== 0.0 relTol 1E-3) assert(model1.coefficients ~= coefficientsExpected relTol 1E-3) @@ -877,15 +878,15 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { $`0` 5 x 1 sparse Matrix of class "dgCMatrix" s0 - (Intercept) -0.06775980 + (Intercept) -0.07157076 data.V3 . data.V4 . - data.V5 -0.03933146 - data.V6 -0.03047580 + data.V5 -0.04058143 + data.V6 -0.02322760 */ - val coefficientsRStd = Vectors.dense(0.0, 0.0, -0.03933146, -0.03047580) - val interceptRStd = -0.06775980 + val coefficientsRStd = Vectors.dense(0.0, 0.0, -0.04058143, -0.02322760) + val interceptRStd = -0.07157076 assert(model1.intercept ~== interceptRStd relTol 1E-2) assert(model1.coefficients ~= coefficientsRStd absTol 2E-2) @@ -904,15 +905,15 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { $`0` 5 x 1 sparse Matrix of class "dgCMatrix" s0 - (Intercept) 0.3544768 + (Intercept) 0.3602029 data.V3 . data.V4 . - data.V5 -0.1626191 + data.V5 -0.1635707 data.V6 . */ - val coefficientsR = Vectors.dense(0.0, 0.0, -0.1626191, 0.0) - val interceptR = 0.3544768 + val coefficientsR = Vectors.dense(0.0, 0.0, -0.1635707, 0.0) + val interceptR = 0.3602029 assert(model2.intercept ~== interceptR relTol 1E-2) assert(model2.coefficients ~== coefficientsR absTol 1E-3) @@ -945,8 +946,8 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { (Intercept) . data.V3 . data.V4 . - data.V5 -0.04967635 - data.V6 -0.04757757 + data.V5 -0.05164150 + data.V6 -0.04079129 coefficients 5 x 1 sparse Matrix of class "dgCMatrix" @@ -954,13 +955,13 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { (Intercept) . data.V3 . data.V4 . - data.V5 -0.08433195 + data.V5 -0.08408014 data.V6 . */ - val coefficientsRStd = Vectors.dense(0.0, 0.0, -0.04967635, -0.04757757) + val coefficientsRStd = Vectors.dense(0.0, 0.0, -0.05164150, -0.04079129) - val coefficientsR = Vectors.dense(0.0, 0.0, -0.08433195, 0.0) + val coefficientsR = Vectors.dense(0.0, 0.0, -0.08408014, 0.0) assert(model1.intercept ~== 0.0 absTol 1E-3) assert(model1.coefficients ~= coefficientsRStd absTol 1E-3) @@ -992,26 +993,26 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { coefficientsStd 5 x 1 sparse Matrix of class "dgCMatrix" s0 - (Intercept) 0.12707703 - data.V3 -0.06980967 - data.V4 0.10803933 - data.V5 -0.04800404 - data.V6 -0.10165096 + (Intercept) 0.12943705 + data.V3 -0.06979418 + data.V4 0.10691465 + data.V5 -0.04835674 + data.V6 -0.09939108 coefficients 5 x 1 sparse Matrix of class "dgCMatrix" s0 - (Intercept) 0.46613016 - data.V3 -0.04944529 - data.V4 0.02326772 - data.V5 -0.11362772 - data.V6 -0.06312848 + (Intercept) 0.47553535 + data.V3 -0.05058465 + data.V4 0.02296823 + data.V5 -0.11368284 + data.V6 -0.06309008 */ - val coefficientsRStd = Vectors.dense(-0.06980967, 0.10803933, -0.04800404, -0.10165096) - val interceptRStd = 0.12707703 - val coefficientsR = Vectors.dense(-0.04944529, 0.02326772, -0.11362772, -0.06312848) - val interceptR = 0.46613016 + val coefficientsRStd = Vectors.dense(-0.06979418, 0.10691465, -0.04835674, -0.09939108) + val interceptRStd = 0.12943705 + val coefficientsR = Vectors.dense(-0.05058465, 0.02296823, -0.11368284, -0.06309008) + val interceptR = 0.47553535 assert(model1.intercept ~== interceptRStd relTol 1E-3) assert(model1.coefficients ~= coefficientsRStd relTol 1E-3) @@ -1042,10 +1043,12 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { val model2 = trainer2.fit(binaryDataset) // The solution is generated by https://github.com/yanboliang/bound-optimization. - val coefficientsExpectedWithStd = Vectors.dense(-0.06985003, 0.0, -0.04794278, -0.10168595) - val interceptExpectedWithStd = 0.45750141 - val coefficientsExpected = Vectors.dense(-0.0494524, 0.0, -0.11360797, -0.06313577) - val interceptExpected = 0.53722967 + val coefficientsExpectedWithStd = Vectors.dense( + -0.06974410278847253, 0.0, -0.04833486093952599, -0.09941770618793982) + val interceptExpectedWithStd = 0.4564981350661977 + val coefficientsExpected = Vectors.dense( + -0.050579069523730306, 0.0, -0.11367447252893222, -0.06309435539607525) + val interceptExpected = 0.5457873335999178 assert(model1.intercept ~== interceptExpectedWithStd relTol 1E-3) assert(model1.coefficients ~= coefficientsExpectedWithStd relTol 1E-3) @@ -1078,23 +1081,24 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { 5 x 1 sparse Matrix of class "dgCMatrix" s0 (Intercept) . - data.V3 -0.06000152 - data.V4 0.12598737 - data.V5 -0.04669009 - data.V6 -0.09941025 + data.V3 -0.05998915 + data.V4 0.12541885 + data.V5 -0.04697872 + data.V6 -0.09713973 coefficients 5 x 1 sparse Matrix of class "dgCMatrix" s0 (Intercept) . - data.V3 -0.005482255 - data.V4 0.048106338 - data.V5 -0.093411640 - data.V6 -0.054149798 + data.V3 -0.005927466 + data.V4 0.048313659 + data.V5 -0.092956052 + data.V6 -0.053974895 */ - val coefficientsRStd = Vectors.dense(-0.06000152, 0.12598737, -0.04669009, -0.09941025) - val coefficientsR = Vectors.dense(-0.005482255, 0.048106338, -0.093411640, -0.054149798) + val coefficientsRStd = Vectors.dense(-0.05998915, 0.12541885, -0.04697872, -0.09713973) + val coefficientsR = Vectors.dense( + -0.0059320221190687205, 0.04834399477383437, -0.09296353778288495, -0.05398080548228108) assert(model1.intercept ~== 0.0 absTol 1E-3) assert(model1.coefficients ~= coefficientsRStd relTol 1E-2) @@ -1122,8 +1126,10 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { val model2 = trainer2.fit(binaryDataset) // The solution is generated by https://github.com/yanboliang/bound-optimization. - val coefficientsExpectedWithStd = Vectors.dense(-0.00796538, 0.0, -0.0394228, -0.0873314) - val coefficientsExpected = Vectors.dense(0.01105972, 0.0, -0.08574949, -0.05079558) + val coefficientsExpectedWithStd = Vectors.dense( + -0.00845365508769699, 0.0, -0.03954848648474558, -0.0851639471468608) + val coefficientsExpected = Vectors.dense( + 0.010675769768102661, 0.0, -0.0852582080623827, -0.050615535080106376) assert(model1.intercept ~== 0.0 relTol 1E-3) assert(model1.coefficients ~= coefficientsExpectedWithStd relTol 1E-3) @@ -1134,7 +1140,7 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { test("binary logistic regression with intercept with ElasticNet regularization") { val trainer1 = (new LogisticRegression).setFitIntercept(true).setMaxIter(120) .setElasticNetParam(0.38).setRegParam(0.21).setStandardization(true).setWeightCol("weight") - val trainer2 = (new LogisticRegression).setFitIntercept(true).setMaxIter(30) + val trainer2 = (new LogisticRegression).setFitIntercept(true).setMaxIter(60) .setElasticNetParam(0.38).setRegParam(0.21).setStandardization(false).setWeightCol("weight") val model1 = trainer1.fit(binaryDataset) @@ -1155,26 +1161,26 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { coefficientsStd 5 x 1 sparse Matrix of class "dgCMatrix" s0 - (Intercept) 0.49991996 - data.V3 -0.04131110 + (Intercept) 0.51344133 + data.V3 -0.04395595 data.V4 . - data.V5 -0.08585233 - data.V6 -0.15875400 + data.V5 -0.08699271 + data.V6 -0.15249200 coefficients 5 x 1 sparse Matrix of class "dgCMatrix" s0 - (Intercept) 0.5024256 + (Intercept) 0.50936159 data.V3 . data.V4 . - data.V5 -0.1846038 - data.V6 -0.0559614 + data.V5 -0.18569346 + data.V6 -0.05625862 */ - val coefficientsRStd = Vectors.dense(-0.04131110, 0.0, -0.08585233, -0.15875400) - val interceptRStd = 0.49991996 - val coefficientsR = Vectors.dense(0.0, 0.0, -0.1846038, -0.0559614) - val interceptR = 0.5024256 + val coefficientsRStd = Vectors.dense(-0.04395595, 0.0, -0.08699271, -0.15249200) + val interceptRStd = 0.51344133 + val coefficientsR = Vectors.dense(0.0, 0.0, -0.18569346, -0.05625862) + val interceptR = 0.50936159 assert(model1.intercept ~== interceptRStd relTol 6E-2) assert(model1.coefficients ~== coefficientsRStd absTol 5E-3) @@ -1285,13 +1291,13 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { 5 x 1 sparse Matrix of class "dgCMatrix" s0 - (Intercept) -0.2516986 + (Intercept) -0.2521953 data.V3 0.0000000 data.V4 . data.V5 . data.V6 . */ - val interceptR = -0.2516986 + val interceptR = -0.2521953 val coefficientsR = Vectors.dense(0.0, 0.0, 0.0, 0.0) assert(model1.intercept ~== interceptR relTol 1E-5) @@ -1373,37 +1379,36 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { $`0` 5 x 1 sparse Matrix of class "dgCMatrix" s0 - -2.10320093 - data.V3 0.24337896 - data.V4 -0.05916156 - data.V5 0.14446790 - data.V6 0.35976165 + -2.22347257 + data.V3 0.24574397 + data.V4 -0.04054235 + data.V5 0.14963756 + data.V6 0.37504027 $`1` 5 x 1 sparse Matrix of class "dgCMatrix" s0 - 0.3394473 - data.V3 -0.3443375 - data.V4 0.9181331 - data.V5 -0.2283959 - data.V6 -0.4388066 + 0.3674309 + data.V3 -0.3266910 + data.V4 0.8939282 + data.V5 -0.2363519 + data.V6 -0.4631336 $`2` 5 x 1 sparse Matrix of class "dgCMatrix" s0 - 1.76375361 - data.V3 0.10095851 - data.V4 -0.85897154 - data.V5 0.08392798 - data.V6 0.07904499 - + 1.85604170 + data.V3 0.08094703 + data.V4 -0.85338588 + data.V5 0.08671439 + data.V6 0.08809332 */ val coefficientsR = new DenseMatrix(3, 4, Array( - 0.24337896, -0.05916156, 0.14446790, 0.35976165, - -0.3443375, 0.9181331, -0.2283959, -0.4388066, - 0.10095851, -0.85897154, 0.08392798, 0.07904499), isTransposed = true) - val interceptsR = Vectors.dense(-2.10320093, 0.3394473, 1.76375361) + 0.24574397, -0.04054235, 0.14963756, 0.37504027, + -0.3266910, 0.8939282, -0.2363519, -0.4631336, + 0.08094703, -0.85338588, 0.08671439, 0.08809332), isTransposed = true) + val interceptsR = Vectors.dense(-2.22347257, 0.3674309, 1.85604170) model1.coefficientMatrix.colIter.foreach(v => assert(v.toArray.sum ~== 0.0 absTol eps)) model2.coefficientMatrix.colIter.foreach(v => assert(v.toArray.sum ~== 0.0 absTol eps)) @@ -1496,10 +1501,12 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { // The solution is generated by https://github.com/yanboliang/bound-optimization. val coefficientsExpected1 = new DenseMatrix(3, 4, Array( - 2.52076464, 2.73596057, 1.87984904, 2.73264492, - 1.93302281, 3.71363303, 1.50681746, 1.93398782, - 2.37839917, 1.93601818, 1.81924758, 2.45191255), isTransposed = true) - val interceptsExpected1 = Vectors.dense(1.00010477, 3.44237083, 4.86740286) + 2.1156620676212325, 2.7146375863138825, 1.8108730417428125, 2.711975470258063, + 1.54314110882009, 3.648963914233324, 1.4248901324480239, 1.8737908246138315, + 1.950852726788052, 1.9017484391817425, 1.7479497661988832, 2.425055298693075), + isTransposed = true) + val interceptsExpected1 = Vectors.dense( + 1.0000152482448372, 3.591773288423673, 5.079685953744937) checkCoefficientsEquivalent(model1.coefficientMatrix, coefficientsExpected1) assert(model1.interceptVector ~== interceptsExpected1 relTol 0.01) @@ -1532,9 +1539,10 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { // The solution is generated by https://github.com/yanboliang/bound-optimization. val coefficientsExpected3 = new DenseMatrix(3, 4, Array( - 1.61967097, 1.16027835, 1.45131448, 1.97390431, - 1.30529317, 2.0, 1.12985473, 1.26652854, - 1.61647195, 1.0, 1.40642959, 1.72985589), isTransposed = true) + 1.641980508924569, 1.1579023489264648, 1.434651352010351, 1.9541352988127463, + 1.3416273422126057, 2.0, 1.1014102844446283, 1.2076556940852765, + 1.6371808928302913, 1.0, 1.3936094723717016, 1.71022540576362), + isTransposed = true) val interceptsExpected3 = Vectors.dense(1.0, 2.0, 2.0) checkCoefficientsEquivalent(model3.coefficientMatrix, coefficientsExpected3) @@ -1566,10 +1574,12 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { // The solution is generated by https://github.com/yanboliang/bound-optimization. // It should be same as unbound constrained optimization with LBFGS. val coefficientsExpected5 = new DenseMatrix(3, 4, Array( - 0.24337896, -0.05916156, 0.14446790, 0.35976165, - -0.3443375, 0.9181331, -0.2283959, -0.4388066, - 0.10095851, -0.85897154, 0.08392798, 0.07904499), isTransposed = true) - val interceptsExpected5 = Vectors.dense(-2.10320093, 0.3394473, 1.76375361) + 0.24573204902629314, -0.040610820463585905, 0.14962716893619094, 0.37502549108817784, + -0.3266914048842952, 0.8940567211111817, -0.23633898260880218, -0.4631024664883818, + 0.08095935585808962, -0.8534459006476851, 0.0867118136726069, 0.0880769754002182), + isTransposed = true) + val interceptsExpected5 = Vectors.dense( + -2.2231282183460723, 0.3669496747012527, 1.856178543644802) checkCoefficientsEquivalent(model5.coefficientMatrix, coefficientsExpected5) assert(model5.interceptVector ~== interceptsExpected5 relTol 0.01) @@ -1602,35 +1612,35 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { 5 x 1 sparse Matrix of class "dgCMatrix" s0 . - data.V3 0.07276291 - data.V4 -0.36325496 - data.V5 0.12015088 - data.V6 0.31397340 + data.V3 0.06892068 + data.V4 -0.36546704 + data.V5 0.12274583 + data.V6 0.32616580 $`1` 5 x 1 sparse Matrix of class "dgCMatrix" s0 . - data.V3 -0.3180040 - data.V4 0.9679074 - data.V5 -0.2252219 - data.V6 -0.4319914 + data.V3 -0.2987384 + data.V4 0.9483147 + data.V5 -0.2328113 + data.V6 -0.4555157 $`2` 5 x 1 sparse Matrix of class "dgCMatrix" s0 . - data.V3 0.2452411 - data.V4 -0.6046524 - data.V5 0.1050710 - data.V6 0.1180180 + data.V3 0.2298177 + data.V4 -0.5828477 + data.V5 0.1100655 + data.V6 0.1293499 */ val coefficientsR = new DenseMatrix(3, 4, Array( - 0.07276291, -0.36325496, 0.12015088, 0.31397340, - -0.3180040, 0.9679074, -0.2252219, -0.4319914, - 0.2452411, -0.6046524, 0.1050710, 0.1180180), isTransposed = true) + 0.06892068, -0.36546704, 0.12274583, 0.32616580, + -0.2987384, 0.9483147, -0.2328113, -0.4555157, + 0.2298177, -0.5828477, 0.1100655, 0.1293499), isTransposed = true) model1.coefficientMatrix.colIter.foreach(v => assert(v.toArray.sum ~== 0.0 absTol eps)) model2.coefficientMatrix.colIter.foreach(v => assert(v.toArray.sum ~== 0.0 absTol eps)) @@ -1664,9 +1674,10 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { // The solution is generated by https://github.com/yanboliang/bound-optimization. val coefficientsExpected = new DenseMatrix(3, 4, Array( - 1.62410051, 1.38219391, 1.34486618, 1.74641729, - 1.23058989, 2.71787825, 1.0, 1.00007073, - 1.79478632, 1.14360459, 1.33011603, 1.55093897), isTransposed = true) + 1.5933935326002155, 1.4427758360562475, 1.356079506266844, 1.7818682794856215, + 1.2224266732592248, 2.762691362720858, 1.0005885171478472, 1.0000022613855966, + 1.7524631428961193, 1.2292565990448736, 1.3433784431904323, 1.5846063017678864), + isTransposed = true) checkCoefficientsEquivalent(model1.coefficientMatrix, coefficientsExpected) assert(model1.interceptVector.toArray === Array.fill(3)(0.0)) @@ -1703,27 +1714,27 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { $`0` 5 x 1 sparse Matrix of class "dgCMatrix" s0 - -0.62244703 + -0.69265374 data.V3 . data.V4 . data.V5 . - data.V6 0.08419825 + data.V6 0.09064661 $`1` 5 x 1 sparse Matrix of class "dgCMatrix" s0 - -0.2804845 - data.V3 -0.1336960 - data.V4 0.3717091 - data.V5 -0.1530363 - data.V6 -0.2035286 + -0.2260274 + data.V3 -0.1144333 + data.V4 0.3204703 + data.V5 -0.1621061 + data.V6 -0.2308192 $`2` 5 x 1 sparse Matrix of class "dgCMatrix" s0 - 0.9029315 + 0.9186811 data.V3 . - data.V4 -0.4629737 + data.V4 -0.4832131 data.V5 . data.V6 . @@ -1732,25 +1743,25 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { $`0` 5 x 1 sparse Matrix of class "dgCMatrix" s0 - -0.44215290 + -0.44707756 data.V3 . data.V4 . - data.V5 0.01767089 - data.V6 0.02542866 + data.V5 0.01641412 + data.V6 0.03570376 $`1` 5 x 1 sparse Matrix of class "dgCMatrix" s0 - 0.76308326 - data.V3 -0.06818576 + 0.75180900 + data.V3 -0.05110822 data.V4 . - data.V5 -0.20446351 - data.V6 -0.13017924 + data.V5 -0.21595670 + data.V6 -0.16162836 $`2` 5 x 1 sparse Matrix of class "dgCMatrix" s0 - -0.3209304 + -0.3047314 data.V3 . data.V4 . data.V5 . @@ -1759,15 +1770,15 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { */ val coefficientsRStd = new DenseMatrix(3, 4, Array( - 0.0, 0.0, 0.0, 0.08419825, - -0.1336960, 0.3717091, -0.1530363, -0.2035286, - 0.0, -0.4629737, 0.0, 0.0), isTransposed = true) - val interceptsRStd = Vectors.dense(-0.62244703, -0.2804845, 0.9029315) + 0.0, 0.0, 0.0, 0.09064661, + -0.1144333, 0.3204703, -0.1621061, -0.2308192, + 0.0, -0.4832131, 0.0, 0.0), isTransposed = true) + val interceptsRStd = Vectors.dense(-0.72638218, -0.01737265, 0.74375484) val coefficientsR = new DenseMatrix(3, 4, Array( - 0.0, 0.0, 0.01767089, 0.02542866, - -0.06818576, 0.0, -0.20446351, -0.13017924, + 0.0, 0.0, 0.01641412, 0.03570376, + -0.05110822, 0.0, -0.21595670, -0.16162836, 0.0, 0.0, 0.0, 0.0), isTransposed = true) - val interceptsR = Vectors.dense(-0.44215290, 0.76308326, -0.3209304) + val interceptsR = Vectors.dense(-0.44707756, 0.75180900, -0.3047314) assert(model1.coefficientMatrix ~== coefficientsRStd absTol 0.05) assert(model1.interceptVector ~== interceptsRStd relTol 0.1) @@ -1800,31 +1811,30 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { coefficientsStd $`0` 5 x 1 sparse Matrix of class "dgCMatrix" - s0 + s0 . data.V3 . data.V4 . data.V5 . - data.V6 0.01144225 + data.V6 0.01167 $`1` 5 x 1 sparse Matrix of class "dgCMatrix" s0 . - data.V3 -0.1678787 - data.V4 0.5385351 - data.V5 -0.1573039 - data.V6 -0.2471624 + data.V3 -0.1413518 + data.V4 0.5100469 + data.V5 -0.1658025 + data.V6 -0.2755998 $`2` 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - . - data.V3 . - data.V4 . - data.V5 . - data.V6 . - + s0 + . + data.V3 0.001536337 + data.V4 . + data.V5 . + data.V6 . coefficients $`0` @@ -1841,9 +1851,9 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { s0 . data.V3 . - data.V4 0.1929409 - data.V5 -0.1889121 - data.V6 -0.1010413 + data.V4 0.2094410 + data.V5 -0.1944582 + data.V6 -0.1307681 $`2` 5 x 1 sparse Matrix of class "dgCMatrix" @@ -1857,13 +1867,13 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { */ val coefficientsRStd = new DenseMatrix(3, 4, Array( - 0.0, 0.0, 0.0, 0.01144225, - -0.1678787, 0.5385351, -0.1573039, -0.2471624, - 0.0, 0.0, 0.0, 0.0), isTransposed = true) + 0.0, 0.0, 0.0, 0.01167, + -0.1413518, 0.5100469, -0.1658025, -0.2755998, + 0.001536337, 0.0, 0.0, 0.0), isTransposed = true) val coefficientsR = new DenseMatrix(3, 4, Array( 0.0, 0.0, 0.0, 0.0, - 0.0, 0.1929409, -0.1889121, -0.1010413, + 0.0, 0.2094410, -0.1944582, -0.1307681, 0.0, 0.0, 0.0, 0.0), isTransposed = true) assert(model1.coefficientMatrix ~== coefficientsRStd absTol 0.01) @@ -1897,72 +1907,71 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { coefficientsStd $`0` 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - -1.5898288335 - data.V3 0.1691226336 - data.V4 0.0002983651 - data.V5 0.1001732896 - data.V6 0.2554575585 + s0 + -1.68571384 + data.V3 0.17156077 + data.V4 0.01658014 + data.V5 0.10303296 + data.V6 0.26459585 $`1` 5 x 1 sparse Matrix of class "dgCMatrix" s0 - 0.2125746 - data.V3 -0.2304586 - data.V4 0.6153492 - data.V5 -0.1537017 - data.V6 -0.2975443 + 0.2364585 + data.V3 -0.2182805 + data.V4 0.5960025 + data.V5 -0.1587441 + data.V6 -0.3121284 $`2` 5 x 1 sparse Matrix of class "dgCMatrix" s0 - 1.37725427 - data.V3 0.06133600 - data.V4 -0.61564761 - data.V5 0.05352840 - data.V6 0.04208671 - + 1.44925536 + data.V3 0.04671972 + data.V4 -0.61258267 + data.V5 0.05571116 + data.V6 0.04753251 coefficients $`0` 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - -1.5681088 - data.V3 0.1508182 - data.V4 0.0121955 - data.V5 0.1217930 - data.V6 0.2162850 + s0 + -1.65140201 + data.V3 0.15446206 + data.V4 0.02134769 + data.V5 0.12524946 + data.V6 0.22607972 $`1` 5 x 1 sparse Matrix of class "dgCMatrix" s0 - 1.1217130 - data.V3 -0.2028984 - data.V4 0.2862431 - data.V5 -0.1843559 - data.V6 -0.2481218 + 1.1367722 + data.V3 -0.1931713 + data.V4 0.2766548 + data.V5 -0.1910455 + data.V6 -0.2629336 $`2` 5 x 1 sparse Matrix of class "dgCMatrix" s0 - 0.44639579 - data.V3 0.05208012 - data.V4 -0.29843864 - data.V5 0.06256289 - data.V6 0.03183676 + 0.51462979 + data.V3 0.03870921 + data.V4 -0.29800245 + data.V5 0.06579606 + data.V6 0.03685390 */ val coefficientsRStd = new DenseMatrix(3, 4, Array( - 0.1691226336, 0.0002983651, 0.1001732896, 0.2554575585, - -0.2304586, 0.6153492, -0.1537017, -0.2975443, - 0.06133600, -0.61564761, 0.05352840, 0.04208671), isTransposed = true) - val interceptsRStd = Vectors.dense(-1.5898288335, 0.2125746, 1.37725427) + 0.17156077, 0.01658014, 0.10303296, 0.26459585, + -0.2182805, 0.5960025, -0.1587441, -0.3121284, + 0.04671972, -0.61258267, 0.05571116, 0.04753251), isTransposed = true) + val interceptsRStd = Vectors.dense(-1.68571384, 0.2364585, 1.44925536) val coefficientsR = new DenseMatrix(3, 4, Array( - 0.1508182, 0.0121955, 0.1217930, 0.2162850, - -0.2028984, 0.2862431, -0.1843559, -0.2481218, - 0.05208012, -0.29843864, 0.06256289, 0.03183676), isTransposed = true) - val interceptsR = Vectors.dense(-1.5681088, 1.1217130, 0.44639579) + 0.15446206, 0.02134769, 0.12524946, 0.22607972, + -0.1931713, 0.2766548, -0.1910455, -0.2629336, + 0.03870921, -0.29800245, 0.06579606, 0.03685390), isTransposed = true) + val interceptsR = Vectors.dense(-1.65140201, 1.1367722, 0.51462979) assert(model1.coefficientMatrix ~== coefficientsRStd absTol 0.001) assert(model1.interceptVector ~== interceptsRStd relTol 0.05) @@ -1996,15 +2005,16 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { // The solution is generated by https://github.com/yanboliang/bound-optimization. val coefficientsExpectedWithStd = new DenseMatrix(3, 4, Array( - 1.0, 1.0, 1.0, 1.01647497, - 1.0, 1.44105616, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.025970328910313, + 1.0, 1.4150672323873024, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0), isTransposed = true) - val interceptsExpectedWithStd = Vectors.dense(2.52055893, 1.0, 2.560682) + val interceptsExpectedWithStd = Vectors.dense( + 2.4259954221861473, 1.0000087410832004, 2.490461716522559) val coefficientsExpected = new DenseMatrix(3, 4, Array( - 1.0, 1.0, 1.03189386, 1.0, + 1.0, 1.0, 1.0336746541813002, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0), isTransposed = true) - val interceptsExpected = Vectors.dense(1.06418835, 1.0, 1.20494701) + val interceptsExpected = Vectors.dense(1.0521598454128, 1.0, 1.213158241431565) assert(model1.coefficientMatrix ~== coefficientsExpectedWithStd relTol 0.01) assert(model1.interceptVector ~== interceptsExpectedWithStd relTol 0.01) @@ -2037,69 +2047,68 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { 5 x 1 sparse Matrix of class "dgCMatrix" s0 . - data.V3 0.04048126 - data.V4 -0.23075758 - data.V5 0.08228864 - data.V6 0.22277648 + data.V3 0.03804571 + data.V4 -0.23204409 + data.V5 0.08337512 + data.V6 0.23029089 $`1` 5 x 1 sparse Matrix of class "dgCMatrix" s0 . - data.V3 -0.2149745 - data.V4 0.6478666 - data.V5 -0.1515158 - data.V6 -0.2930498 + data.V3 -0.2015495 + data.V4 0.6328705 + data.V5 -0.1562475 + data.V6 -0.3071447 $`2` 5 x 1 sparse Matrix of class "dgCMatrix" s0 . - data.V3 0.17449321 - data.V4 -0.41710901 - data.V5 0.06922716 - data.V6 0.07027332 - + data.V3 0.16350376 + data.V4 -0.40082637 + data.V5 0.07287239 + data.V6 0.07685379 coefficients $`0` 5 x 1 sparse Matrix of class "dgCMatrix" s0 . - data.V3 -0.003949652 - data.V4 -0.142982415 - data.V5 0.091439598 - data.V6 0.179286241 + data.V3 -0.006493452 + data.V4 -0.143831823 + data.V5 0.092538445 + data.V6 0.187244839 $`1` 5 x 1 sparse Matrix of class "dgCMatrix" s0 . - data.V3 -0.09071124 - data.V4 0.39752531 - data.V5 -0.16233832 - data.V6 -0.22206059 + data.V3 -0.08068443 + data.V4 0.39038929 + data.V5 -0.16822390 + data.V6 -0.23667470 $`2` 5 x 1 sparse Matrix of class "dgCMatrix" s0 . - data.V3 0.09466090 - data.V4 -0.25454290 - data.V5 0.07089872 - data.V6 0.04277435 + data.V3 0.08717788 + data.V4 -0.24655746 + data.V5 0.07568546 + data.V6 0.04942986 */ val coefficientsRStd = new DenseMatrix(3, 4, Array( - 0.04048126, -0.23075758, 0.08228864, 0.22277648, - -0.2149745, 0.6478666, -0.1515158, -0.2930498, - 0.17449321, -0.41710901, 0.06922716, 0.07027332), isTransposed = true) + 0.03804571, -0.23204409, 0.08337512, 0.23029089, + -0.2015495, 0.6328705, -0.1562475, -0.3071447, + 0.16350376, -0.40082637, 0.07287239, 0.07685379), isTransposed = true) val coefficientsR = new DenseMatrix(3, 4, Array( - -0.003949652, -0.142982415, 0.091439598, 0.179286241, - -0.09071124, 0.39752531, -0.16233832, -0.22206059, - 0.09466090, -0.25454290, 0.07089872, 0.04277435), isTransposed = true) + -0.006493452, -0.143831823, 0.092538445, 0.187244839, + -0.08068443, 0.39038929, -0.16822390, -0.23667470, + 0.08717788, -0.24655746, 0.07568546, 0.04942986), isTransposed = true) assert(model1.coefficientMatrix ~== coefficientsRStd absTol 0.01) assert(model1.interceptVector.toArray === Array.fill(3)(0.0)) @@ -2150,7 +2159,7 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { .setMaxIter(220).setTol(1e-10) val trainer2 = (new LogisticRegression).setFitIntercept(true).setWeightCol("weight") .setElasticNetParam(0.5).setRegParam(0.1).setStandardization(false) - .setMaxIter(90).setTol(1e-10) + .setMaxIter(220).setTol(1e-10) val model1 = trainer1.fit(multinomialDataset) val model2 = trainer2.fit(multinomialDataset) @@ -2170,54 +2179,53 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { $`0` 5 x 1 sparse Matrix of class "dgCMatrix" s0 - -0.50133383 + -0.55325803 data.V3 . data.V4 . data.V5 . - data.V6 0.08351653 + data.V6 0.09074857 $`1` 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - -0.3151913 - data.V3 -0.1058702 - data.V4 0.3183251 - data.V5 -0.1212969 - data.V6 -0.1629778 + s0 + -0.27291366 + data.V3 -0.09093399 + data.V4 0.28078251 + data.V5 -0.12854559 + data.V6 -0.18382494 $`2` 5 x 1 sparse Matrix of class "dgCMatrix" s0 - 0.8165252 + 0.8261717 data.V3 . - data.V4 -0.3943069 + data.V4 -0.4064444 data.V5 . data.V6 . - coefficients $`0` 5 x 1 sparse Matrix of class "dgCMatrix" s0 - -0.38857157 + -0.40016908 data.V3 . data.V4 . - data.V5 0.02384198 - data.V6 0.03127749 + data.V5 0.02312769 + data.V6 0.04159224 $`1` 5 x 1 sparse Matrix of class "dgCMatrix" s0 - 0.62492165 - data.V3 -0.04949061 + 0.62474768 + data.V3 -0.03776471 data.V4 . - data.V5 -0.18584462 - data.V6 -0.08952455 + data.V5 -0.19588206 + data.V6 -0.11187712 $`2` 5 x 1 sparse Matrix of class "dgCMatrix" s0 - -0.2363501 + -0.2245786 data.V3 . data.V4 . data.V5 . @@ -2226,15 +2234,15 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { */ val coefficientsRStd = new DenseMatrix(3, 4, Array( - 0.0, 0.0, 0.0, 0.08351653, - -0.1058702, 0.3183251, -0.1212969, -0.1629778, - 0.0, -0.3943069, 0.0, 0.0), isTransposed = true) - val interceptsRStd = Vectors.dense(-0.50133383, -0.3151913, 0.8165252) + 0.0, 0.0, 0.0, 0.09074857, + -0.09093399, 0.28078251, -0.12854559, -0.18382494, + 0.0, -0.4064444, 0.0, 0.0), isTransposed = true) + val interceptsRStd = Vectors.dense(-0.55325803, -0.27291366, 0.8261717) val coefficientsR = new DenseMatrix(3, 4, Array( - 0.0, 0.0, 0.02384198, 0.03127749, - -0.04949061, 0.0, -0.18584462, -0.08952455, + 0.0, 0.0, 0.02312769, 0.04159224, + -0.03776471, 0.0, -0.19588206, -0.11187712, 0.0, 0.0, 0.0, 0.0), isTransposed = true) - val interceptsR = Vectors.dense(-0.38857157, 0.62492165, -0.2363501) + val interceptsR = Vectors.dense(-0.40016908, 0.62474768, -0.2245786) assert(model1.coefficientMatrix ~== coefficientsRStd absTol 0.05) assert(model1.interceptVector ~== interceptsRStd absTol 0.1) @@ -2274,27 +2282,26 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { data.V3 . data.V4 . data.V5 . - data.V6 0.03238285 + data.V6 0.03418889 $`1` 5 x 1 sparse Matrix of class "dgCMatrix" s0 . - data.V3 -0.1328284 - data.V4 0.4219321 - data.V5 -0.1247544 - data.V6 -0.1893318 + data.V3 -0.1114779 + data.V4 0.3992145 + data.V5 -0.1315371 + data.V6 -0.2107956 $`2` 5 x 1 sparse Matrix of class "dgCMatrix" s0 . - data.V3 0.004572312 + data.V3 0.006442826 data.V4 . data.V5 . data.V6 . - coefficients $`0` 5 x 1 sparse Matrix of class "dgCMatrix" @@ -2310,9 +2317,9 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { s0 . data.V3 . - data.V4 0.14571623 - data.V5 -0.16456351 - data.V6 -0.05866264 + data.V4 0.15710979 + data.V5 -0.16871602 + data.V6 -0.07928527 $`2` 5 x 1 sparse Matrix of class "dgCMatrix" @@ -2326,13 +2333,13 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { */ val coefficientsRStd = new DenseMatrix(3, 4, Array( - 0.0, 0.0, 0.0, 0.03238285, - -0.1328284, 0.4219321, -0.1247544, -0.1893318, - 0.004572312, 0.0, 0.0, 0.0), isTransposed = true) + 0.0, 0.0, 0.0, 0.03418889, + -0.1114779, 0.3992145, -0.1315371, -0.2107956, + 0.006442826, 0.0, 0.0, 0.0), isTransposed = true) val coefficientsR = new DenseMatrix(3, 4, Array( 0.0, 0.0, 0.0, 0.0, - 0.0, 0.14571623, -0.16456351, -0.05866264, + 0.0, 0.15710979, -0.16871602, -0.07928527, 0.0, 0.0, 0.0, 0.0), isTransposed = true) assert(model1.coefficientMatrix ~== coefficientsRStd absTol 0.01) diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala index a5159bcb0bbc..5d439a2fe29b 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala @@ -167,7 +167,7 @@ class KMeansSuite extends MLTest with DefaultReadWriteTest with PMMLReadWriteTes val model = new KMeans() .setK(3) - .setSeed(1) + .setSeed(42) .setInitMode(MLlibKMeans.RANDOM) .setTol(1e-6) .setDistanceMeasure(DistanceMeasure.COSINE) diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/PowerIterationClusteringSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/PowerIterationClusteringSuite.scala index 97269eea5b83..d3b8575327a8 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/clustering/PowerIterationClusteringSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/clustering/PowerIterationClusteringSuite.scala @@ -34,9 +34,9 @@ class PowerIterationClusteringSuite extends SparkFunSuite @transient var data: Dataset[_] = _ final val r1 = 1.0 - final val n1 = 10 + final val n1 = 80 final val r2 = 4.0 - final val n2 = 40 + final val n2 = 80 override def beforeAll(): Unit = { super.beforeAll() @@ -222,7 +222,7 @@ class PowerIterationClusteringSuite extends SparkFunSuite (0, 1), (0, 2), (3, 4) - )).toDF("src", "dst") + )).toDF("src", "dst").repartition(1) var assignments2 = new PowerIterationClustering() .setInitMode("random") diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala index 70d11774d1c7..d28f1f4240ad 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala @@ -65,7 +65,7 @@ class Word2VecSuite extends MLTest with DefaultReadWriteTest { // These expectations are just magic values, characterizing the current // behavior. The test needs to be updated to be more general, see SPARK-11502 - val magicExp = Vectors.dense(0.30153007534417237, -0.6833061711354689, 0.5116530778733167) + val magicExp = Vectors.dense(-0.11654884266582402, 0.3115301721475341, -0.6879349987615239) testTransformer[(Seq[String], Vector)](docDF, model, "result", "expected") { case Row(vector1: Vector, vector2: Vector) => assert(vector1 ~== magicExp absTol 1E-5, "Transformed vector is different with expected.") @@ -98,9 +98,9 @@ class Word2VecSuite extends MLTest with DefaultReadWriteTest { // These expectations are just magic values, characterizing the current // behavior. The test needs to be updated to be more general, see SPARK-11502 val magicExpected = Seq( - Vectors.dense(0.3326166272163391, -0.5603077411651611, -0.2309209555387497), - Vectors.dense(0.32463887333869934, -0.9306551218032837, 1.393115520477295), - Vectors.dense(-0.27150997519493103, 0.4372006058692932, -0.13465698063373566) + Vectors.dense(0.12662248313426971, 0.6108677387237549, -0.006755620241165161), + Vectors.dense(-0.3870747685432434, 0.023309476673603058, -1.567158818244934), + Vectors.dense(-0.08617416769266129, -0.09897610545158386, 0.6113300323486328) ) realVectors.zip(magicExpected).foreach { @@ -122,7 +122,7 @@ class Word2VecSuite extends MLTest with DefaultReadWriteTest { .setSeed(42L) .fit(docDF) - val expected = Map(("b", 0.2608488929093532), ("c", -0.8271274846926078)) + val expected = Map(("b", -0.024012837558984756), ("c", -0.19355152547359467)) val findSynonymsResult = model.findSynonyms("a", 2).rdd.map { case Row(w: String, sim: Double) => (w, sim) }.collectAsMap() diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GBTRegressorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GBTRegressorSuite.scala index 46fa3767efdc..f35c8c64bea6 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/GBTRegressorSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/GBTRegressorSuite.scala @@ -184,7 +184,7 @@ class GBTRegressorSuite extends MLTest with DefaultReadWriteTest { val gbt = new GBTRegressor() .setMaxDepth(3) .setMaxIter(5) - .setSeed(123) + .setSeed(42) .setFeatureSubsetStrategy("all") // In this data, feature 1 is very important. diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala index 600a43242751..fc1284e770c0 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala @@ -232,8 +232,8 @@ class GeneralizedLinearRegressionSuite extends MLTest with DefaultReadWriteTest print(as.vector(coef(model))) } - [1] 2.2960999 0.8087933 - [1] 2.5002642 2.2000403 0.5999485 + [1] 2.2958751 0.8088523 + [1] 2.5009266 2.1997901 0.5999522 data <- read.csv("path", header=FALSE) model1 <- glm(f1, family=gaussian(link=log), data=data, start=c(0,0)) @@ -241,8 +241,8 @@ class GeneralizedLinearRegressionSuite extends MLTest with DefaultReadWriteTest print(as.vector(coef(model1))) print(as.vector(coef(model2))) - [1] 0.23069326 0.07993778 - [1] 0.25001858 0.22002452 0.05998789 + [1] 0.23063118 0.07995495 + [1] 0.25016124 0.21995737 0.05999335 data <- read.csv("path", header=FALSE) for (formula in c(f1, f2)) { @@ -250,17 +250,17 @@ class GeneralizedLinearRegressionSuite extends MLTest with DefaultReadWriteTest print(as.vector(coef(model))) } - [1] 2.3010179 0.8198976 - [1] 2.4108902 2.2130248 0.6086152 + [1] 2.3320341 0.8121904 + [1] 2.2837064 2.2487147 0.6120262 */ val expected = Seq( - Vectors.dense(0.0, 2.2960999, 0.8087933), - Vectors.dense(2.5002642, 2.2000403, 0.5999485), - Vectors.dense(0.0, 0.23069326, 0.07993778), - Vectors.dense(0.25001858, 0.22002452, 0.05998789), - Vectors.dense(0.0, 2.3010179, 0.8198976), - Vectors.dense(2.4108902, 2.2130248, 0.6086152)) + Vectors.dense(0.0, 2.2958751, 0.8088523), + Vectors.dense(2.5009266, 2.1997901, 0.5999522), + Vectors.dense(0.0, 0.23063118, 0.07995495), + Vectors.dense(0.25016124, 0.21995737, 0.05999335), + Vectors.dense(0.0, 2.3320341, 0.8121904), + Vectors.dense(2.2837064, 2.2487147, 0.6120262)) import GeneralizedLinearRegression._ @@ -308,21 +308,21 @@ class GeneralizedLinearRegressionSuite extends MLTest with DefaultReadWriteTest } } - [1] 0.0000000 2.2961005 0.8087932 - [1] 0.0000000 2.2130368 0.8309556 - [1] 0.0000000 1.7176137 0.9610657 - [1] 2.5002642 2.2000403 0.5999485 - [1] 3.1106389 2.0935142 0.5712711 - [1] 6.7597127 1.4581054 0.3994266 + [1] 0.0000000 2.2958757 0.8088521 + [1] 0.0000000 2.2128149 0.8310136 + [1] 0.0000000 1.7174260 0.9611137 + [1] 2.5009266 2.1997901 0.5999522 + [1] 3.1113269 2.0932659 0.5712717 + [1] 6.7604302 1.4578902 0.3994153 */ val expected = Seq( - Vectors.dense(0.0, 2.2961005, 0.8087932), - Vectors.dense(0.0, 2.2130368, 0.8309556), - Vectors.dense(0.0, 1.7176137, 0.9610657), - Vectors.dense(2.5002642, 2.2000403, 0.5999485), - Vectors.dense(3.1106389, 2.0935142, 0.5712711), - Vectors.dense(6.7597127, 1.4581054, 0.3994266)) + Vectors.dense(0.0, 2.2958757, 0.8088521), + Vectors.dense(0.0, 2.2128149, 0.8310136), + Vectors.dense(0.0, 1.7174260, 0.9611137), + Vectors.dense(2.5009266, 2.1997901, 0.5999522), + Vectors.dense(3.1113269, 2.0932659, 0.5712717), + Vectors.dense(6.7604302, 1.4578902, 0.3994153)) var idx = 0 for (fitIntercept <- Seq(false, true); diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/PowerIterationClusteringSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/PowerIterationClusteringSuite.scala index b33b86b39a42..c25c89b5679a 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/PowerIterationClusteringSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/PowerIterationClusteringSuite.scala @@ -47,9 +47,9 @@ class PowerIterationClusteringSuite extends SparkFunSuite with MLlibTestSparkCon test("power iteration clustering") { // Generate two circles following the example in the PIC paper. val r1 = 1.0 - val n1 = 10 + val n1 = 80 val r2 = 4.0 - val n2 = 10 + val n2 = 80 val n = n1 + n2 val points = genCircle(r1, n1) ++ genCircle(r2, n2) val similarities = for (i <- 1 until n; j <- 0 until i) yield { @@ -81,9 +81,9 @@ class PowerIterationClusteringSuite extends SparkFunSuite with MLlibTestSparkCon test("power iteration clustering on graph") { // Generate two circles following the example in the PIC paper. val r1 = 1.0 - val n1 = 10 + val n1 = 80 val r2 = 4.0 - val n2 = 10 + val n2 = 80 val n = n1 + n2 val points = genCircle(r1, n1) ++ genCircle(r2, n2) val similarities = for (i <- 1 until n; j <- 0 until i) yield { diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/StreamingKMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/StreamingKMeansSuite.scala index fdaa098345d1..a1ac10c06c69 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/StreamingKMeansSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/StreamingKMeansSuite.scala @@ -77,6 +77,7 @@ class StreamingKMeansSuite extends SparkFunSuite with TestSuiteBase { val k = 2 val d = 5 val r = 0.1 + val seed = 987654321 // create model with two clusters val kMeans = new StreamingKMeans() @@ -88,7 +89,7 @@ class StreamingKMeansSuite extends SparkFunSuite with TestSuiteBase { Array(5.0, 5.0)) // generate random data for k-means - val (input, centers) = StreamingKMeansDataGenerator(numPoints, numBatches, k, d, r, 42) + val (input, centers) = StreamingKMeansDataGenerator(numPoints, numBatches, k, d, r, seed) // setup and run the model training ssc = setupStreams(input, (inputDStream: DStream[Vector]) => { diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py index 864e2a3e09d2..6c9cf7b6c829 100644 --- a/python/pyspark/ml/clustering.py +++ b/python/pyspark/ml/clustering.py @@ -1193,19 +1193,19 @@ class PowerIterationClustering(HasMaxIter, HasWeightCol, JavaParams, JavaMLReada ... (3, 0, 0.5), (3, 1, 0.7), (3, 2, 0.9), ... (4, 0, 0.5), (4, 1, 0.7), (4, 2, 0.9), (4, 3, 1.1), ... (5, 0, 0.5), (5, 1, 0.7), (5, 2, 0.9), (5, 3, 1.1), (5, 4, 1.3)] - >>> df = spark.createDataFrame(data).toDF("src", "dst", "weight") + >>> df = spark.createDataFrame(data).toDF("src", "dst", "weight").repartition(1) >>> pic = PowerIterationClustering(k=2, maxIter=40, weightCol="weight") >>> assignments = pic.assignClusters(df) >>> assignments.sort(assignments.id).show(truncate=False) +---+-------+ |id |cluster| +---+-------+ - |0 |1 | - |1 |1 | - |2 |1 | - |3 |1 | - |4 |1 | - |5 |0 | + |0 |0 | + |1 |0 | + |2 |0 | + |3 |0 | + |4 |0 | + |5 |1 | +---+-------+ ... >>> pic_path = temp_path + "/pic" diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 3f9de9ca207a..595ab1818488 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -3064,24 +3064,24 @@ class Word2Vec(JavaEstimator, HasStepSize, HasMaxIter, HasSeed, HasInputCol, Has +----+--------------------+ |word| vector| +----+--------------------+ - | a|[0.09461779892444...| - | b|[1.15474212169647...| - | c|[-0.3794820010662...| + | a|[0.09511678665876...| + | b|[-1.2028766870498...| + | c|[0.30153277516365...| +----+--------------------+ ... >>> model.findSynonymsArray("a", 2) - [(u'b', 0.25053444504737854), (u'c', -0.6980510950088501)] + [(u'b', 0.015859870240092278), (u'c', -0.5680795907974243)] >>> from pyspark.sql.functions import format_number as fmt >>> model.findSynonyms("a", 2).select("word", fmt("similarity", 5).alias("similarity")).show() +----+----------+ |word|similarity| +----+----------+ - | b| 0.25053| - | c| -0.69805| + | b| 0.01586| + | c| -0.56808| +----+----------+ ... >>> model.transform(doc).head().model - DenseVector([0.5524, -0.4995, -0.3599, 0.0241, 0.3461]) + DenseVector([-0.4833, 0.1855, -0.273, -0.0509, -0.4769]) >>> word2vecPath = temp_path + "/word2vec" >>> word2Vec.save(word2vecPath) >>> loadedWord2Vec = Word2Vec.load(word2vecPath) diff --git a/python/pyspark/ml/recommendation.py b/python/pyspark/ml/recommendation.py index 520d7912c1a1..bf2716485df9 100644 --- a/python/pyspark/ml/recommendation.py +++ b/python/pyspark/ml/recommendation.py @@ -79,27 +79,27 @@ class ALS(JavaEstimator, HasCheckpointInterval, HasMaxIter, HasPredictionCol, Ha >>> test = spark.createDataFrame([(0, 2), (1, 0), (2, 0)], ["user", "item"]) >>> predictions = sorted(model.transform(test).collect(), key=lambda r: r[0]) >>> predictions[0] - Row(user=0, item=2, prediction=-0.13807615637779236) + Row(user=0, item=2, prediction=0.6929101347923279) >>> predictions[1] - Row(user=1, item=0, prediction=2.6258413791656494) + Row(user=1, item=0, prediction=3.47356915473938) >>> predictions[2] - Row(user=2, item=0, prediction=-1.5018409490585327) + Row(user=2, item=0, prediction=-0.8991986513137817) >>> user_recs = model.recommendForAllUsers(3) >>> user_recs.where(user_recs.user == 0)\ .select("recommendations.item", "recommendations.rating").collect() - [Row(item=[0, 1, 2], rating=[3.910..., 1.992..., -0.138...])] + [Row(item=[0, 1, 2], rating=[3.910..., 1.997..., 0.692...])] >>> item_recs = model.recommendForAllItems(3) >>> item_recs.where(item_recs.item == 2)\ .select("recommendations.user", "recommendations.rating").collect() - [Row(user=[2, 1, 0], rating=[4.901..., 3.981..., -0.138...])] + [Row(user=[2, 1, 0], rating=[4.892..., 3.991..., 0.692...])] >>> user_subset = df.where(df.user == 2) >>> user_subset_recs = model.recommendForUserSubset(user_subset, 3) >>> user_subset_recs.select("recommendations.item", "recommendations.rating").first() - Row(item=[2, 1, 0], rating=[4.901..., 1.056..., -1.501...]) + Row(item=[2, 1, 0], rating=[4.892..., 1.076..., -0.899...]) >>> item_subset = df.where(df.item == 0) >>> item_subset_recs = model.recommendForItemSubset(item_subset, 3) >>> item_subset_recs.select("recommendations.user", "recommendations.rating").first() - Row(user=[0, 1, 2], rating=[3.910..., 2.625..., -1.501...]) + Row(user=[0, 1, 2], rating=[3.910..., 3.473..., -0.899...]) >>> als_path = temp_path + "/als" >>> als.save(als_path) >>> als2 = ALS.load(als_path) diff --git a/python/pyspark/ml/tests/test_algorithms.py b/python/pyspark/ml/tests/test_algorithms.py index 6082082c1809..034eaed6868e 100644 --- a/python/pyspark/ml/tests/test_algorithms.py +++ b/python/pyspark/ml/tests/test_algorithms.py @@ -83,7 +83,7 @@ def test_raw_and_probability_prediction(self): result = model.transform(test).head() expected_prediction = 2.0 expected_probability = [0.0, 0.0, 1.0] - expected_rawPrediction = [57.3955, -124.5462, 67.9943] + expected_rawPrediction = [-11.6081922998, -8.15827998691, 22.17757045] self.assertTrue(result.prediction, expected_prediction) self.assertTrue(np.allclose(result.probability, expected_probability, atol=1E-4)) self.assertTrue(np.allclose(result.rawPrediction, expected_rawPrediction, atol=1E-4)) diff --git a/python/pyspark/ml/tuning.py b/python/pyspark/ml/tuning.py index 1f4abf515733..be7b8da98131 100644 --- a/python/pyspark/ml/tuning.py +++ b/python/pyspark/ml/tuning.py @@ -504,15 +504,15 @@ class TrainValidationSplit(Estimator, ValidatorParams, HasParallelism, HasCollec ... (Vectors.dense([0.5]), 0.0), ... (Vectors.dense([0.6]), 1.0), ... (Vectors.dense([1.0]), 1.0)] * 10, - ... ["features", "label"]) + ... ["features", "label"]).repartition(1) >>> lr = LogisticRegression() >>> grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build() >>> evaluator = BinaryClassificationEvaluator() >>> tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator, - ... parallelism=2) + ... parallelism=1, seed=42) >>> tvsModel = tvs.fit(dataset) >>> evaluator.evaluate(tvsModel.transform(dataset)) - 0.8333... + 0.833... .. versionadded:: 2.0.0 """ diff --git a/python/pyspark/mllib/recommendation.py b/python/pyspark/mllib/recommendation.py index 3d4eae85132b..3dd7cb200c28 100644 --- a/python/pyspark/mllib/recommendation.py +++ b/python/pyspark/mllib/recommendation.py @@ -100,16 +100,16 @@ class MatrixFactorizationModel(JavaModelWrapper, JavaSaveable, JavaLoader): >>> users_for_products[0] (1, (Rating(user=2, product=1, rating=...),)) - >>> model = ALS.train(ratings, 1, nonnegative=True, seed=10) + >>> model = ALS.train(ratings, 1, nonnegative=True, seed=123456789) >>> model.predict(2, 2) 3.73... >>> df = sqlContext.createDataFrame([Rating(1, 1, 1.0), Rating(1, 2, 2.0), Rating(2, 1, 2.0)]) - >>> model = ALS.train(df, 1, nonnegative=True, seed=10) + >>> model = ALS.train(df, 1, nonnegative=True, seed=123456789) >>> model.predict(2, 2) 3.73... - >>> model = ALS.trainImplicit(ratings, 1, nonnegative=True, seed=10) + >>> model = ALS.trainImplicit(ratings, 1, nonnegative=True, seed=123456789) >>> model.predict(2, 2) 0.4... diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 8227e829597e..58d74f5d7d9b 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -795,9 +795,9 @@ def sample(self, withReplacement=None, fraction=None, seed=None): >>> df = spark.range(10) >>> df.sample(0.5, 3).count() - 4 + 7 >>> df.sample(fraction=0.5, seed=3).count() - 4 + 7 >>> df.sample(withReplacement=True, fraction=0.5, seed=3).count() 1 >>> df.sample(1.0).count() @@ -865,8 +865,8 @@ def sampleBy(self, col, fractions, seed=None): +---+-----+ |key|count| +---+-----+ - | 0| 5| - | 1| 9| + | 0| 3| + | 1| 6| +---+-----+ >>> dataset.sampleBy(col("key"), fractions={2: 1.0}, seed=0).count() 33 @@ -898,10 +898,10 @@ def randomSplit(self, weights, seed=None): >>> splits = df4.randomSplit([1.0, 2.0], 24) >>> splits[0].count() - 1 + 2 >>> splits[1].count() - 3 + 2 """ for w in weights: if w < 0.0: diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index bc28c9d453ec..6ae23576e7bc 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -584,8 +584,8 @@ def rand(seed=None): .. note:: The function is non-deterministic in general case. >>> df.withColumn('rand', rand(seed=42) * 3).collect() - [Row(age=2, name=u'Alice', rand=1.1568609015300986), - Row(age=5, name=u'Bob', rand=1.403379671529166)] + [Row(age=2, name=u'Alice', rand=2.4052597283576684), + Row(age=5, name=u'Bob', rand=2.3913904055683974)] """ sc = SparkContext._active_spark_context if seed is not None: @@ -604,8 +604,8 @@ def randn(seed=None): .. note:: The function is non-deterministic in general case. >>> df.withColumn('randn', randn(seed=42)).collect() - [Row(age=2, name=u'Alice', randn=-0.7556247885860078), - Row(age=5, name=u'Bob', randn=-0.0861619008451133)] + [Row(age=2, name=u'Alice', randn=1.1027054481455365), + Row(age=5, name=u'Bob', randn=0.7400395449950132)] """ sc = SparkContext._active_spark_context if seed is not None: diff --git a/python/pyspark/sql/tests/test_functions.py b/python/pyspark/sql/tests/test_functions.py index b77757342843..273749ed1112 100644 --- a/python/pyspark/sql/tests/test_functions.py +++ b/python/pyspark/sql/tests/test_functions.py @@ -83,9 +83,9 @@ def test_corr(self): self.assertTrue(abs(corr - 0.95734012) < 1e-6) def test_sampleby(self): - df = self.sc.parallelize([Row(a=i, b=(i % 3)) for i in range(10)]).toDF() + df = self.sc.parallelize([Row(a=i, b=(i % 3)) for i in range(100)]).toDF() sampled = df.stat.sampleBy(u"b", fractions={0: 0.5, 1: 0.5}, seed=0) - self.assertTrue(sampled.count() == 3) + self.assertTrue(sampled.count() == 35) def test_cov(self): df = self.sc.parallelize([Row(a=i, b=2 * i) for i in range(10)]).toDF() diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RandomSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RandomSuite.scala index 752c9d5449ee..469c24b3b5f4 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RandomSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RandomSuite.scala @@ -17,25 +17,21 @@ package org.apache.spark.sql.catalyst.expressions -import org.scalatest.Matchers._ - import org.apache.spark.SparkFunSuite import org.apache.spark.sql.types.{IntegerType, LongType} class RandomSuite extends SparkFunSuite with ExpressionEvalHelper { test("random") { - checkDoubleEvaluation(Rand(30), 0.31429268272540556 +- 0.001) - checkDoubleEvaluation(Randn(30), -0.4798519469521663 +- 0.001) + checkEvaluation(Rand(30), 0.2762195585886885) + checkEvaluation(Randn(30), -1.0451987154313813) - checkDoubleEvaluation( - new Rand(Literal.create(null, LongType)), 0.8446490682263027 +- 0.001) - checkDoubleEvaluation( - new Randn(Literal.create(null, IntegerType)), 1.1164209726833079 +- 0.001) + checkEvaluation(new Rand(Literal.create(null, LongType)), 0.7604953758285915) + checkEvaluation(new Randn(Literal.create(null, IntegerType)), 1.6034991609278433) } test("SPARK-9127 codegen with long seed") { - checkDoubleEvaluation(Rand(5419823303878592871L), 0.2304755080444375 +- 0.001) - checkDoubleEvaluation(Randn(5419823303878592871L), -1.2824262718225607 +- 0.001) + checkEvaluation(Rand(5419823303878592871L), 0.7145363364564755) + checkEvaluation(Randn(5419823303878592871L), 0.7816815274533012) } } diff --git a/sql/core/src/test/resources/sql-tests/results/group-by-ordinal.sql.out b/sql/core/src/test/resources/sql-tests/results/group-by-ordinal.sql.out index cf5add6a71af..09e2c632f638 100644 --- a/sql/core/src/test/resources/sql-tests/results/group-by-ordinal.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/group-by-ordinal.sql.out @@ -141,12 +141,12 @@ from -- !query 13 schema struct -- !query 13 output -1 0.4048454303385226 2 -1 0.8446490682263027 1 -2 0.5871875724155838 1 -2 0.8865128837019473 2 -3 0.742083829230211 1 -3 0.9179913208300406 2 +1 0.5234194256885571 2 +1 0.7604953758285915 1 +2 0.0953472826424725 1 +2 0.3163249920547614 2 +3 0.2710259815484829 2 +3 0.7141011170991605 1 -- !query 14 diff --git a/sql/core/src/test/resources/sql-tests/results/random.sql.out b/sql/core/src/test/resources/sql-tests/results/random.sql.out index bca67320fe7b..acd0609aabb1 100644 --- a/sql/core/src/test/resources/sql-tests/results/random.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/random.sql.out @@ -7,7 +7,7 @@ SELECT rand(0) -- !query 0 schema struct -- !query 0 output -0.8446490682263027 +0.7604953758285915 -- !query 1 @@ -15,7 +15,7 @@ SELECT rand(cast(3 / 7 AS int)) -- !query 1 schema struct -- !query 1 output -0.8446490682263027 +0.7604953758285915 -- !query 2 @@ -23,7 +23,7 @@ SELECT rand(NULL) -- !query 2 schema struct -- !query 2 output -0.8446490682263027 +0.7604953758285915 -- !query 3 @@ -31,7 +31,7 @@ SELECT rand(cast(NULL AS int)) -- !query 3 schema struct -- !query 3 output -0.8446490682263027 +0.7604953758285915 -- !query 4 @@ -48,7 +48,7 @@ SELECT randn(0L) -- !query 5 schema struct -- !query 5 output -1.1164209726833079 +1.6034991609278433 -- !query 6 @@ -56,7 +56,7 @@ SELECT randn(cast(3 / 7 AS long)) -- !query 6 schema struct -- !query 6 output -1.1164209726833079 +1.6034991609278433 -- !query 7 @@ -64,7 +64,7 @@ SELECT randn(NULL) -- !query 7 schema struct -- !query 7 output -1.1164209726833079 +1.6034991609278433 -- !query 8 @@ -72,7 +72,7 @@ SELECT randn(cast(NULL AS long)) -- !query 8 schema struct -- !query 8 output -1.1164209726833079 +1.6034991609278433 -- !query 9 diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala index 589873b9c3ea..2a74bfe4d378 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala @@ -47,7 +47,7 @@ class DataFrameStatSuite extends QueryTest with SharedSQLContext { val data = sparkContext.parallelize(1 to n, 2).toDF("id") checkAnswer( data.sample(withReplacement = false, 0.05, seed = 13), - Seq(3, 17, 27, 58, 62).map(Row(_)) + Seq(37, 8, 90).map(Row(_)) ) } @@ -371,7 +371,7 @@ class DataFrameStatSuite extends QueryTest with SharedSQLContext { val sampled = df.stat.sampleBy("key", Map(0 -> 0.1, 1 -> 0.2), 0L) checkAnswer( sampled.groupBy("key").count().orderBy("key"), - Seq(Row(0, 6), Row(1, 11))) + Seq(Row(0, 1), Row(1, 6))) } test("sampleBy one column") { @@ -379,7 +379,7 @@ class DataFrameStatSuite extends QueryTest with SharedSQLContext { val sampled = df.stat.sampleBy($"key", Map(0 -> 0.1, 1 -> 0.2), 0L) checkAnswer( sampled.groupBy("key").count().orderBy("key"), - Seq(Row(0, 6), Row(1, 11))) + Seq(Row(0, 1), Row(1, 6))) } test("sampleBy multiple columns") { @@ -389,7 +389,7 @@ class DataFrameStatSuite extends QueryTest with SharedSQLContext { struct($"name", $"key"), Map(Row("Foo", 0) -> 0.1, Row("Foo", 1) -> 0.2), 0L) checkAnswer( sampled.groupBy("key").count().orderBy("key"), - Seq(Row(0, 6), Row(1, 11))) + Seq(Row(0, 1), Row(1, 6))) } // This test case only verifies that `DataFrame.countMinSketch()` methods do return diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala index db9f251e1dca..2dc21a7e054b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala @@ -618,7 +618,7 @@ class DatasetSuite extends QueryTest with SharedSQLContext { val data = sparkContext.parallelize(1 to n, 2).toDS() checkDataset( data.sample(withReplacement = false, 0.05, seed = 13), - 3, 17, 27, 58, 62) + 8, 37, 90) } test("sample fraction should not be negative with replacement") { @@ -650,9 +650,10 @@ class DatasetSuite extends QueryTest with SharedSQLContext { } test("SPARK-16686: Dataset.sample with seed results shouldn't depend on downstream usage") { + val a = 7 val simpleUdf = udf((n: Int) => { - require(n != 1, "simpleUdf shouldn't see id=1!") - 1 + require(n != a, s"simpleUdf shouldn't see id=$a!") + a }) val df = Seq( @@ -668,10 +669,10 @@ class DatasetSuite extends QueryTest with SharedSQLContext { (9, "string9") ).toDF("id", "stringData") val sampleDF = df.sample(false, 0.7, 50) - // After sampling, sampleDF doesn't contain id=1. - assert(!sampleDF.select("id").as[Int].collect.contains(1)) - // simpleUdf should not encounter id=1. - checkAnswer(sampleDF.select(simpleUdf($"id")), List.fill(sampleDF.count.toInt)(Row(1))) + // After sampling, sampleDF doesn't contain id=a. + assert(!sampleDF.select("id").as[Int].collect.contains(a)) + // simpleUdf should not encounter id=a. + checkAnswer(sampleDF.select(simpleUdf($"id")), List.fill(sampleDF.count.toInt)(Row(a))) } test("SPARK-11436: we should rebind right encoder when join 2 datasets") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/TestCsvData.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/TestCsvData.scala index 3e20cc47dca2..79993313a86e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/TestCsvData.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/TestCsvData.scala @@ -24,8 +24,7 @@ private[csv] trait TestCsvData { def sampledTestData: Dataset[String] = { spark.range(0, 100, 1).map { index => - val predefinedSample = Set[Long](2, 8, 15, 27, 30, 34, 35, 37, 44, 46, - 57, 62, 68, 72) + val predefinedSample = Set[Long](3, 18, 20, 24, 50, 60, 87, 99) if (predefinedSample.contains(index)) { index.toString } else { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/TestJsonData.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/TestJsonData.scala index 6e9559edf8ec..17503330bfd5 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/TestJsonData.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/TestJsonData.scala @@ -236,8 +236,7 @@ private[json] trait TestJsonData { def sampledTestData: Dataset[String] = { spark.range(0, 100, 1).map { index => - val predefinedSample = Set[Long](2, 8, 15, 27, 30, 34, 35, 37, 44, 46, - 57, 62, 68, 72) + val predefinedSample = Set[Long](3, 18, 20, 24, 50, 60, 87, 99) if (predefinedSample.contains(index)) { s"""{"f1":${index.toString}}""" } else {