diff --git a/mllib-dal/pom.xml b/mllib-dal/pom.xml index eed1b4ecc..e4b9808ac 100644 --- a/mllib-dal/pom.xml +++ b/mllib-dal/pom.xml @@ -274,6 +274,9 @@ ${project.build.directory}/surefire-reports . + + ${computeDevice} + test-reports.txt diff --git a/mllib-dal/src/main/native/LinearRegressionImpl.cpp b/mllib-dal/src/main/native/LinearRegressionImpl.cpp index 51e6401b4..433c73a6d 100644 --- a/mllib-dal/src/main/native/LinearRegressionImpl.cpp +++ b/mllib-dal/src/main/native/LinearRegressionImpl.cpp @@ -41,11 +41,9 @@ namespace ridge_regression_cpu = daal::algorithms::ridge_regression; typedef double algorithmFPType; /* Algorithm floating-point type */ -static NumericTablePtr linear_regression_compute(size_t rankId, - ccl::communicator &comm, - const NumericTablePtr &pData, - const NumericTablePtr &pLabel, - size_t nBlocks) { +static NumericTablePtr linear_regression_compute( + size_t rankId, ccl::communicator &comm, const NumericTablePtr &pData, + const NumericTablePtr &pLabel, bool fitIntercept, size_t nBlocks) { using daal::byte; linear_regression_cpu::training::Distributed localAlgorithm; @@ -54,6 +52,7 @@ static NumericTablePtr linear_regression_compute(size_t rankId, localAlgorithm.input.set(linear_regression_cpu::training::data, pData); localAlgorithm.input.set( linear_regression_cpu::training::dependentVariables, pLabel); + localAlgorithm.parameter.interceptFlag = fitIntercept; /* Train the multiple linear regression model on local nodes */ localAlgorithm.compute(); @@ -107,6 +106,7 @@ static NumericTablePtr linear_regression_compute(size_t rankId, /* Merge and finalizeCompute the multiple linear regression model on the * master node */ + masterAlgorithm.parameter.interceptFlag = fitIntercept; masterAlgorithm.compute(); masterAlgorithm.finalizeCompute(); @@ -125,17 +125,19 @@ static NumericTablePtr linear_regression_compute(size_t rankId, return resultTable; } -static NumericTablePtr ridge_regression_compute( - size_t rankId, ccl::communicator &comm, const NumericTablePtr &pData, - const NumericTablePtr &pLabel, double regParam, size_t nBlocks) { +static NumericTablePtr +ridge_regression_compute(size_t rankId, ccl::communicator &comm, + const NumericTablePtr &pData, + const NumericTablePtr &pLabel, bool fitIntercept, + double regParam, size_t nBlocks) { using daal::byte; NumericTablePtr ridgeParams(new HomogenNumericTable( 1, 1, NumericTable::doAllocate, regParam)); - ridge_regression_cpu::training::Distributed localAlgorithm; localAlgorithm.parameter.ridgeParameters = ridgeParams; + localAlgorithm.parameter.interceptFlag = fitIntercept; /* Pass a training data set and dependent values to the algorithm */ localAlgorithm.input.set(ridge_regression_cpu::training::data, pData); @@ -195,6 +197,7 @@ static NumericTablePtr ridge_regression_compute( /* Merge and finalizeCompute the multiple ridge regression model on the * master node */ + masterAlgorithm.parameter.interceptFlag = fitIntercept; masterAlgorithm.compute(); masterAlgorithm.finalizeCompute(); @@ -215,11 +218,13 @@ static NumericTablePtr ridge_regression_compute( #ifdef CPU_GPU_PROFILE static jlong doLROneAPICompute(JNIEnv *env, size_t rankId, ccl::communicator &cclComm, sycl::queue &queue, - jlong pData, jlong pLabel, jint executorNum, + jlong pData, jlong pLabel, + jboolean jfitIntercept, jint executorNum, jobject resultObj) { std::cout << "oneDAL (native): GPU compute start , rankid " << rankId << std::endl; const bool isRoot = (rankId == ccl_root); + bool fitIntercept = bool(jfitIntercept); int size = cclComm.size(); 
ccl::shared_ptr_class &kvs = getKvs(); @@ -230,7 +235,8 @@ static jlong doLROneAPICompute(JNIEnv *env, size_t rankId, homogen_table ytrain = *reinterpret_cast(pLabel); linear_regression_gpu::train_input local_input{xtrain, ytrain}; - const auto linear_regression_desc = linear_regression_gpu::descriptor<>(); + const auto linear_regression_desc = + linear_regression_gpu::descriptor<>(fitIntercept); linear_regression_gpu::train_result result_train = preview::train(comm, linear_regression_desc, xtrain, ytrain); @@ -248,14 +254,14 @@ static jlong doLROneAPICompute(JNIEnv *env, size_t rankId, /* * Class: com_intel_oap_mllib_regression_LinearRegressionDALImpl * Method: cLinearRegressionTrainDAL - * Signature: - * (JJDDIIIILjava/lang/String;Lcom/intel/oap/mllib/regression/LiRResult;)J + * Signature: (JJZDDIII[ILcom/intel/oap/mllib/regression/LiRResult;)J */ JNIEXPORT jlong JNICALL Java_com_intel_oap_mllib_regression_LinearRegressionDALImpl_cLinearRegressionTrainDAL( - JNIEnv *env, jobject obj, jlong data, jlong label, jdouble regParam, - jdouble elasticNetParam, jint executorNum, jint executorCores, - jint computeDeviceOrdinal, jintArray gpuIdxArray, jobject resultObj) { + JNIEnv *env, jobject obj, jlong data, jlong label, jboolean fitIntercept, + jdouble regParam, jdouble elasticNetParam, jint executorNum, + jint executorCores, jint computeDeviceOrdinal, jintArray gpuIdxArray, + jobject resultObj) { std::cout << "oneDAL (native): use DPC++ kernels " << "; device " << ComputeDeviceString[computeDeviceOrdinal] @@ -284,8 +290,9 @@ Java_com_intel_oap_mllib_regression_LinearRegressionDALImpl_cLinearRegressionTra jlong pDatagpu = (jlong)data; jlong pLabelgpu = (jlong)label; - resultptr = doLROneAPICompute(env, rankId, cclComm, queue, pDatagpu, - pLabelgpu, executorNum, resultObj); + resultptr = + doLROneAPICompute(env, rankId, cclComm, queue, pDatagpu, pLabelgpu, + fitIntercept, executorNum, resultObj); env->ReleaseIntArrayElements(gpuIdxArray, gpuIndices, 0); #endif } else { @@ -300,11 +307,12 @@ Java_com_intel_oap_mllib_regression_LinearRegressionDALImpl_cLinearRegressionTra cout << "oneDAL (native): Number of CPU threads used: " << nThreadsNew << endl; if (regParam == 0) { - resultTable = linear_regression_compute(rankId, cclComm, pData, - pLabel, executorNum); + resultTable = linear_regression_compute( + rankId, cclComm, pData, pLabel, fitIntercept, executorNum); } else { - resultTable = ridge_regression_compute( - rankId, cclComm, pData, pLabel, regParam, executorNum); + resultTable = + ridge_regression_compute(rankId, cclComm, pData, pLabel, + fitIntercept, regParam, executorNum); } NumericTablePtr *coeffvectors = new NumericTablePtr(resultTable); diff --git a/mllib-dal/src/main/native/javah/com_intel_oap_mllib_regression_LinearRegressionDALImpl.h b/mllib-dal/src/main/native/javah/com_intel_oap_mllib_regression_LinearRegressionDALImpl.h index a9bf8521f..d92ed6ece 100644 --- a/mllib-dal/src/main/native/javah/com_intel_oap_mllib_regression_LinearRegressionDALImpl.h +++ b/mllib-dal/src/main/native/javah/com_intel_oap_mllib_regression_LinearRegressionDALImpl.h @@ -10,10 +10,10 @@ extern "C" { /* * Class: com_intel_oap_mllib_regression_LinearRegressionDALImpl * Method: cLinearRegressionTrainDAL - * Signature: (JJDDIII[ILcom/intel/oap/mllib/regression/LiRResult;)J + * Signature: (JJZDDIII[ILcom/intel/oap/mllib/regression/LiRResult;)J */ JNIEXPORT jlong JNICALL Java_com_intel_oap_mllib_regression_LinearRegressionDALImpl_cLinearRegressionTrainDAL - (JNIEnv *, jobject, jlong, jlong, jdouble, jdouble, 
jint, jint, jint, jintArray, jobject); + (JNIEnv *, jobject, jlong, jlong, jboolean, jdouble, jdouble, jint, jint, jint, jintArray, jobject); #ifdef __cplusplus } diff --git a/mllib-dal/src/main/native/javah/com_intel_oneapi_dal_table_ColumnAccessor.h b/mllib-dal/src/main/native/javah/com_intel_oneapi_dal_table_ColumnAccessor.h index 9f6e9ec8b..079d90aee 100644 --- a/mllib-dal/src/main/native/javah/com_intel_oneapi_dal_table_ColumnAccessor.h +++ b/mllib-dal/src/main/native/javah/com_intel_oneapi_dal_table_ColumnAccessor.h @@ -15,20 +15,22 @@ extern "C" { JNIEXPORT jdoubleArray JNICALL Java_com_intel_oneapi_dal_table_ColumnAccessor_cPullDouble (JNIEnv *, jobject, jlong, jlong, jlong, jlong, jint); + /* * Class: com_intel_oneapi_dal_table_ColumnAccessor - * Method: cPullInt - * Signature: (JJJJI)[I + * Method: cPullFloat + * Signature: (JJJJI)[F */ -JNIEXPORT jintArray JNICALL Java_com_intel_oneapi_dal_table_ColumnAccessor_cPullInt +JNIEXPORT jfloatArray JNICALL Java_com_intel_oneapi_dal_table_ColumnAccessor_cPullFloat (JNIEnv *, jobject, jlong, jlong, jlong, jlong, jint); + /* * Class: com_intel_oneapi_dal_table_ColumnAccessor - * Method: cPullFloat - * Signature: (JJJJI)[F + * Method: cPullInt + * Signature: (JJJJI)[I */ -JNIEXPORT jfloatArray JNICALL Java_com_intel_oneapi_dal_table_ColumnAccessor_cPullFloat +JNIEXPORT jintArray JNICALL Java_com_intel_oneapi_dal_table_ColumnAccessor_cPullInt (JNIEnv *, jobject, jlong, jlong, jlong, jlong, jint); #ifdef __cplusplus diff --git a/mllib-dal/src/main/scala/com/intel/oap/mllib/regression/LinearRegressionDALImpl.scala b/mllib-dal/src/main/scala/com/intel/oap/mllib/regression/LinearRegressionDALImpl.scala index da07c9f7f..27156d784 100644 --- a/mllib-dal/src/main/scala/com/intel/oap/mllib/regression/LinearRegressionDALImpl.scala +++ b/mllib-dal/src/main/scala/com/intel/oap/mllib/regression/LinearRegressionDALImpl.scala @@ -74,15 +74,15 @@ class LinearRegressionDALImpl( val fitIntercept: Boolean, val useDevice = sparkContext.getConf.get("spark.oap.mllib.device", Utils.DefaultComputeDevice) val computeDevice = Common.ComputeDevice.getDeviceByName(useDevice) + val isTest = sparkContext.getConf.getBoolean("spark.oap.mllib.isTest", false) + val kvsIPPort = getOneCCLIPPort(labeledPoints.rdd) val labeledPointsTables = if (useDevice == "GPU") { if (OneDAL.isDenseDataset(labeledPoints, featuresCol)) { OneDAL.rddLabeledPointToMergedHomogenTables(labeledPoints, labelCol, featuresCol, executorNum, computeDevice) } else { - val msg = s"OAPMLlib: Sparse table is not supported for gpu now." - //todo sparse table is not supported - + val msg = s"OAP MLlib: Sparse table is not supported for GPU now." logError(msg) throw new SparkException(msg) } @@ -94,6 +94,13 @@ class LinearRegressionDALImpl( val fitIntercept: Boolean, } } + // OAP MLlib: Only normal linear regression is supported for GPU currently + if (useDevice == "GPU" && regParam != 0) { + val msg = s"OAP MLlib: Regularization parameter is not supported for GPU now." + logError(msg) + throw new SparkException(msg) + } + val results = labeledPointsTables.mapPartitionsWithIndex { case (rank: Int, tables: Iterator[(Long, Long)]) => val (featureTabAddr, lableTabAddr) = tables.next() @@ -101,8 +108,14 @@ class LinearRegressionDALImpl( val fitIntercept: Boolean, val result = new LiRResult() val gpuIndices = if (useDevice == "GPU") { - val resources = TaskContext.get().resources() - resources("gpu").addresses.map(_.toInt) + // OAP MLlib: This is a hack for unit tests and will be repaired in the future.
+ // GPU info can't be detected automatically in the UT environment. + if (isTest) { + Array(0) + } else { + val resources = TaskContext.get().resources() + resources("gpu").addresses.map(_.toInt) + } } else { null } @@ -110,6 +123,7 @@ class LinearRegressionDALImpl( val fitIntercept: Boolean, val cbeta = cLinearRegressionTrainDAL( featureTabAddr, lableTabAddr, + fitIntercept, regParam, elasticNetParam, executorNum, @@ -149,6 +163,7 @@ class LinearRegressionDALImpl( val fitIntercept: Boolean, // Single entry to call Linear Regression DAL backend with parameters @native private def cLinearRegressionTrainDAL(data: Long, label: Long, + fitIntercept: Boolean, regParam: Double, elasticNetParam: Double, executorNum: Int, diff --git a/mllib-dal/src/main/scala/org/apache/spark/ml/regression/spark313/LinearRegression.scala b/mllib-dal/src/main/scala/org/apache/spark/ml/regression/spark313/LinearRegression.scala index f5e93fc77..6a8d1051b 100644 --- a/mllib-dal/src/main/scala/org/apache/spark/ml/regression/spark313/LinearRegression.scala +++ b/mllib-dal/src/main/scala/org/apache/spark/ml/regression/spark313/LinearRegression.scala @@ -455,8 +455,9 @@ class LinearRegression @Since("1.3") (@Since("1.3.0") override val uid: String) private def trainWithNormal( dataset: Dataset[_], instr: Instrumentation): LinearRegressionModel = { - // oneDAL only support simple linear regression and ridge regression - val paramSupported = ($(regParam) == 0) || ($(regParam) != 0 && $(elasticNetParam) == 0) + val paramSupported = ($(regParam) == 0) && (!isDefined(weightCol) || getWeightCol.isEmpty) + val sparkContext = dataset.sparkSession.sparkContext + val useDevice = sparkContext.getConf.get("spark.oap.mllib.device", Utils.DefaultComputeDevice) val isPlatformSupported = Utils.checkClusterPlatformCompatibility( dataset.sparkSession.sparkContext) if (paramSupported && Utils.isOAPEnabled && isPlatformSupported) { @@ -489,6 +490,13 @@ class LinearRegression @Since("1.3") (@Since("1.3.0") override val uid: String) } else { // For low dimensional data, WeightedLeastSquares is more efficient since the // training algorithm only requires one pass through the data. (SPARK-10668) + val fallbackMsg = s"OAP MLlib: Falling back to vanilla Spark MLlib" + logError(fallbackMsg) + if (!paramSupported && useDevice == "GPU") { + val msg = s"OAP MLlib: Parameters used are not supported for GPU now."
+ logError(msg) + throw new SparkException(msg) + } val optimizer = new WeightedLeastSquares($(fitIntercept), $(regParam), elasticNetParam = $(elasticNetParam), $(standardization), true, diff --git a/mllib-dal/src/main/scala/org/apache/spark/ml/regression/spark321/LinearRegression.scala b/mllib-dal/src/main/scala/org/apache/spark/ml/regression/spark321/LinearRegression.scala index 0fb7f6c05..72d2ae717 100644 --- a/mllib-dal/src/main/scala/org/apache/spark/ml/regression/spark321/LinearRegression.scala +++ b/mllib-dal/src/main/scala/org/apache/spark/ml/regression/spark321/LinearRegression.scala @@ -454,8 +454,9 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String private def trainWithNormal( dataset: Dataset[_], instr: Instrumentation): LinearRegressionModel = { - // oneDAL only support simple linear regression and ridge regression - val paramSupported = ($(regParam) == 0) || ($(regParam) != 0 && $(elasticNetParam) == 0) + val paramSupported = ($(regParam) == 0) && (!isDefined(weightCol) || getWeightCol.isEmpty) + val sparkContext = dataset.sparkSession.sparkContext + val useDevice = sparkContext.getConf.get("spark.oap.mllib.device", Utils.DefaultComputeDevice) val isPlatformSupported = Utils.checkClusterPlatformCompatibility( dataset.sparkSession.sparkContext) if (paramSupported && Utils.isOAPEnabled && isPlatformSupported) { @@ -488,6 +489,13 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String } else { // For low dimensional data, WeightedLeastSquares is more efficient since the // training algorithm only requires one pass through the data. (SPARK-10668) + val fallbackMsg = s"OAP MLlib: Falling back to vanilla Spark MLlib" + logError(fallbackMsg) + if (!paramSupported && useDevice == "GPU") { + val msg = s"OAP MLlib: Parameters used are not supported for GPU now."
+ logError(msg) + throw new SparkException(msg) + } val optimizer = new WeightedLeastSquares($(fitIntercept), $(regParam), elasticNetParam = $(elasticNetParam), $(standardization), true, diff --git a/mllib-dal/src/test/scala/org/apache/spark/ml/regression/MLlibLinearRegressionSuite.scala b/mllib-dal/src/test/scala/org/apache/spark/ml/regression/MLlibLinearRegressionSuite.scala index a03a3fe94..d6ff7c377 100755 --- a/mllib-dal/src/test/scala/org/apache/spark/ml/regression/MLlibLinearRegressionSuite.scala +++ b/mllib-dal/src/test/scala/org/apache/spark/ml/regression/MLlibLinearRegressionSuite.scala @@ -17,6 +17,8 @@ package org.apache.spark.ml.regression +import com.intel.oneapi.dal.table.Common + import org.apache.spark.{SparkConf, TestCommon} import scala.collection.JavaConverters._ @@ -37,8 +39,16 @@ class MLlibLinearRegressionSuite extends MLTest with DefaultReadWriteTest with P import testImplicits._ override def sparkConf: SparkConf = { + val device = TestCommon.getComputeDevice.toString val conf = super.sparkConf - conf.set("spark.oap.mllib.device", TestCommon.getComputeDevice.toString) + if (device == "GPU") { + conf.set("spark.oap.mllib.device", Common.ComputeDevice.GPU.toString) + conf.set("spark.oap.mllib.isTest", "true") + } else { + conf.set("spark.oap.mllib.device", TestCommon.getComputeDevice.toString) + } + + conf } private val seed: Int = 42 @@ -53,6 +63,7 @@ class MLlibLinearRegressionSuite extends MLTest with DefaultReadWriteTest with P override def beforeAll(): Unit = { super.beforeAll() + val testDevice = TestCommon.getComputeDevice.toString datasetWithDenseFeature = sc.parallelize(LinearDataGenerator.generateLinearInput( intercept = 6.3, weights = Array(4.7, 7.2), xMean = Array(0.9, -1.3), xVariance = Array(0.7, 1.2), nPoints = 10000, seed, eps = 0.1), 2).map(_.asML).toDF() @@ -218,6 +229,7 @@ class MLlibLinearRegressionSuite extends MLTest with DefaultReadWriteTest with P } } + // OAP-MLlib: Singular matrix pre-checking is not supported. // test("linear regression handles singular matrices") { // // check for both constant columns with intercept (zero std) and collinear // val singularDataConstantColumn = sc.parallelize(Seq( @@ -296,51 +308,51 @@ } } -// test("linear regression without intercept without regularization") { -// Seq("auto", "l-bfgs", "normal").foreach { solver => -// val trainer1 = (new LinearRegression).setFitIntercept(false).setSolver(solver) -// // Without regularization the results should be the same -// val trainer2 = (new LinearRegression).setFitIntercept(false).setStandardization(false) -// .setSolver(solver) -// val model1 = trainer1.fit(datasetWithDenseFeature) -// val modelWithoutIntercept1 = trainer1.fit(datasetWithDenseFeatureWithoutIntercept) -// val model2 = trainer2.fit(datasetWithDenseFeature) -// val modelWithoutIntercept2 = trainer2.fit(datasetWithDenseFeatureWithoutIntercept) -// -// /* -// coefficients <- coef(glmnet(features, label, family="gaussian", alpha = 0, lambda = 0, -// intercept = FALSE)) -// > coefficients -// 3 x 1 sparse Matrix of class "dgCMatrix" -// s0 -// (Intercept) . -// as.numeric.data.V2. 6.973403 -// as.numeric.data.V3.
5.284370 -// */ -// val coefficientsR = Vectors.dense(6.973403, 5.284370) -// -// assert(model1.intercept ~== 0 absTol 1E-2) -// assert(model1.coefficients ~= coefficientsR relTol 1E-2) -// assert(model2.intercept ~== 0 absTol 1E-2) -// assert(model2.coefficients ~= coefficientsR relTol 1E-2) -// -// /* -// Then again with the data with no intercept: -// > coefficientsWithoutIntercept -// 3 x 1 sparse Matrix of class "dgCMatrix" -// s0 -// (Intercept) . -// as.numeric.data3.V2. 4.70011 -// as.numeric.data3.V3. 7.19943 -// */ -// val coefficientsWithoutInterceptR = Vectors.dense(4.70011, 7.19943) -// -// assert(modelWithoutIntercept1.intercept ~== 0 absTol 1E-3) -// assert(modelWithoutIntercept1.coefficients ~= coefficientsWithoutInterceptR relTol 1E-3) -// assert(modelWithoutIntercept2.intercept ~== 0 absTol 1E-3) -// assert(modelWithoutIntercept2.coefficients ~= coefficientsWithoutInterceptR relTol 1E-3) -// } -// } + test("linear regression without intercept without regularization") { + Seq("auto", "l-bfgs", "normal").foreach { solver => + val trainer1 = (new LinearRegression).setFitIntercept(false).setSolver(solver) + // Without regularization the results should be the same + val trainer2 = (new LinearRegression).setFitIntercept(false).setStandardization(false) + .setSolver(solver) + val model1 = trainer1.fit(datasetWithDenseFeature) + val modelWithoutIntercept1 = trainer1.fit(datasetWithDenseFeatureWithoutIntercept) + val model2 = trainer2.fit(datasetWithDenseFeature) + val modelWithoutIntercept2 = trainer2.fit(datasetWithDenseFeatureWithoutIntercept) + + /* + coefficients <- coef(glmnet(features, label, family="gaussian", alpha = 0, lambda = 0, + intercept = FALSE)) + > coefficients + 3 x 1 sparse Matrix of class "dgCMatrix" + s0 + (Intercept) . + as.numeric.data.V2. 6.973403 + as.numeric.data.V3. 5.284370 + */ + val coefficientsR = Vectors.dense(6.973403, 5.284370) + + assert(model1.intercept ~== 0 absTol 1E-2) + assert(model1.coefficients ~= coefficientsR relTol 1E-2) + assert(model2.intercept ~== 0 absTol 1E-2) + assert(model2.coefficients ~= coefficientsR relTol 1E-2) + + /* + Then again with the data with no intercept: + > coefficientsWithoutIntercept + 3 x 1 sparse Matrix of class "dgCMatrix" + s0 + (Intercept) . + as.numeric.data3.V2. 4.70011 + as.numeric.data3.V3. 
7.19943 + */ + val coefficientsWithoutInterceptR = Vectors.dense(4.70011, 7.19943) + + assert(modelWithoutIntercept1.intercept ~== 0 absTol 1E-3) + assert(modelWithoutIntercept1.coefficients ~= coefficientsWithoutInterceptR relTol 1E-3) + assert(modelWithoutIntercept2.intercept ~== 0 absTol 1E-3) + assert(modelWithoutIntercept2.coefficients ~= coefficientsWithoutInterceptR relTol 1E-3) + } + } test("linear regression with intercept with L1 regularization") { Seq("auto", "l-bfgs", "normal").foreach { solver => @@ -446,109 +458,111 @@ class MLlibLinearRegressionSuite extends MLTest with DefaultReadWriteTest with P } } } -// -// test("linear regression with intercept with L2 regularization") { -// Seq("auto", "l-bfgs", "normal").foreach { solver => -// val trainer1 = (new LinearRegression).setElasticNetParam(0.0).setRegParam(2.3) -// .setSolver(solver) -// val trainer2 = (new LinearRegression).setElasticNetParam(0.0).setRegParam(2.3) -// .setStandardization(false).setSolver(solver) -// val model1 = trainer1.fit(datasetWithDenseFeature) -// val model2 = trainer2.fit(datasetWithDenseFeature) -// -// /* -// coefficients <- coef(glmnet(features, label, family="gaussian", alpha = 0.0, lambda = 2.3)) -// > coefficients -// 3 x 1 sparse Matrix of class "dgCMatrix" -// s0 -// (Intercept) 5.260103 -// as.numeric.d1.V2. 3.725522 -// as.numeric.d1.V3. 5.711203 -// */ -// val interceptR1 = 5.260103 -// val coefficientsR1 = Vectors.dense(3.725522, 5.711203) -// -// assert(model1.intercept ~== interceptR1 relTol 1E-2) -// assert(model1.coefficients ~= coefficientsR1 relTol 1E-2) -// -// /* -// coefficients <- coef(glmnet(features, label, family="gaussian", alpha = 0.0, lambda = 2.3, -// standardize=FALSE)) -// > coefficients -// 3 x 1 sparse Matrix of class "dgCMatrix" -// s0 -// (Intercept) 5.790885 -// as.numeric.d1.V2. 3.432373 -// as.numeric.d1.V3. 5.919196 -// */ -// val interceptR2 = 5.790885 -// val coefficientsR2 = Vectors.dense(3.432373, 5.919196) -// -// assert(model2.intercept ~== interceptR2 relTol 1E-2) -// assert(model2.coefficients ~= coefficientsR2 relTol 1E-2) -// -// testTransformer[(Double, Vector)](datasetWithDenseFeature, model1, -// "features", "prediction") { -// case Row(features: DenseVector, prediction1: Double) => -// val prediction2 = -// features(0) * model1.coefficients(0) + features(1) * model1.coefficients(1) + -// model1.intercept -// assert(prediction1 ~== prediction2 relTol 1E-5) -// } -// } -// } -// -// test("linear regression without intercept with L2 regularization") { -// Seq("auto", "l-bfgs", "normal").foreach { solver => -// val trainer1 = (new LinearRegression).setElasticNetParam(0.0).setRegParam(2.3) -// .setFitIntercept(false).setSolver(solver) -// val trainer2 = (new LinearRegression).setElasticNetParam(0.0).setRegParam(2.3) -// .setFitIntercept(false).setStandardization(false).setSolver(solver) -// val model1 = trainer1.fit(datasetWithDenseFeature) -// val model2 = trainer2.fit(datasetWithDenseFeature) -// -// /* -// coefficients <- coef(glmnet(features, label, family="gaussian", alpha = 0.0, lambda = 2.3, -// intercept = FALSE)) -// > coefficients -// 3 x 1 sparse Matrix of class "dgCMatrix" -// s0 -// (Intercept) . -// as.numeric.d1.V2. 5.493430 -// as.numeric.d1.V3. 
4.223082 -// */ -// val interceptR1 = 0.0 -// val coefficientsR1 = Vectors.dense(5.493430, 4.223082) -// -// assert(model1.intercept ~== interceptR1 absTol 1E-2) -// assert(model1.coefficients ~= coefficientsR1 relTol 1E-2) -// -// /* -// coefficients <- coef(glmnet(features, label, family="gaussian", alpha = 0.0, lambda = 2.3, -// intercept = FALSE, standardize=FALSE)) -// > coefficients -// 3 x 1 sparse Matrix of class "dgCMatrix" -// s0 -// (Intercept) . -// as.numeric.d1.V2. 5.244324 -// as.numeric.d1.V3. 4.203106 -// */ -// val interceptR2 = 0.0 -// val coefficientsR2 = Vectors.dense(5.244324, 4.203106) -// -// assert(model2.intercept ~== interceptR2 absTol 1E-2) -// assert(model2.coefficients ~= coefficientsR2 relTol 1E-2) -// -// testTransformer[(Double, Vector)](datasetWithDenseFeature, model1, -// "features", "prediction") { -// case Row(features: DenseVector, prediction1: Double) => -// val prediction2 = -// features(0) * model1.coefficients(0) + features(1) * model1.coefficients(1) + -// model1.intercept -// assert(prediction1 ~== prediction2 relTol 1E-5) -// } -// } -// } + + // OAP-MLlib: Ridge regression will fall back now + test("linear regression with intercept with L2 regularization") { + Seq("auto", "l-bfgs", "normal").foreach { solver => + val trainer1 = (new LinearRegression).setElasticNetParam(0.0).setRegParam(2.3) + .setSolver(solver) + val trainer2 = (new LinearRegression).setElasticNetParam(0.0).setRegParam(2.3) + .setStandardization(false).setSolver(solver) + val model1 = trainer1.fit(datasetWithDenseFeature) + val model2 = trainer2.fit(datasetWithDenseFeature) + + /* + coefficients <- coef(glmnet(features, label, family="gaussian", alpha = 0.0, lambda = 2.3)) + > coefficients + 3 x 1 sparse Matrix of class "dgCMatrix" + s0 + (Intercept) 5.260103 + as.numeric.d1.V2. 3.725522 + as.numeric.d1.V3. 5.711203 + */ + val interceptR1 = 5.260103 + val coefficientsR1 = Vectors.dense(3.725522, 5.711203) + + assert(model1.intercept ~== interceptR1 relTol 1E-2) + assert(model1.coefficients ~= coefficientsR1 relTol 1E-2) + + /* + coefficients <- coef(glmnet(features, label, family="gaussian", alpha = 0.0, lambda = 2.3, + standardize=FALSE)) + > coefficients + 3 x 1 sparse Matrix of class "dgCMatrix" + s0 + (Intercept) 5.790885 + as.numeric.d1.V2. 3.432373 + as.numeric.d1.V3. 
5.919196 + */ + val interceptR2 = 5.790885 + val coefficientsR2 = Vectors.dense(3.432373, 5.919196) + + assert(model2.intercept ~== interceptR2 relTol 1E-2) + assert(model2.coefficients ~= coefficientsR2 relTol 1E-2) + + testTransformer[(Double, Vector)](datasetWithDenseFeature, model1, + "features", "prediction") { + case Row(features: DenseVector, prediction1: Double) => + val prediction2 = + features(0) * model1.coefficients(0) + features(1) * model1.coefficients(1) + + model1.intercept + assert(prediction1 ~== prediction2 relTol 1E-5) + } + } + } + + // OAP-MLlib: Ridge regression will fall back now + test("linear regression without intercept with L2 regularization") { + Seq("auto", "l-bfgs", "normal").foreach { solver => + val trainer1 = (new LinearRegression).setElasticNetParam(0.0).setRegParam(2.3) + .setFitIntercept(false).setSolver(solver) + val trainer2 = (new LinearRegression).setElasticNetParam(0.0).setRegParam(2.3) + .setFitIntercept(false).setStandardization(false).setSolver(solver) + val model1 = trainer1.fit(datasetWithDenseFeature) + val model2 = trainer2.fit(datasetWithDenseFeature) + + /* + coefficients <- coef(glmnet(features, label, family="gaussian", alpha = 0.0, lambda = 2.3, + intercept = FALSE)) + > coefficients + 3 x 1 sparse Matrix of class "dgCMatrix" + s0 + (Intercept) . + as.numeric.d1.V2. 5.493430 + as.numeric.d1.V3. 4.223082 + */ + val interceptR1 = 0.0 + val coefficientsR1 = Vectors.dense(5.493430, 4.223082) + + assert(model1.intercept ~== interceptR1 absTol 1E-2) + assert(model1.coefficients ~= coefficientsR1 relTol 1E-2) + + /* + coefficients <- coef(glmnet(features, label, family="gaussian", alpha = 0.0, lambda = 2.3, + intercept = FALSE, standardize=FALSE)) + > coefficients + 3 x 1 sparse Matrix of class "dgCMatrix" + s0 + (Intercept) . + as.numeric.d1.V2. 5.244324 + as.numeric.d1.V3. 4.203106 + */ + val interceptR2 = 0.0 + val coefficientsR2 = Vectors.dense(5.244324, 4.203106) + + assert(model2.intercept ~== interceptR2 absTol 1E-2) + assert(model2.coefficients ~= coefficientsR2 relTol 1E-2) + + testTransformer[(Double, Vector)](datasetWithDenseFeature, model1, + "features", "prediction") { + case Row(features: DenseVector, prediction1: Double) => + val prediction2 = + features(0) * model1.coefficients(0) + features(1) * model1.coefficients(1) + + model1.intercept + assert(prediction1 ~== prediction2 relTol 1E-5) + } + } + } test("linear regression with intercept with ElasticNet regularization") { Seq("auto", "l-bfgs", "normal").foreach { solver => @@ -664,69 +678,70 @@ class MLlibLinearRegressionSuite extends MLTest with DefaultReadWriteTest with P testPredictionModelSinglePrediction(model, datasetWithDenseFeature) } -// test("linear regression model with constant label") { -// /* -// R code: -// for (formula in c(b.const ~ . 
-1, b.const ~ .)) { -// model <- lm(formula, data=df.const.label, weights=w) -// print(as.vector(coef(model))) -// } -// [1] -9.221298 3.394343 -// [1] 17 0 0 -// */ -// val expected = Seq( -// Vectors.dense(0.0, -9.221298, 3.394343), -// Vectors.dense(17.0, 0.0, 0.0)) -// -// Seq("auto", "l-bfgs", "normal").foreach { solver => -// var idx = 0 -// for (fitIntercept <- Seq(false, true)) { -// val model1 = new LinearRegression() -// .setFitIntercept(fitIntercept) -// .setWeightCol("weight") -// .setPredictionCol("myPrediction") -// .setSolver(solver) -// .fit(datasetWithWeightConstantLabel) -// val actual1 = Vectors.dense(model1.intercept, model1.coefficients(0), -// model1.coefficients(1)) -// assert(actual1 ~== expected(idx) absTol 1e-4) -// -// // Schema of summary.predictions should be a superset of the input dataset -// assert((datasetWithWeightConstantLabel.schema.fieldNames.toSet + model1.getPredictionCol) -// .subsetOf(model1.summary.predictions.schema.fieldNames.toSet)) -// -// val model2 = new LinearRegression() -// .setFitIntercept(fitIntercept) -// .setWeightCol("weight") -// .setPredictionCol("myPrediction") -// .setSolver(solver) -// .fit(datasetWithWeightZeroLabel) -// val actual2 = Vectors.dense(model2.intercept, model2.coefficients(0), -// model2.coefficients(1)) -// assert(actual2 ~== Vectors.dense(0.0, 0.0, 0.0) absTol 1e-4) -// -// // Schema of summary.predictions should be a superset of the input dataset -// assert((datasetWithWeightZeroLabel.schema.fieldNames.toSet + model2.getPredictionCol) -// .subsetOf(model2.summary.predictions.schema.fieldNames.toSet)) -// -// idx += 1 -// } -// } -// } + test("linear regression model with constant label") { + /* + R code: + for (formula in c(b.const ~ . -1, b.const ~ .)) { + model <- lm(formula, data=df.const.label, weights=w) + print(as.vector(coef(model))) + } + [1] -9.221298 3.394343 + [1] 17 0 0 + */ + val expected = Seq( + Vectors.dense(0.0, -9.221298, 3.394343), + Vectors.dense(17.0, 0.0, 0.0)) -// test("regularized linear regression through origin with constant label") { -// // The problem is ill-defined if fitIntercept=false, regParam is non-zero. -// // An exception is thrown in this case. 
-// Seq("auto", "l-bfgs", "normal").foreach { solver => -// for (standardization <- Seq(false, true)) { -// val model = new LinearRegression().setFitIntercept(false) -// .setRegParam(0.1).setStandardization(standardization).setSolver(solver) -// intercept[IllegalArgumentException] { -// model.fit(datasetWithWeightConstantLabel) -// } -// } -// } -// } + Seq("auto", "l-bfgs", "normal").foreach { solver => + var idx = 0 + for (fitIntercept <- Seq(false, true)) { + val model1 = new LinearRegression() + .setFitIntercept(fitIntercept) + .setWeightCol("weight") + .setPredictionCol("myPrediction") + .setSolver(solver) + .fit(datasetWithWeightConstantLabel) + val actual1 = Vectors.dense(model1.intercept, model1.coefficients(0), + model1.coefficients(1)) + assert(actual1 ~== expected(idx) absTol 1e-4) + + // Schema of summary.predictions should be a superset of the input dataset + assert((datasetWithWeightConstantLabel.schema.fieldNames.toSet + model1.getPredictionCol) + .subsetOf(model1.summary.predictions.schema.fieldNames.toSet)) + + val model2 = new LinearRegression() + .setFitIntercept(fitIntercept) + .setWeightCol("weight") + .setPredictionCol("myPrediction") + .setSolver(solver) + .fit(datasetWithWeightZeroLabel) + val actual2 = Vectors.dense(model2.intercept, model2.coefficients(0), + model2.coefficients(1)) + assert(actual2 ~== Vectors.dense(0.0, 0.0, 0.0) absTol 1e-4) + + // Schema of summary.predictions should be a superset of the input dataset + assert((datasetWithWeightZeroLabel.schema.fieldNames.toSet + model2.getPredictionCol) + .subsetOf(model2.summary.predictions.schema.fieldNames.toSet)) + + idx += 1 + } + } + } + + // OAP-MLlib: origin check with constant label is not supported + //test("regularized linear regression through origin with constant label") { + // // The problem is ill-defined if fitIntercept=false, regParam is non-zero. + // // An exception is thrown in this case. + // Seq("auto", "l-bfgs", "normal").foreach { solver => + // for (standardization <- Seq(false, true)) { + // val model = new LinearRegression().setFitIntercept(false) + // .setRegParam(0.1).setStandardization(standardization).setSolver(solver) + // intercept[IllegalArgumentException] { + // model.fit(datasetWithWeightConstantLabel) + // } + // } + // } + //} test("linear regression with l-bfgs when training is not needed") { // When label is constant, l-bfgs solver returns results without training. 
@@ -755,6 +770,7 @@ class MLlibLinearRegressionSuite extends MLTest with DefaultReadWriteTest with P } } + // OAP-MLlib summary is not fully supported // test("linear regression model training summary") { // Seq("auto", "l-bfgs", "normal").foreach { solver => // val trainer = new LinearRegression().setSolver(solver).setPredictionCol("myPrediction") @@ -867,467 +883,60 @@ class MLlibLinearRegressionSuite extends MLTest with DefaultReadWriteTest with P // } // } // } -// -// test("linear regression model testset evaluation summary") { -// Seq("auto", "l-bfgs", "normal").foreach { solver => -// val trainer = new LinearRegression().setSolver(solver) -// val model = trainer.fit(datasetWithDenseFeature) -// -// // Evaluating on training dataset should yield results summary equal to training summary -// val testSummary = model.evaluate(datasetWithDenseFeature) -// assert(model.summary.meanSquaredError ~== testSummary.meanSquaredError relTol 1E-5) -// assert(model.summary.r2 ~== testSummary.r2 relTol 1E-5) -// model.summary.residuals.select("residuals").collect() -// .zip(testSummary.residuals.select("residuals").collect()) -// .forall { case (Row(r1: Double), Row(r2: Double)) => r1 ~== r2 relTol 1E-5 } -// } -// } -// -// test("linear regression with weighted samples") { -// val sqlContext = spark.sqlContext -// import sqlContext.implicits._ -// val numClasses = 0 -// def modelEquals(m1: LinearRegressionModel, m2: LinearRegressionModel): Unit = { -// assert(m1.coefficients ~== m2.coefficients relTol 0.01) -// assert(m1.intercept ~== m2.intercept relTol 0.01) -// } -// val testParams = Seq( -// // (elasticNetParam, regParam, fitIntercept, standardization) -// (0.0, 0.21, true, true), -// (0.0, 0.21, true, false), -// (0.0, 0.21, false, false), -// (1.0, 0.21, true, true) -// ) -// -// // For squaredError loss -// for (solver <- Seq("auto", "l-bfgs", "normal"); -// (elasticNetParam, regParam, fitIntercept, standardization) <- testParams) { -// val estimator = new LinearRegression() -// .setFitIntercept(fitIntercept) -// .setStandardization(standardization) -// .setRegParam(regParam) -// .setElasticNetParam(elasticNetParam) -// .setSolver(solver) -// .setMaxIter(1) -// MLTestingUtils.testArbitrarilyScaledWeights[LinearRegressionModel, LinearRegression]( -// datasetWithStrongNoise.as[LabeledPoint], estimator, modelEquals) -// MLTestingUtils.testOutliersWithSmallWeights[LinearRegressionModel, LinearRegression]( -// datasetWithStrongNoise.as[LabeledPoint], estimator, numClasses, modelEquals, -// outlierRatio = 3) -// MLTestingUtils.testOversamplingVsWeighting[LinearRegressionModel, LinearRegression]( -// datasetWithStrongNoise.as[LabeledPoint], estimator, modelEquals, seed) -// } -// -// // For huber loss -// for ((_, regParam, fitIntercept, standardization) <- testParams) { -// val estimator = new LinearRegression() -// .setLoss("huber") -// .setFitIntercept(fitIntercept) -// .setStandardization(standardization) -// .setRegParam(regParam) -// .setMaxIter(1) -// MLTestingUtils.testArbitrarilyScaledWeights[LinearRegressionModel, LinearRegression]( -// datasetWithOutlier.as[LabeledPoint], estimator, modelEquals) -// MLTestingUtils.testOutliersWithSmallWeights[LinearRegressionModel, LinearRegression]( -// datasetWithOutlier.as[LabeledPoint], estimator, numClasses, modelEquals, -// outlierRatio = 3) -// MLTestingUtils.testOversamplingVsWeighting[LinearRegressionModel, LinearRegression]( -// datasetWithOutlier.as[LabeledPoint], estimator, modelEquals, seed) -// } -// } - - test("linear regression model with 
l-bfgs with big feature datasets") { - val trainer = new LinearRegression().setSolver("auto") - val model = trainer.fit(datasetWithSparseFeature) - - // Training results for the model should be available - assert(model.hasSummary) - // When LBFGS is used as optimizer, objective history can be restored. - assert( - model.summary - .objectiveHistory - .sliding(2) - .forall(x => x(0) >= x(1))) - } -// test("linear regression summary with weighted samples and intercept by normal solver") { -// /* -// R code: -// -// model <- glm(formula = "b ~ .", data = df, weights = w) -// summary(model) -// -// Call: -// glm(formula = "b ~ .", data = df, weights = w) -// -// Deviance Residuals: -// 1 2 3 4 -// 1.920 -1.358 -1.109 0.960 -// -// Coefficients: -// Estimate Std. Error t value Pr(>|t|) -// (Intercept) 18.080 9.608 1.882 0.311 -// V1 6.080 5.556 1.094 0.471 -// V2 -0.600 1.960 -0.306 0.811 -// -// (Dispersion parameter for gaussian family taken to be 7.68) -// -// Null deviance: 202.00 on 3 degrees of freedom -// Residual deviance: 7.68 on 1 degrees of freedom -// AIC: 18.783 -// -// Number of Fisher Scoring iterations: 2 -// */ -// -// val model = new LinearRegression() -// .setWeightCol("weight") -// .setSolver("normal") -// .fit(datasetWithWeight) -// val coefficientsR = Vectors.dense(Array(6.080, -0.600)) -// val interceptR = 18.080 -// val devianceResidualsR = Array(-1.358, 1.920) -// val seCoefR = Array(5.556, 1.960, 9.608) -// val tValsR = Array(1.094, -0.306, 1.882) -// val pValsR = Array(0.471, 0.811, 0.311) -// -// assert(model.coefficients ~== coefficientsR absTol 1E-3) -// assert(model.intercept ~== interceptR absTol 1E-3) -// model.summary.devianceResiduals.zip(devianceResidualsR).foreach { x => -// assert(x._1 ~== x._2 absTol 1E-3) } -// model.summary.coefficientStandardErrors.zip(seCoefR).foreach{ x => -// assert(x._1 ~== x._2 absTol 1E-3) } -// model.summary.tValues.zip(tValsR).foreach{ x => assert(x._1 ~== x._2 absTol 1E-3) } -// model.summary.pValues.zip(pValsR).foreach{ x => assert(x._1 ~== x._2 absTol 1E-3) } -// -// val modelWithL1 = new LinearRegression() -// .setWeightCol("weight") -// .setSolver("normal") -// .setRegParam(0.5) -// .setElasticNetParam(1.0) -// .fit(datasetWithWeight) -// -// assert(modelWithL1.summary.objectiveHistory !== Array(0.0)) -// assert( -// modelWithL1.summary -// .objectiveHistory -// .sliding(2) -// .forall(x => x(0) >= x(1))) -// } - -// test("linear regression summary with weighted samples and w/o intercept by normal solver") { -// /* -// R code: -// -// model <- glm(formula = "b ~ . -1", data = df, weights = w) -// summary(model) -// -// Call: -// glm(formula = "b ~ . -1", data = df, weights = w) -// -// Deviance Residuals: -// 1 2 3 4 -// 1.950 2.344 -4.600 2.103 -// -// Coefficients: -// Estimate Std. Error t value Pr(>|t|) -// V1 -3.7271 2.9032 -1.284 0.3279 -// V2 3.0100 0.6022 4.998 0.0378 * -// --- -// Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 
0.1 ' ' 1 -// -// (Dispersion parameter for gaussian family taken to be 17.4376) -// -// Null deviance: 5962.000 on 4 degrees of freedom -// Residual deviance: 34.875 on 2 degrees of freedom -// AIC: 22.835 -// -// Number of Fisher Scoring iterations: 2 -// */ -// -// val model = new LinearRegression() -// .setWeightCol("weight") -// .setSolver("normal") -// .setFitIntercept(false) -// .fit(datasetWithWeight) -// val coefficientsR = Vectors.dense(Array(-3.7271, 3.0100)) -// val interceptR = 0.0 -// val devianceResidualsR = Array(-4.600, 2.344) -// val seCoefR = Array(2.9032, 0.6022) -// val tValsR = Array(-1.284, 4.998) -// val pValsR = Array(0.3279, 0.0378) -// -// assert(model.coefficients ~== coefficientsR absTol 1E-3) -// assert(model.intercept === interceptR) -// model.summary.devianceResiduals.zip(devianceResidualsR).foreach { x => -// assert(x._1 ~== x._2 absTol 1E-3) } -// model.summary.coefficientStandardErrors.zip(seCoefR).foreach{ x => -// assert(x._1 ~== x._2 absTol 1E-3) } -// model.summary.tValues.zip(tValsR).foreach{ x => assert(x._1 ~== x._2 absTol 1E-3) } -// model.summary.pValues.zip(pValsR).foreach{ x => assert(x._1 ~== x._2 absTol 1E-3) } -// } - - test("read/write") { - def checkModelData(model: LinearRegressionModel, model2: LinearRegressionModel): Unit = { - assert(model.intercept === model2.intercept) - assert(model.coefficients === model2.coefficients) + test("linear regression with weighted samples") { + val sqlContext = spark.sqlContext + import sqlContext.implicits._ + val numClasses = 0 + def modelEquals(m1: LinearRegressionModel, m2: LinearRegressionModel): Unit = { + assert(m1.coefficients ~== m2.coefficients relTol 0.01) + assert(m1.intercept ~== m2.intercept relTol 0.01) + } + val testParams = Seq( + // (elasticNetParam, regParam, fitIntercept, standardization) + (0.0, 0.21, true, true), + (0.0, 0.21, true, false), + (0.0, 0.21, false, false), + (1.0, 0.21, true, true) + ) + + // For squaredError loss + for (solver <- Seq("auto", "l-bfgs", "normal"); + (elasticNetParam, regParam, fitIntercept, standardization) <- testParams) { + val estimator = new LinearRegression() + .setFitIntercept(fitIntercept) + .setStandardization(standardization) + .setRegParam(regParam) + .setElasticNetParam(elasticNetParam) + .setSolver(solver) + .setMaxIter(1) + MLTestingUtils.testArbitrarilyScaledWeights[LinearRegressionModel, LinearRegression]( + datasetWithStrongNoise.as[LabeledPoint], estimator, modelEquals) + MLTestingUtils.testOutliersWithSmallWeights[LinearRegressionModel, LinearRegression]( + datasetWithStrongNoise.as[LabeledPoint], estimator, numClasses, modelEquals, + outlierRatio = 3) + MLTestingUtils.testOversamplingVsWeighting[LinearRegressionModel, LinearRegression]( + datasetWithStrongNoise.as[LabeledPoint], estimator, modelEquals, seed) } - val lr = new LinearRegression() - testEstimatorAndModelReadWrite(lr, datasetWithWeight, LinearRegressionSuite.allParamSettings, - LinearRegressionSuite.allParamSettings, checkModelData) - } - -// test("pmml export") { -// val lr = new LinearRegression() -// val model = lr.fit(datasetWithWeight) -// def checkModel(pmml: PMML): Unit = { -// val dd = pmml.getDataDictionary -// assert(dd.getNumberOfFields === 3) -// val fields = dd.getDataFields.asScala -// assert(fields(0).getName().toString === "field_0") -// assert(fields(0).getOpType() == OpType.CONTINUOUS) -// val pmmlRegressionModel = pmml.getModels().get(0).asInstanceOf[PMMLRegressionModel] -// val pmmlPredictors = 
pmmlRegressionModel.getRegressionTables.get(0).getNumericPredictors -// val pmmlWeights = pmmlPredictors.asScala.map(_.getCoefficient()).toList -// assert(pmmlWeights(0) ~== model.coefficients(0) relTol 1E-3) -// assert(pmmlWeights(1) ~== model.coefficients(1) relTol 1E-3) -// } -// testPMMLWrite(sc, model, checkModel) -// } - -// test("should support all NumericType labels and weights, and not support other types") { -// for (solver <- Seq("auto", "l-bfgs", "normal")) { -// val lr = new LinearRegression().setMaxIter(1).setSolver(solver) -// MLTestingUtils.checkNumericTypes[LinearRegressionModel, LinearRegression]( -// lr, spark, isClassification = false) { (expected, actual) => -// assert(expected.intercept === actual.intercept) -// assert(expected.coefficients === actual.coefficients) -// } -// } -// } - - test("linear regression (huber loss) with intercept without regularization") { - val trainer1 = (new LinearRegression).setLoss("huber") - .setFitIntercept(true).setStandardization(true) - val trainer2 = (new LinearRegression).setLoss("huber") - .setFitIntercept(true).setStandardization(false) - - val model1 = trainer1.fit(datasetWithOutlier) - val model2 = trainer2.fit(datasetWithOutlier) - - /* - Using the following Python code to load the data and train the model using - scikit-learn package. - - import pandas as pd - import numpy as np - from sklearn.linear_model import HuberRegressor - df = pd.read_csv("path", header = None) - X = df[df.columns[1:3]] - y = np.array(df[df.columns[0]]) - huber = HuberRegressor(fit_intercept=True, alpha=0.0, max_iter=100, epsilon=1.35) - huber.fit(X, y) - - >>> huber.coef_ - array([ 4.68998007, 7.19429011]) - >>> huber.intercept_ - 6.3002404351083037 - >>> huber.scale_ - 0.077810159205220747 - */ - val coefficientsPy = Vectors.dense(4.68998007, 7.19429011) - val interceptPy = 6.30024044 - val scalePy = 0.07781016 - - assert(model1.coefficients ~= coefficientsPy relTol 1E-3) - assert(model1.intercept ~== interceptPy relTol 1E-3) - assert(model1.scale ~== scalePy relTol 1E-3) - - // Without regularization, with or without standardization will converge to the same solution. - assert(model2.coefficients ~= coefficientsPy relTol 1E-3) - assert(model2.intercept ~== interceptPy relTol 1E-3) - assert(model2.scale ~== scalePy relTol 1E-3) - } - - test("linear regression (huber loss) without intercept without regularization") { - val trainer1 = (new LinearRegression).setLoss("huber") - .setFitIntercept(false).setStandardization(true) - val trainer2 = (new LinearRegression).setLoss("huber") - .setFitIntercept(false).setStandardization(false) - - val model1 = trainer1.fit(datasetWithOutlier) - val model2 = trainer2.fit(datasetWithOutlier) - - /* - huber = HuberRegressor(fit_intercept=False, alpha=0.0, max_iter=100, epsilon=1.35) - huber.fit(X, y) - - >>> huber.coef_ - array([ 6.71756703, 5.08873222]) - >>> huber.intercept_ - 0.0 - >>> huber.scale_ - 2.5560209922722317 - */ - val coefficientsPy = Vectors.dense(6.71756703, 5.08873222) - val interceptPy = 0.0 - val scalePy = 2.55602099 - - assert(model1.coefficients ~= coefficientsPy relTol 1E-3) - assert(model1.intercept === interceptPy) - assert(model1.scale ~== scalePy relTol 1E-3) - - // Without regularization, with or without standardization will converge to the same solution. 
- assert(model2.coefficients ~= coefficientsPy relTol 1E-3) - assert(model2.intercept === interceptPy) - assert(model2.scale ~== scalePy relTol 1E-3) - } - - test("linear regression (huber loss) with intercept with L2 regularization") { - val trainer1 = (new LinearRegression).setLoss("huber") - .setFitIntercept(true).setRegParam(0.21).setStandardization(true) - val trainer2 = (new LinearRegression).setLoss("huber") - .setFitIntercept(true).setRegParam(0.21).setStandardization(false) - - val model1 = trainer1.fit(datasetWithOutlier) - val model2 = trainer2.fit(datasetWithOutlier) - - /* - Since scikit-learn HuberRegressor does not support standardization, - we do it manually out of the estimator. - - xStd = np.std(X, axis=0) - scaledX = X / xStd - huber = HuberRegressor(fit_intercept=True, alpha=210, max_iter=100, epsilon=1.35) - huber.fit(scaledX, y) - - >>> np.array(huber.coef_ / xStd) - array([ 1.97732633, 3.38816722]) - >>> huber.intercept_ - 3.7527581430531227 - >>> huber.scale_ - 3.787363673371801 - */ - val coefficientsPy1 = Vectors.dense(1.97732633, 3.38816722) - val interceptPy1 = 3.75275814 - val scalePy1 = 3.78736367 - - assert(model1.coefficients ~= coefficientsPy1 relTol 1E-2) - assert(model1.intercept ~== interceptPy1 relTol 1E-2) - assert(model1.scale ~== scalePy1 relTol 1E-2) - - /* - huber = HuberRegressor(fit_intercept=True, alpha=210, max_iter=100, epsilon=1.35) - huber.fit(X, y) - - >>> huber.coef_ - array([ 1.73346444, 3.63746999]) - >>> huber.intercept_ - 4.3017134790781739 - >>> huber.scale_ - 3.6472742809286793 - */ - val coefficientsPy2 = Vectors.dense(1.73346444, 3.63746999) - val interceptPy2 = 4.30171347 - val scalePy2 = 3.64727428 - - assert(model2.coefficients ~= coefficientsPy2 relTol 1E-3) - assert(model2.intercept ~== interceptPy2 relTol 1E-3) - assert(model2.scale ~== scalePy2 relTol 1E-3) - } - - test("linear regression (huber loss) without intercept with L2 regularization") { - val trainer1 = (new LinearRegression).setLoss("huber") - .setFitIntercept(false).setRegParam(0.21).setStandardization(true) - val trainer2 = (new LinearRegression).setLoss("huber") - .setFitIntercept(false).setRegParam(0.21).setStandardization(false) - - val model1 = trainer1.fit(datasetWithOutlier) - val model2 = trainer2.fit(datasetWithOutlier) - - /* - Since scikit-learn HuberRegressor does not support standardization, - we do it manually out of the estimator. 
- - xStd = np.std(X, axis=0) - scaledX = X / xStd - huber = HuberRegressor(fit_intercept=False, alpha=210, max_iter=100, epsilon=1.35) - huber.fit(scaledX, y) - - >>> np.array(huber.coef_ / xStd) - array([ 2.59679008, 2.26973102]) - >>> huber.intercept_ - 0.0 - >>> huber.scale_ - 4.5766311924091791 - */ - val coefficientsPy1 = Vectors.dense(2.59679008, 2.26973102) - val interceptPy1 = 0.0 - val scalePy1 = 4.57663119 - - assert(model1.coefficients ~= coefficientsPy1 relTol 1E-2) - assert(model1.intercept === interceptPy1) - assert(model1.scale ~== scalePy1 relTol 1E-2) - - /* - huber = HuberRegressor(fit_intercept=False, alpha=210, max_iter=100, epsilon=1.35) - huber.fit(X, y) - - >>> huber.coef_ - array([ 2.28423908, 2.25196887]) - >>> huber.intercept_ - 0.0 - >>> huber.scale_ - 4.5979643506051753 - */ - val coefficientsPy2 = Vectors.dense(2.28423908, 2.25196887) - val interceptPy2 = 0.0 - val scalePy2 = 4.59796435 - - assert(model2.coefficients ~= coefficientsPy2 relTol 1E-3) - assert(model2.intercept === interceptPy2) - assert(model2.scale ~== scalePy2 relTol 1E-3) - } - - test("huber loss model match squared error for large epsilon") { - val trainer1 = new LinearRegression().setLoss("huber").setEpsilon(1E5) - val model1 = trainer1.fit(datasetWithOutlier) - val trainer2 = new LinearRegression() - val model2 = trainer2.fit(datasetWithOutlier) - assert(model1.coefficients ~== model2.coefficients relTol 1E-3) - assert(model1.intercept ~== model2.intercept relTol 1E-3) - } - - test("oap-mllib-163: should support all kind of labelCol name and featuresCol name") { - val df_1 = datasetWithDenseFeature.toDF("label_alias", "features_alias") - val nb_1 = new LinearRegression().setLabelCol("label_alias") - val e_1 = intercept[IllegalArgumentException](nb_1.fit(df_1)).getMessage - assert(e_1.contains("features does not exist. Available: label_alias, features_alias")) - - val df_2 = datasetWithDenseFeature.toDF("label_alias", "features_alias") - val nb_2 = new LinearRegression().setFeaturesCol("features_alias") - val e_2 = intercept[IllegalArgumentException](nb_2.fit(df_2)).getMessage - assert(e_2.contains("label does not exist. 
Available: label_alias, features_alias")) - val df_3 = datasetWithDenseFeature.toDF("label_alias", "features_alias") - val nb_3 = new LinearRegression().setLabelCol("label_alias").setFeaturesCol("features_alias") - val model = nb_3.fit(df_3) - assert(model.hasParent) - assert(model.hasSummary) + // For huber loss + for ((_, regParam, fitIntercept, standardization) <- testParams) { + val estimator = new LinearRegression() + .setLoss("huber") + .setFitIntercept(fitIntercept) + .setStandardization(standardization) + .setRegParam(regParam) + .setMaxIter(1) + MLTestingUtils.testArbitrarilyScaledWeights[LinearRegressionModel, LinearRegression]( + datasetWithOutlier.as[LabeledPoint], estimator, modelEquals) + MLTestingUtils.testOutliersWithSmallWeights[LinearRegressionModel, LinearRegression]( + datasetWithOutlier.as[LabeledPoint], estimator, numClasses, modelEquals, + outlierRatio = 3) + MLTestingUtils.testOversamplingVsWeighting[LinearRegressionModel, LinearRegression]( + datasetWithOutlier.as[LabeledPoint], estimator, modelEquals, seed) + } } - test("oap-mllib-163: should support column in any order") { - val df_1 = datasetWithDenseFeature.select("label", "features").toDF() - val nb_1 = new LinearRegression().setLabelCol("label").setFeaturesCol("features") - val model_1 = nb_1.fit(df_1) - assert(model_1.hasParent) - assert(model_1.hasSummary) - - val df_2 = datasetWithDenseFeature.select("features", "label").toDF() - val nb_2 = new LinearRegression().setLabelCol("label").setFeaturesCol("features") - val model_2 = nb_2.fit(df_2) - assert(model_2.hasParent) - assert(model_2.hasSummary) - } } object LinearRegressionSuite { diff --git a/mllib-dal/test.sh b/mllib-dal/test.sh index 07ef18366..0d509481b 100755 --- a/mllib-dal/test.sh +++ b/mllib-dal/test.sh @@ -52,11 +52,11 @@ MVN_NO_TRANSFER_PROGRESS= print_usage() { echo - echo "Usage: ./test.sh [-p ] [-q] [-h] [-d] " + echo "Usage: ./test.sh [-p ] [-q] [-h] [-d] " echo echo "-p Supported Platform Profiles:" - echo " CPU_ONLY_PROFILE" - echo " CPU_GPU_PROFILE" + echo " CPU_GPU_PROFILE (Default)" + echo " CPU_ONLY_PROFILE" echo } @@ -78,10 +78,12 @@ shift "$((OPTIND-1))" SUITE=$* print_usage +echo "Testing suite(s) $SUITE ..." +echo -if [[ ! ($PLATFORM_PROFILE == CPU_ONLY_PROFILE || $PLATFORM_PROFILE == CPU_GPU_PROFILE) ]]; then +if [[ ! ($PLATFORM_PROFILE == CPU_GPU_PROFILE || $PLATFORM_PROFILE == CPU_ONLY_PROFILE) ]]; then echo - echo Platform Profile should be CPU_ONLY_PROFILE or CPU_GPU_PROFILE, but \"$PLATFORM_PROFILE\" found! + echo Platform Profile should be CPU_GPU_PROFILE or CPU_ONLY_PROFILE, but \"$PLATFORM_PROFILE\" found! echo exit 1 fi @@ -93,7 +95,7 @@ if [ "HOST" = $DEVICE_OPT ]; then "RowAccessorTest" \ "com.intel.oap.mllib.ConvertHomogenTableSuite" ) -else +elif [ "CPU" = $DEVICE_OPT ]; then suiteArray=( "org.apache.spark.ml.clustering.MLlibKMeansSuite" \ "org.apache.spark.ml.feature.MLlibPCASuite" \ @@ -103,6 +105,13 @@ else "org.apache.spark.ml.stat.MLlibCorrelationSuite" \ "org.apache.spark.ml.stat.MLlibSummarizerSuite" ) +elif [ "GPU" = $DEVICE_OPT ]; then + suiteArray=( + "org.apache.spark.ml.regression.MLlibLinearRegressionSuite" + ) +else + echo Error: $DEVICE_OPT is not supported! + exit 1 fi if [[ ! ${suiteArray[*]} =~ $SUITE ]]; then @@ -114,18 +123,18 @@ fi SCRIPT_DIR=$( cd $(dirname ${BASH_SOURCE[0]}) && pwd ) OAP_MLLIB_ROOT=$(cd $SCRIPT_DIR/.. 
&& pwd) source $OAP_MLLIB_ROOT/RELEASE - +# Set defaults from RELEASE envs export SPARK_VERSION=${SPARK_OPT:-$SPARK_VERSION} export PLATFORM_PROFILE=${PLATFORM_OPT:-$PLATFORM_PROFILE} -echo "computeDevice : $DEVICE_OPT" +echo "Using Compute Device: $DEVICE_OPT" echo echo === Testing Environments === -echo JAVA_HOME=$JAVA_HOME -echo Maven Version: $(mvn -v | head -n 1 | cut -f3 -d" ") -echo Spark Version: $SPARK_VERSION echo Platform Profile: $PLATFORM_PROFILE +echo Spark Version: $SPARK_VERSION +echo Maven Version: $(mvn -v | head -n 1 | cut -f3 -d" ") +echo JAVA_HOME=$JAVA_HOME echo ============================ echo @@ -153,21 +162,22 @@ else SUBSUITE=$(echo $SUITE | tr "," "\n") for suite in ${SUBSUITE[*]} do - if [[ $suite == *"com.intel.oap.mllib"* ]]; then - echo - echo Testing $suite ... - echo - mvn $MVN_NO_TRANSFER_PROGRESS -Dspark.version=$SPARK_VERSION -DcomputeDevice=$DEVICE_OPT -Dtest=none -DforkMode=never -Dmaven.test.failure.ignore=true -DfailIfNoTests=false -DwildcardSuites=$suite test - elif [[ $suite == *"org.apache.spark.ml"* ]]; then - echo - echo Testing $suite ... - echo - mvn $MVN_NO_TRANSFER_PROGRESS -Dspark.version=$SPARK_VERSION -DcomputeDevice=$DEVICE_OPT -Dtest=none -DwildcardSuites=$suite test - else - echo - echo Testing java $suite ... - echo - mvn $MVN_NO_TRANSFER_PROGRESS -Dspark.version=$SPARK_VERSION -DcomputeDevice=$DEVICE_OPT -DwildcardSuites=none -Dtest=$suite test - fi + echo $suite + if [[ $suite == *"com.intel.oap.mllib"* ]]; then + echo + echo Testing $suite ... + echo + mvn $MVN_NO_TRANSFER_PROGRESS -Dspark.version=$SPARK_VERSION -Dtest=none -DcomputeDevice=$DEVICE_OPT -DforkMode=never -Dmaven.test.failure.ignore=true -DfailIfNoTests=false -DwildcardSuites=$suite test + elif [[ $suite == *"org.apache.spark.ml"* ]]; then + echo + echo Testing $suite ... + echo + mvn $MVN_NO_TRANSFER_PROGRESS -Dspark.version=$SPARK_VERSION -Dtest=none -DcomputeDevice=$DEVICE_OPT -DfailIfNoTests=false -DwildcardSuites=$suite test + else + echo + echo Testing Java $suite ... + echo + mvn $MVN_NO_TRANSFER_PROGRESS -Dspark.version=$SPARK_VERSION -Dtest=$suite -DcomputeDevice=$DEVICE_OPT -DwildcardSuites=none test + fi done fi