diff --git a/mllib-dal/pom.xml b/mllib-dal/pom.xml
index eed1b4ecc..e4b9808ac 100644
--- a/mllib-dal/pom.xml
+++ b/mllib-dal/pom.xml
@@ -274,6 +274,9 @@
${project.build.directory}/surefire-reports
.
+
+ ${computeDevice}
+
test-reports.txt
diff --git a/mllib-dal/src/main/native/LinearRegressionImpl.cpp b/mllib-dal/src/main/native/LinearRegressionImpl.cpp
index 51e6401b4..433c73a6d 100644
--- a/mllib-dal/src/main/native/LinearRegressionImpl.cpp
+++ b/mllib-dal/src/main/native/LinearRegressionImpl.cpp
@@ -41,11 +41,9 @@ namespace ridge_regression_cpu = daal::algorithms::ridge_regression;
typedef double algorithmFPType; /* Algorithm floating-point type */
-static NumericTablePtr linear_regression_compute(size_t rankId,
- ccl::communicator &comm,
- const NumericTablePtr &pData,
- const NumericTablePtr &pLabel,
- size_t nBlocks) {
+static NumericTablePtr linear_regression_compute(
+ size_t rankId, ccl::communicator &comm, const NumericTablePtr &pData,
+ const NumericTablePtr &pLabel, bool fitIntercept, size_t nBlocks) {
using daal::byte;
linear_regression_cpu::training::Distributed localAlgorithm;
@@ -54,6 +52,7 @@ static NumericTablePtr linear_regression_compute(size_t rankId,
localAlgorithm.input.set(linear_regression_cpu::training::data, pData);
localAlgorithm.input.set(
linear_regression_cpu::training::dependentVariables, pLabel);
+ localAlgorithm.parameter.interceptFlag = fitIntercept;
/* Train the multiple linear regression model on local nodes */
localAlgorithm.compute();
@@ -107,6 +106,7 @@ static NumericTablePtr linear_regression_compute(size_t rankId,
/* Merge and finalizeCompute the multiple linear regression model on the
* master node */
+ masterAlgorithm.parameter.interceptFlag = fitIntercept;
masterAlgorithm.compute();
masterAlgorithm.finalizeCompute();
@@ -125,17 +125,19 @@ static NumericTablePtr linear_regression_compute(size_t rankId,
return resultTable;
}
-static NumericTablePtr ridge_regression_compute(
- size_t rankId, ccl::communicator &comm, const NumericTablePtr &pData,
- const NumericTablePtr &pLabel, double regParam, size_t nBlocks) {
+static NumericTablePtr
+ridge_regression_compute(size_t rankId, ccl::communicator &comm,
+ const NumericTablePtr &pData,
+ const NumericTablePtr &pLabel, bool fitIntercept,
+ double regParam, size_t nBlocks) {
using daal::byte;
NumericTablePtr ridgeParams(new HomogenNumericTable(
1, 1, NumericTable::doAllocate, regParam));
-
ridge_regression_cpu::training::Distributed localAlgorithm;
localAlgorithm.parameter.ridgeParameters = ridgeParams;
+ localAlgorithm.parameter.interceptFlag = fitIntercept;
/* Pass a training data set and dependent values to the algorithm */
localAlgorithm.input.set(ridge_regression_cpu::training::data, pData);
@@ -195,6 +197,7 @@ static NumericTablePtr ridge_regression_compute(
/* Merge and finalizeCompute the multiple ridge regression model on the
* master node */
+ masterAlgorithm.parameter.interceptFlag = fitIntercept;
masterAlgorithm.compute();
masterAlgorithm.finalizeCompute();
@@ -215,11 +218,13 @@ static NumericTablePtr ridge_regression_compute(
#ifdef CPU_GPU_PROFILE
static jlong doLROneAPICompute(JNIEnv *env, size_t rankId,
ccl::communicator &cclComm, sycl::queue &queue,
- jlong pData, jlong pLabel, jint executorNum,
+ jlong pData, jlong pLabel,
+ jboolean jfitIntercept, jint executorNum,
jobject resultObj) {
std::cout << "oneDAL (native): GPU compute start , rankid " << rankId
<< std::endl;
const bool isRoot = (rankId == ccl_root);
+ bool fitIntercept = bool(jfitIntercept);
int size = cclComm.size();
ccl::shared_ptr_class &kvs = getKvs();
@@ -230,7 +235,8 @@ static jlong doLROneAPICompute(JNIEnv *env, size_t rankId,
homogen_table ytrain = *reinterpret_cast(pLabel);
linear_regression_gpu::train_input local_input{xtrain, ytrain};
- const auto linear_regression_desc = linear_regression_gpu::descriptor<>();
+ const auto linear_regression_desc =
+ linear_regression_gpu::descriptor<>(fitIntercept);
linear_regression_gpu::train_result result_train =
preview::train(comm, linear_regression_desc, xtrain, ytrain);
@@ -248,14 +254,14 @@ static jlong doLROneAPICompute(JNIEnv *env, size_t rankId,
/*
* Class: com_intel_oap_mllib_regression_LinearRegressionDALImpl
* Method: cLinearRegressionTrainDAL
- * Signature:
- * (JJDDIIIILjava/lang/String;Lcom/intel/oap/mllib/regression/LiRResult;)J
+ * Signature: (JJZDDIII[ILcom/intel/oap/mllib/regression/LiRResult;)J
*/
JNIEXPORT jlong JNICALL
Java_com_intel_oap_mllib_regression_LinearRegressionDALImpl_cLinearRegressionTrainDAL(
- JNIEnv *env, jobject obj, jlong data, jlong label, jdouble regParam,
- jdouble elasticNetParam, jint executorNum, jint executorCores,
- jint computeDeviceOrdinal, jintArray gpuIdxArray, jobject resultObj) {
+ JNIEnv *env, jobject obj, jlong data, jlong label, jboolean fitIntercept,
+ jdouble regParam, jdouble elasticNetParam, jint executorNum,
+ jint executorCores, jint computeDeviceOrdinal, jintArray gpuIdxArray,
+ jobject resultObj) {
std::cout << "oneDAL (native): use DPC++ kernels "
<< "; device " << ComputeDeviceString[computeDeviceOrdinal]
@@ -284,8 +290,9 @@ Java_com_intel_oap_mllib_regression_LinearRegressionDALImpl_cLinearRegressionTra
jlong pDatagpu = (jlong)data;
jlong pLabelgpu = (jlong)label;
- resultptr = doLROneAPICompute(env, rankId, cclComm, queue, pDatagpu,
- pLabelgpu, executorNum, resultObj);
+ resultptr =
+ doLROneAPICompute(env, rankId, cclComm, queue, pDatagpu, pLabelgpu,
+ fitIntercept, executorNum, resultObj);
env->ReleaseIntArrayElements(gpuIdxArray, gpuIndices, 0);
#endif
} else {
@@ -300,11 +307,12 @@ Java_com_intel_oap_mllib_regression_LinearRegressionDALImpl_cLinearRegressionTra
cout << "oneDAL (native): Number of CPU threads used: " << nThreadsNew
<< endl;
if (regParam == 0) {
- resultTable = linear_regression_compute(rankId, cclComm, pData,
- pLabel, executorNum);
+ resultTable = linear_regression_compute(
+ rankId, cclComm, pData, pLabel, fitIntercept, executorNum);
} else {
- resultTable = ridge_regression_compute(
- rankId, cclComm, pData, pLabel, regParam, executorNum);
+ resultTable =
+ ridge_regression_compute(rankId, cclComm, pData, pLabel,
+ fitIntercept, regParam, executorNum);
}
NumericTablePtr *coeffvectors = new NumericTablePtr(resultTable);
diff --git a/mllib-dal/src/main/native/javah/com_intel_oap_mllib_regression_LinearRegressionDALImpl.h b/mllib-dal/src/main/native/javah/com_intel_oap_mllib_regression_LinearRegressionDALImpl.h
index a9bf8521f..d92ed6ece 100644
--- a/mllib-dal/src/main/native/javah/com_intel_oap_mllib_regression_LinearRegressionDALImpl.h
+++ b/mllib-dal/src/main/native/javah/com_intel_oap_mllib_regression_LinearRegressionDALImpl.h
@@ -10,10 +10,10 @@ extern "C" {
/*
* Class: com_intel_oap_mllib_regression_LinearRegressionDALImpl
* Method: cLinearRegressionTrainDAL
- * Signature: (JJDDIII[ILcom/intel/oap/mllib/regression/LiRResult;)J
+ * Signature: (JJZDDIII[ILcom/intel/oap/mllib/regression/LiRResult;)J
*/
JNIEXPORT jlong JNICALL Java_com_intel_oap_mllib_regression_LinearRegressionDALImpl_cLinearRegressionTrainDAL
- (JNIEnv *, jobject, jlong, jlong, jdouble, jdouble, jint, jint, jint, jintArray, jobject);
+ (JNIEnv *, jobject, jlong, jlong, jboolean, jdouble, jdouble, jint, jint, jint, jintArray, jobject);
#ifdef __cplusplus
}
diff --git a/mllib-dal/src/main/native/javah/com_intel_oneapi_dal_table_ColumnAccessor.h b/mllib-dal/src/main/native/javah/com_intel_oneapi_dal_table_ColumnAccessor.h
index 9f6e9ec8b..079d90aee 100644
--- a/mllib-dal/src/main/native/javah/com_intel_oneapi_dal_table_ColumnAccessor.h
+++ b/mllib-dal/src/main/native/javah/com_intel_oneapi_dal_table_ColumnAccessor.h
@@ -15,20 +15,22 @@ extern "C" {
JNIEXPORT jdoubleArray JNICALL Java_com_intel_oneapi_dal_table_ColumnAccessor_cPullDouble
(JNIEnv *, jobject, jlong, jlong, jlong, jlong, jint);
+
/*
* Class: com_intel_oneapi_dal_table_ColumnAccessor
- * Method: cPullInt
- * Signature: (JJJJI)[I
+ * Method: cPullFloat
+ * Signature: (JJJJI)[F
*/
-JNIEXPORT jintArray JNICALL Java_com_intel_oneapi_dal_table_ColumnAccessor_cPullInt
+JNIEXPORT jfloatArray JNICALL Java_com_intel_oneapi_dal_table_ColumnAccessor_cPullFloat
(JNIEnv *, jobject, jlong, jlong, jlong, jlong, jint);
+
/*
* Class: com_intel_oneapi_dal_table_ColumnAccessor
- * Method: cPullFloat
- * Signature: (JJJJI)[F
+ * Method: cPullInt
+ * Signature: (JJJJI)[I
*/
-JNIEXPORT jfloatArray JNICALL Java_com_intel_oneapi_dal_table_ColumnAccessor_cPullFloat
+JNIEXPORT jintArray JNICALL Java_com_intel_oneapi_dal_table_ColumnAccessor_cPullInt
(JNIEnv *, jobject, jlong, jlong, jlong, jlong, jint);
#ifdef __cplusplus
diff --git a/mllib-dal/src/main/scala/com/intel/oap/mllib/regression/LinearRegressionDALImpl.scala b/mllib-dal/src/main/scala/com/intel/oap/mllib/regression/LinearRegressionDALImpl.scala
index da07c9f7f..27156d784 100644
--- a/mllib-dal/src/main/scala/com/intel/oap/mllib/regression/LinearRegressionDALImpl.scala
+++ b/mllib-dal/src/main/scala/com/intel/oap/mllib/regression/LinearRegressionDALImpl.scala
@@ -74,15 +74,15 @@ class LinearRegressionDALImpl( val fitIntercept: Boolean,
val useDevice = sparkContext.getConf.get("spark.oap.mllib.device", Utils.DefaultComputeDevice)
val computeDevice = Common.ComputeDevice.getDeviceByName(useDevice)
+ val isTest = sparkContext.getConf.getBoolean("spark.oap.mllib.isTest", false)
+
val kvsIPPort = getOneCCLIPPort(labeledPoints.rdd)
val labeledPointsTables = if (useDevice == "GPU") {
if (OneDAL.isDenseDataset(labeledPoints, featuresCol)) {
OneDAL.rddLabeledPointToMergedHomogenTables(labeledPoints, labelCol, featuresCol, executorNum, computeDevice)
} else {
- val msg = s"OAPMLlib: Sparse table is not supported for gpu now."
- //todo sparse table is not supported
-
+ val msg = s"OAP MLlib: Sparse table is not supported for GPU now."
logError(msg)
throw new SparkException(msg)
}
@@ -94,6 +94,13 @@ class LinearRegressionDALImpl( val fitIntercept: Boolean,
}
}
+ // OAP MLlib: Only normal linear regression is supported for GPU currently
+ if (useDevice == "GPU" && regParam != 0){
+ val msg = s"OAP MLlib: Regularization parameter is not supported for GPU now."
+ logError(msg)
+ throw new SparkException(msg)
+ }
+
val results = labeledPointsTables.mapPartitionsWithIndex {
case (rank: Int, tables: Iterator[(Long, Long)]) =>
val (featureTabAddr, lableTabAddr) = tables.next()
@@ -101,8 +108,14 @@ class LinearRegressionDALImpl( val fitIntercept: Boolean,
val result = new LiRResult()
val gpuIndices = if (useDevice == "GPU") {
- val resources = TaskContext.get().resources()
- resources("gpu").addresses.map(_.toInt)
+ // OAP MLlib: This ia a hack for unit test, and will be repaireid in the future.
+ // GPU info can't be detected automatically in UT enviroment.
+ if (isTest) {
+ Array(0)
+ } else {
+ val resources = TaskContext.get().resources()
+ resources("gpu").addresses.map(_.toInt)
+ }
} else {
null
}
@@ -110,6 +123,7 @@ class LinearRegressionDALImpl( val fitIntercept: Boolean,
val cbeta = cLinearRegressionTrainDAL(
featureTabAddr,
lableTabAddr,
+ fitIntercept,
regParam,
elasticNetParam,
executorNum,
@@ -149,6 +163,7 @@ class LinearRegressionDALImpl( val fitIntercept: Boolean,
// Single entry to call Linear Regression DAL backend with parameters
@native private def cLinearRegressionTrainDAL(data: Long,
label: Long,
+ fitIntercept: Boolean,
regParam: Double,
elasticNetParam: Double,
executorNum: Int,
diff --git a/mllib-dal/src/main/scala/org/apache/spark/ml/regression/spark313/LinearRegression.scala b/mllib-dal/src/main/scala/org/apache/spark/ml/regression/spark313/LinearRegression.scala
index f5e93fc77..6a8d1051b 100644
--- a/mllib-dal/src/main/scala/org/apache/spark/ml/regression/spark313/LinearRegression.scala
+++ b/mllib-dal/src/main/scala/org/apache/spark/ml/regression/spark313/LinearRegression.scala
@@ -455,8 +455,9 @@ class LinearRegression @Since("1.3") (@Since("1.3.0") override val uid: String)
private def trainWithNormal(
dataset: Dataset[_],
instr: Instrumentation): LinearRegressionModel = {
- // oneDAL only support simple linear regression and ridge regression
- val paramSupported = ($(regParam) == 0) || ($(regParam) != 0 && $(elasticNetParam) == 0)
+ val paramSupported = ($(regParam) == 0) && (!isDefined(weightCol) || getWeightCol.isEmpty)
+ val sparkContext = dataset.sparkSession.sparkContext
+ val useDevice = sparkContext.getConf.get("spark.oap.mllib.device", Utils.DefaultComputeDevice)
val isPlatformSupported = Utils.checkClusterPlatformCompatibility(
dataset.sparkSession.sparkContext)
if (paramSupported && Utils.isOAPEnabled && isPlatformSupported) {
@@ -489,6 +490,13 @@ class LinearRegression @Since("1.3") (@Since("1.3.0") override val uid: String)
} else {
// For low dimensional data, WeightedLeastSquares is more efficient since the
// training algorithm only requires one pass through the data. (SPARK-10668)
+ val fallbackmsg = s"OAP MLlib: Fall back to vanilla Spark MLlib"
+ logError(fallbackmsg)
+ if (!paramSupported && useDevice == "GPU") {
+ val msg = s"OAP MLlib: Parameter used is not supported for GPU now."
+ logError(msg)
+ throw new SparkException(msg)
+ }
val optimizer = new WeightedLeastSquares($(fitIntercept), $(regParam),
elasticNetParam = $(elasticNetParam), $(standardization), true,
diff --git a/mllib-dal/src/main/scala/org/apache/spark/ml/regression/spark321/LinearRegression.scala b/mllib-dal/src/main/scala/org/apache/spark/ml/regression/spark321/LinearRegression.scala
index 0fb7f6c05..72d2ae717 100644
--- a/mllib-dal/src/main/scala/org/apache/spark/ml/regression/spark321/LinearRegression.scala
+++ b/mllib-dal/src/main/scala/org/apache/spark/ml/regression/spark321/LinearRegression.scala
@@ -454,8 +454,9 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String
private def trainWithNormal(
dataset: Dataset[_],
instr: Instrumentation): LinearRegressionModel = {
- // oneDAL only support simple linear regression and ridge regression
- val paramSupported = ($(regParam) == 0) || ($(regParam) != 0 && $(elasticNetParam) == 0)
+ val paramSupported = ($(regParam) == 0) && (!isDefined(weightCol) || getWeightCol.isEmpty)
+ val sparkContext = dataset.sparkSession.sparkContext
+ val useDevice = sparkContext.getConf.get("spark.oap.mllib.device", Utils.DefaultComputeDevice)
val isPlatformSupported = Utils.checkClusterPlatformCompatibility(
dataset.sparkSession.sparkContext)
if (paramSupported && Utils.isOAPEnabled && isPlatformSupported) {
@@ -488,6 +489,13 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String
} else {
// For low dimensional data, WeightedLeastSquares is more efficient since the
// training algorithm only requires one pass through the data. (SPARK-10668)
+ val fallbackmsg = s"OAP MLlib: Fall back to vanilla Spark MLlib"
+ logError(fallbackmsg)
+ if (!paramSupported && useDevice == "GPU") {
+ val msg = s"OAP MLlib: Parameter used is not supported for GPU now."
+ logError(msg)
+ throw new SparkException(msg)
+ }
val optimizer = new WeightedLeastSquares($(fitIntercept), $(regParam),
elasticNetParam = $(elasticNetParam), $(standardization), true,
diff --git a/mllib-dal/src/test/scala/org/apache/spark/ml/regression/MLlibLinearRegressionSuite.scala b/mllib-dal/src/test/scala/org/apache/spark/ml/regression/MLlibLinearRegressionSuite.scala
index a03a3fe94..d6ff7c377 100755
--- a/mllib-dal/src/test/scala/org/apache/spark/ml/regression/MLlibLinearRegressionSuite.scala
+++ b/mllib-dal/src/test/scala/org/apache/spark/ml/regression/MLlibLinearRegressionSuite.scala
@@ -17,6 +17,8 @@
package org.apache.spark.ml.regression
+import com.intel.oneapi.dal.table.Common
+
import org.apache.spark.{SparkConf, TestCommon}
import scala.collection.JavaConverters._
@@ -37,8 +39,16 @@ class MLlibLinearRegressionSuite extends MLTest with DefaultReadWriteTest with P
import testImplicits._
override def sparkConf: SparkConf = {
+ val device = TestCommon.getComputeDevice.toString
val conf = super.sparkConf
- conf.set("spark.oap.mllib.device", TestCommon.getComputeDevice.toString)
+ if (device == "GPU") {
+ conf.set("spark.oap.mllib.device", Common.ComputeDevice.GPU.toString)
+ conf.set("spark.oap.mllib.isTest", "true")
+ } else {
+ conf.set("spark.oap.mllib.device", TestCommon.getComputeDevice.toString)
+ }
+
+ conf
}
private val seed: Int = 42
@@ -53,6 +63,7 @@ class MLlibLinearRegressionSuite extends MLTest with DefaultReadWriteTest with P
override def beforeAll(): Unit = {
super.beforeAll()
+ val testDevice = TestCommon.getComputeDevice.toString
datasetWithDenseFeature = sc.parallelize(LinearDataGenerator.generateLinearInput(
intercept = 6.3, weights = Array(4.7, 7.2), xMean = Array(0.9, -1.3),
xVariance = Array(0.7, 1.2), nPoints = 10000, seed, eps = 0.1), 2).map(_.asML).toDF()
@@ -218,6 +229,7 @@ class MLlibLinearRegressionSuite extends MLTest with DefaultReadWriteTest with P
}
}
+ // OAP-MLlib: Singular matrice prejudege is not supported.
// test("linear regression handles singular matrices") {
// // check for both constant columns with intercept (zero std) and collinear
// val singularDataConstantColumn = sc.parallelize(Seq(
@@ -296,51 +308,51 @@ class MLlibLinearRegressionSuite extends MLTest with DefaultReadWriteTest with P
}
}
-// test("linear regression without intercept without regularization") {
-// Seq("auto", "l-bfgs", "normal").foreach { solver =>
-// val trainer1 = (new LinearRegression).setFitIntercept(false).setSolver(solver)
-// // Without regularization the results should be the same
-// val trainer2 = (new LinearRegression).setFitIntercept(false).setStandardization(false)
-// .setSolver(solver)
-// val model1 = trainer1.fit(datasetWithDenseFeature)
-// val modelWithoutIntercept1 = trainer1.fit(datasetWithDenseFeatureWithoutIntercept)
-// val model2 = trainer2.fit(datasetWithDenseFeature)
-// val modelWithoutIntercept2 = trainer2.fit(datasetWithDenseFeatureWithoutIntercept)
-//
-// /*
-// coefficients <- coef(glmnet(features, label, family="gaussian", alpha = 0, lambda = 0,
-// intercept = FALSE))
-// > coefficients
-// 3 x 1 sparse Matrix of class "dgCMatrix"
-// s0
-// (Intercept) .
-// as.numeric.data.V2. 6.973403
-// as.numeric.data.V3. 5.284370
-// */
-// val coefficientsR = Vectors.dense(6.973403, 5.284370)
-//
-// assert(model1.intercept ~== 0 absTol 1E-2)
-// assert(model1.coefficients ~= coefficientsR relTol 1E-2)
-// assert(model2.intercept ~== 0 absTol 1E-2)
-// assert(model2.coefficients ~= coefficientsR relTol 1E-2)
-//
-// /*
-// Then again with the data with no intercept:
-// > coefficientsWithoutIntercept
-// 3 x 1 sparse Matrix of class "dgCMatrix"
-// s0
-// (Intercept) .
-// as.numeric.data3.V2. 4.70011
-// as.numeric.data3.V3. 7.19943
-// */
-// val coefficientsWithoutInterceptR = Vectors.dense(4.70011, 7.19943)
-//
-// assert(modelWithoutIntercept1.intercept ~== 0 absTol 1E-3)
-// assert(modelWithoutIntercept1.coefficients ~= coefficientsWithoutInterceptR relTol 1E-3)
-// assert(modelWithoutIntercept2.intercept ~== 0 absTol 1E-3)
-// assert(modelWithoutIntercept2.coefficients ~= coefficientsWithoutInterceptR relTol 1E-3)
-// }
-// }
+ test("linear regression without intercept without regularization") {
+ Seq("auto", "l-bfgs", "normal").foreach { solver =>
+ val trainer1 = (new LinearRegression).setFitIntercept(false).setSolver(solver)
+ // Without regularization the results should be the same
+ val trainer2 = (new LinearRegression).setFitIntercept(false).setStandardization(false)
+ .setSolver(solver)
+ val model1 = trainer1.fit(datasetWithDenseFeature)
+ val modelWithoutIntercept1 = trainer1.fit(datasetWithDenseFeatureWithoutIntercept)
+ val model2 = trainer2.fit(datasetWithDenseFeature)
+ val modelWithoutIntercept2 = trainer2.fit(datasetWithDenseFeatureWithoutIntercept)
+
+ /*
+ coefficients <- coef(glmnet(features, label, family="gaussian", alpha = 0, lambda = 0,
+ intercept = FALSE))
+ > coefficients
+ 3 x 1 sparse Matrix of class "dgCMatrix"
+ s0
+ (Intercept) .
+ as.numeric.data.V2. 6.973403
+ as.numeric.data.V3. 5.284370
+ */
+ val coefficientsR = Vectors.dense(6.973403, 5.284370)
+
+ assert(model1.intercept ~== 0 absTol 1E-2)
+ assert(model1.coefficients ~= coefficientsR relTol 1E-2)
+ assert(model2.intercept ~== 0 absTol 1E-2)
+ assert(model2.coefficients ~= coefficientsR relTol 1E-2)
+
+ /*
+ Then again with the data with no intercept:
+ > coefficientsWithoutIntercept
+ 3 x 1 sparse Matrix of class "dgCMatrix"
+ s0
+ (Intercept) .
+ as.numeric.data3.V2. 4.70011
+ as.numeric.data3.V3. 7.19943
+ */
+ val coefficientsWithoutInterceptR = Vectors.dense(4.70011, 7.19943)
+
+ assert(modelWithoutIntercept1.intercept ~== 0 absTol 1E-3)
+ assert(modelWithoutIntercept1.coefficients ~= coefficientsWithoutInterceptR relTol 1E-3)
+ assert(modelWithoutIntercept2.intercept ~== 0 absTol 1E-3)
+ assert(modelWithoutIntercept2.coefficients ~= coefficientsWithoutInterceptR relTol 1E-3)
+ }
+ }
test("linear regression with intercept with L1 regularization") {
Seq("auto", "l-bfgs", "normal").foreach { solver =>
@@ -446,109 +458,111 @@ class MLlibLinearRegressionSuite extends MLTest with DefaultReadWriteTest with P
}
}
}
-//
-// test("linear regression with intercept with L2 regularization") {
-// Seq("auto", "l-bfgs", "normal").foreach { solver =>
-// val trainer1 = (new LinearRegression).setElasticNetParam(0.0).setRegParam(2.3)
-// .setSolver(solver)
-// val trainer2 = (new LinearRegression).setElasticNetParam(0.0).setRegParam(2.3)
-// .setStandardization(false).setSolver(solver)
-// val model1 = trainer1.fit(datasetWithDenseFeature)
-// val model2 = trainer2.fit(datasetWithDenseFeature)
-//
-// /*
-// coefficients <- coef(glmnet(features, label, family="gaussian", alpha = 0.0, lambda = 2.3))
-// > coefficients
-// 3 x 1 sparse Matrix of class "dgCMatrix"
-// s0
-// (Intercept) 5.260103
-// as.numeric.d1.V2. 3.725522
-// as.numeric.d1.V3. 5.711203
-// */
-// val interceptR1 = 5.260103
-// val coefficientsR1 = Vectors.dense(3.725522, 5.711203)
-//
-// assert(model1.intercept ~== interceptR1 relTol 1E-2)
-// assert(model1.coefficients ~= coefficientsR1 relTol 1E-2)
-//
-// /*
-// coefficients <- coef(glmnet(features, label, family="gaussian", alpha = 0.0, lambda = 2.3,
-// standardize=FALSE))
-// > coefficients
-// 3 x 1 sparse Matrix of class "dgCMatrix"
-// s0
-// (Intercept) 5.790885
-// as.numeric.d1.V2. 3.432373
-// as.numeric.d1.V3. 5.919196
-// */
-// val interceptR2 = 5.790885
-// val coefficientsR2 = Vectors.dense(3.432373, 5.919196)
-//
-// assert(model2.intercept ~== interceptR2 relTol 1E-2)
-// assert(model2.coefficients ~= coefficientsR2 relTol 1E-2)
-//
-// testTransformer[(Double, Vector)](datasetWithDenseFeature, model1,
-// "features", "prediction") {
-// case Row(features: DenseVector, prediction1: Double) =>
-// val prediction2 =
-// features(0) * model1.coefficients(0) + features(1) * model1.coefficients(1) +
-// model1.intercept
-// assert(prediction1 ~== prediction2 relTol 1E-5)
-// }
-// }
-// }
-//
-// test("linear regression without intercept with L2 regularization") {
-// Seq("auto", "l-bfgs", "normal").foreach { solver =>
-// val trainer1 = (new LinearRegression).setElasticNetParam(0.0).setRegParam(2.3)
-// .setFitIntercept(false).setSolver(solver)
-// val trainer2 = (new LinearRegression).setElasticNetParam(0.0).setRegParam(2.3)
-// .setFitIntercept(false).setStandardization(false).setSolver(solver)
-// val model1 = trainer1.fit(datasetWithDenseFeature)
-// val model2 = trainer2.fit(datasetWithDenseFeature)
-//
-// /*
-// coefficients <- coef(glmnet(features, label, family="gaussian", alpha = 0.0, lambda = 2.3,
-// intercept = FALSE))
-// > coefficients
-// 3 x 1 sparse Matrix of class "dgCMatrix"
-// s0
-// (Intercept) .
-// as.numeric.d1.V2. 5.493430
-// as.numeric.d1.V3. 4.223082
-// */
-// val interceptR1 = 0.0
-// val coefficientsR1 = Vectors.dense(5.493430, 4.223082)
-//
-// assert(model1.intercept ~== interceptR1 absTol 1E-2)
-// assert(model1.coefficients ~= coefficientsR1 relTol 1E-2)
-//
-// /*
-// coefficients <- coef(glmnet(features, label, family="gaussian", alpha = 0.0, lambda = 2.3,
-// intercept = FALSE, standardize=FALSE))
-// > coefficients
-// 3 x 1 sparse Matrix of class "dgCMatrix"
-// s0
-// (Intercept) .
-// as.numeric.d1.V2. 5.244324
-// as.numeric.d1.V3. 4.203106
-// */
-// val interceptR2 = 0.0
-// val coefficientsR2 = Vectors.dense(5.244324, 4.203106)
-//
-// assert(model2.intercept ~== interceptR2 absTol 1E-2)
-// assert(model2.coefficients ~= coefficientsR2 relTol 1E-2)
-//
-// testTransformer[(Double, Vector)](datasetWithDenseFeature, model1,
-// "features", "prediction") {
-// case Row(features: DenseVector, prediction1: Double) =>
-// val prediction2 =
-// features(0) * model1.coefficients(0) + features(1) * model1.coefficients(1) +
-// model1.intercept
-// assert(prediction1 ~== prediction2 relTol 1E-5)
-// }
-// }
-// }
+
+ // OAP-MLlib: Ridge regression will fall back now
+ test("linear regression with intercept with L2 regularization") {
+ Seq("auto", "l-bfgs", "normal").foreach { solver =>
+ val trainer1 = (new LinearRegression).setElasticNetParam(0.0).setRegParam(2.3)
+ .setSolver(solver)
+ val trainer2 = (new LinearRegression).setElasticNetParam(0.0).setRegParam(2.3)
+ .setStandardization(false).setSolver(solver)
+ val model1 = trainer1.fit(datasetWithDenseFeature)
+ val model2 = trainer2.fit(datasetWithDenseFeature)
+
+ /*
+ coefficients <- coef(glmnet(features, label, family="gaussian", alpha = 0.0, lambda = 2.3))
+ > coefficients
+ 3 x 1 sparse Matrix of class "dgCMatrix"
+ s0
+ (Intercept) 5.260103
+ as.numeric.d1.V2. 3.725522
+ as.numeric.d1.V3. 5.711203
+ */
+ val interceptR1 = 5.260103
+ val coefficientsR1 = Vectors.dense(3.725522, 5.711203)
+
+ assert(model1.intercept ~== interceptR1 relTol 1E-2)
+ assert(model1.coefficients ~= coefficientsR1 relTol 1E-2)
+
+ /*
+ coefficients <- coef(glmnet(features, label, family="gaussian", alpha = 0.0, lambda = 2.3,
+ standardize=FALSE))
+ > coefficients
+ 3 x 1 sparse Matrix of class "dgCMatrix"
+ s0
+ (Intercept) 5.790885
+ as.numeric.d1.V2. 3.432373
+ as.numeric.d1.V3. 5.919196
+ */
+ val interceptR2 = 5.790885
+ val coefficientsR2 = Vectors.dense(3.432373, 5.919196)
+
+ assert(model2.intercept ~== interceptR2 relTol 1E-2)
+ assert(model2.coefficients ~= coefficientsR2 relTol 1E-2)
+
+ testTransformer[(Double, Vector)](datasetWithDenseFeature, model1,
+ "features", "prediction") {
+ case Row(features: DenseVector, prediction1: Double) =>
+ val prediction2 =
+ features(0) * model1.coefficients(0) + features(1) * model1.coefficients(1) +
+ model1.intercept
+ assert(prediction1 ~== prediction2 relTol 1E-5)
+ }
+ }
+ }
+
+ // OAP-MLlib: Ridge regression will fall back now
+ test("linear regression without intercept with L2 regularization") {
+ Seq("auto", "l-bfgs", "normal").foreach { solver =>
+ val trainer1 = (new LinearRegression).setElasticNetParam(0.0).setRegParam(2.3)
+ .setFitIntercept(false).setSolver(solver)
+ val trainer2 = (new LinearRegression).setElasticNetParam(0.0).setRegParam(2.3)
+ .setFitIntercept(false).setStandardization(false).setSolver(solver)
+ val model1 = trainer1.fit(datasetWithDenseFeature)
+ val model2 = trainer2.fit(datasetWithDenseFeature)
+
+ /*
+ coefficients <- coef(glmnet(features, label, family="gaussian", alpha = 0.0, lambda = 2.3,
+ intercept = FALSE))
+ > coefficients
+ 3 x 1 sparse Matrix of class "dgCMatrix"
+ s0
+ (Intercept) .
+ as.numeric.d1.V2. 5.493430
+ as.numeric.d1.V3. 4.223082
+ */
+ val interceptR1 = 0.0
+ val coefficientsR1 = Vectors.dense(5.493430, 4.223082)
+
+ assert(model1.intercept ~== interceptR1 absTol 1E-2)
+ assert(model1.coefficients ~= coefficientsR1 relTol 1E-2)
+
+ /*
+ coefficients <- coef(glmnet(features, label, family="gaussian", alpha = 0.0, lambda = 2.3,
+ intercept = FALSE, standardize=FALSE))
+ > coefficients
+ 3 x 1 sparse Matrix of class "dgCMatrix"
+ s0
+ (Intercept) .
+ as.numeric.d1.V2. 5.244324
+ as.numeric.d1.V3. 4.203106
+ */
+ val interceptR2 = 0.0
+ val coefficientsR2 = Vectors.dense(5.244324, 4.203106)
+
+ assert(model2.intercept ~== interceptR2 absTol 1E-2)
+ assert(model2.coefficients ~= coefficientsR2 relTol 1E-2)
+
+ testTransformer[(Double, Vector)](datasetWithDenseFeature, model1,
+ "features", "prediction") {
+ case Row(features: DenseVector, prediction1: Double) =>
+ val prediction2 =
+ features(0) * model1.coefficients(0) + features(1) * model1.coefficients(1) +
+ model1.intercept
+ assert(prediction1 ~== prediction2 relTol 1E-5)
+ }
+ }
+ }
test("linear regression with intercept with ElasticNet regularization") {
Seq("auto", "l-bfgs", "normal").foreach { solver =>
@@ -664,69 +678,70 @@ class MLlibLinearRegressionSuite extends MLTest with DefaultReadWriteTest with P
testPredictionModelSinglePrediction(model, datasetWithDenseFeature)
}
-// test("linear regression model with constant label") {
-// /*
-// R code:
-// for (formula in c(b.const ~ . -1, b.const ~ .)) {
-// model <- lm(formula, data=df.const.label, weights=w)
-// print(as.vector(coef(model)))
-// }
-// [1] -9.221298 3.394343
-// [1] 17 0 0
-// */
-// val expected = Seq(
-// Vectors.dense(0.0, -9.221298, 3.394343),
-// Vectors.dense(17.0, 0.0, 0.0))
-//
-// Seq("auto", "l-bfgs", "normal").foreach { solver =>
-// var idx = 0
-// for (fitIntercept <- Seq(false, true)) {
-// val model1 = new LinearRegression()
-// .setFitIntercept(fitIntercept)
-// .setWeightCol("weight")
-// .setPredictionCol("myPrediction")
-// .setSolver(solver)
-// .fit(datasetWithWeightConstantLabel)
-// val actual1 = Vectors.dense(model1.intercept, model1.coefficients(0),
-// model1.coefficients(1))
-// assert(actual1 ~== expected(idx) absTol 1e-4)
-//
-// // Schema of summary.predictions should be a superset of the input dataset
-// assert((datasetWithWeightConstantLabel.schema.fieldNames.toSet + model1.getPredictionCol)
-// .subsetOf(model1.summary.predictions.schema.fieldNames.toSet))
-//
-// val model2 = new LinearRegression()
-// .setFitIntercept(fitIntercept)
-// .setWeightCol("weight")
-// .setPredictionCol("myPrediction")
-// .setSolver(solver)
-// .fit(datasetWithWeightZeroLabel)
-// val actual2 = Vectors.dense(model2.intercept, model2.coefficients(0),
-// model2.coefficients(1))
-// assert(actual2 ~== Vectors.dense(0.0, 0.0, 0.0) absTol 1e-4)
-//
-// // Schema of summary.predictions should be a superset of the input dataset
-// assert((datasetWithWeightZeroLabel.schema.fieldNames.toSet + model2.getPredictionCol)
-// .subsetOf(model2.summary.predictions.schema.fieldNames.toSet))
-//
-// idx += 1
-// }
-// }
-// }
+ test("linear regression model with constant label") {
+ /*
+ R code:
+ for (formula in c(b.const ~ . -1, b.const ~ .)) {
+ model <- lm(formula, data=df.const.label, weights=w)
+ print(as.vector(coef(model)))
+ }
+ [1] -9.221298 3.394343
+ [1] 17 0 0
+ */
+ val expected = Seq(
+ Vectors.dense(0.0, -9.221298, 3.394343),
+ Vectors.dense(17.0, 0.0, 0.0))
-// test("regularized linear regression through origin with constant label") {
-// // The problem is ill-defined if fitIntercept=false, regParam is non-zero.
-// // An exception is thrown in this case.
-// Seq("auto", "l-bfgs", "normal").foreach { solver =>
-// for (standardization <- Seq(false, true)) {
-// val model = new LinearRegression().setFitIntercept(false)
-// .setRegParam(0.1).setStandardization(standardization).setSolver(solver)
-// intercept[IllegalArgumentException] {
-// model.fit(datasetWithWeightConstantLabel)
-// }
-// }
-// }
-// }
+ Seq("auto", "l-bfgs", "normal").foreach { solver =>
+ var idx = 0
+ for (fitIntercept <- Seq(false, true)) {
+ val model1 = new LinearRegression()
+ .setFitIntercept(fitIntercept)
+ .setWeightCol("weight")
+ .setPredictionCol("myPrediction")
+ .setSolver(solver)
+ .fit(datasetWithWeightConstantLabel)
+ val actual1 = Vectors.dense(model1.intercept, model1.coefficients(0),
+ model1.coefficients(1))
+ assert(actual1 ~== expected(idx) absTol 1e-4)
+
+ // Schema of summary.predictions should be a superset of the input dataset
+ assert((datasetWithWeightConstantLabel.schema.fieldNames.toSet + model1.getPredictionCol)
+ .subsetOf(model1.summary.predictions.schema.fieldNames.toSet))
+
+ val model2 = new LinearRegression()
+ .setFitIntercept(fitIntercept)
+ .setWeightCol("weight")
+ .setPredictionCol("myPrediction")
+ .setSolver(solver)
+ .fit(datasetWithWeightZeroLabel)
+ val actual2 = Vectors.dense(model2.intercept, model2.coefficients(0),
+ model2.coefficients(1))
+ assert(actual2 ~== Vectors.dense(0.0, 0.0, 0.0) absTol 1e-4)
+
+ // Schema of summary.predictions should be a superset of the input dataset
+ assert((datasetWithWeightZeroLabel.schema.fieldNames.toSet + model2.getPredictionCol)
+ .subsetOf(model2.summary.predictions.schema.fieldNames.toSet))
+
+ idx += 1
+ }
+ }
+ }
+
+ // OAP-MLlib: origin check with constant label is not supported
+ //test("regularized linear regression through origin with constant label") {
+ // // The problem is ill-defined if fitIntercept=false, regParam is non-zero.
+ // // An exception is thrown in this case.
+ // Seq("auto", "l-bfgs", "normal").foreach { solver =>
+ // for (standardization <- Seq(false, true)) {
+ // val model = new LinearRegression().setFitIntercept(false)
+ // .setRegParam(0.1).setStandardization(standardization).setSolver(solver)
+ // intercept[IllegalArgumentException] {
+ // model.fit(datasetWithWeightConstantLabel)
+ // }
+ // }
+ // }
+ //}
test("linear regression with l-bfgs when training is not needed") {
// When label is constant, l-bfgs solver returns results without training.
@@ -755,6 +770,7 @@ class MLlibLinearRegressionSuite extends MLTest with DefaultReadWriteTest with P
}
}
+ // OAP-MLlib summary is not fully supported
// test("linear regression model training summary") {
// Seq("auto", "l-bfgs", "normal").foreach { solver =>
// val trainer = new LinearRegression().setSolver(solver).setPredictionCol("myPrediction")
@@ -867,467 +883,60 @@ class MLlibLinearRegressionSuite extends MLTest with DefaultReadWriteTest with P
// }
// }
// }
-//
-// test("linear regression model testset evaluation summary") {
-// Seq("auto", "l-bfgs", "normal").foreach { solver =>
-// val trainer = new LinearRegression().setSolver(solver)
-// val model = trainer.fit(datasetWithDenseFeature)
-//
-// // Evaluating on training dataset should yield results summary equal to training summary
-// val testSummary = model.evaluate(datasetWithDenseFeature)
-// assert(model.summary.meanSquaredError ~== testSummary.meanSquaredError relTol 1E-5)
-// assert(model.summary.r2 ~== testSummary.r2 relTol 1E-5)
-// model.summary.residuals.select("residuals").collect()
-// .zip(testSummary.residuals.select("residuals").collect())
-// .forall { case (Row(r1: Double), Row(r2: Double)) => r1 ~== r2 relTol 1E-5 }
-// }
-// }
-//
-// test("linear regression with weighted samples") {
-// val sqlContext = spark.sqlContext
-// import sqlContext.implicits._
-// val numClasses = 0
-// def modelEquals(m1: LinearRegressionModel, m2: LinearRegressionModel): Unit = {
-// assert(m1.coefficients ~== m2.coefficients relTol 0.01)
-// assert(m1.intercept ~== m2.intercept relTol 0.01)
-// }
-// val testParams = Seq(
-// // (elasticNetParam, regParam, fitIntercept, standardization)
-// (0.0, 0.21, true, true),
-// (0.0, 0.21, true, false),
-// (0.0, 0.21, false, false),
-// (1.0, 0.21, true, true)
-// )
-//
-// // For squaredError loss
-// for (solver <- Seq("auto", "l-bfgs", "normal");
-// (elasticNetParam, regParam, fitIntercept, standardization) <- testParams) {
-// val estimator = new LinearRegression()
-// .setFitIntercept(fitIntercept)
-// .setStandardization(standardization)
-// .setRegParam(regParam)
-// .setElasticNetParam(elasticNetParam)
-// .setSolver(solver)
-// .setMaxIter(1)
-// MLTestingUtils.testArbitrarilyScaledWeights[LinearRegressionModel, LinearRegression](
-// datasetWithStrongNoise.as[LabeledPoint], estimator, modelEquals)
-// MLTestingUtils.testOutliersWithSmallWeights[LinearRegressionModel, LinearRegression](
-// datasetWithStrongNoise.as[LabeledPoint], estimator, numClasses, modelEquals,
-// outlierRatio = 3)
-// MLTestingUtils.testOversamplingVsWeighting[LinearRegressionModel, LinearRegression](
-// datasetWithStrongNoise.as[LabeledPoint], estimator, modelEquals, seed)
-// }
-//
-// // For huber loss
-// for ((_, regParam, fitIntercept, standardization) <- testParams) {
-// val estimator = new LinearRegression()
-// .setLoss("huber")
-// .setFitIntercept(fitIntercept)
-// .setStandardization(standardization)
-// .setRegParam(regParam)
-// .setMaxIter(1)
-// MLTestingUtils.testArbitrarilyScaledWeights[LinearRegressionModel, LinearRegression](
-// datasetWithOutlier.as[LabeledPoint], estimator, modelEquals)
-// MLTestingUtils.testOutliersWithSmallWeights[LinearRegressionModel, LinearRegression](
-// datasetWithOutlier.as[LabeledPoint], estimator, numClasses, modelEquals,
-// outlierRatio = 3)
-// MLTestingUtils.testOversamplingVsWeighting[LinearRegressionModel, LinearRegression](
-// datasetWithOutlier.as[LabeledPoint], estimator, modelEquals, seed)
-// }
-// }
-
- test("linear regression model with l-bfgs with big feature datasets") {
- val trainer = new LinearRegression().setSolver("auto")
- val model = trainer.fit(datasetWithSparseFeature)
-
- // Training results for the model should be available
- assert(model.hasSummary)
- // When LBFGS is used as optimizer, objective history can be restored.
- assert(
- model.summary
- .objectiveHistory
- .sliding(2)
- .forall(x => x(0) >= x(1)))
- }
-// test("linear regression summary with weighted samples and intercept by normal solver") {
-// /*
-// R code:
-//
-// model <- glm(formula = "b ~ .", data = df, weights = w)
-// summary(model)
-//
-// Call:
-// glm(formula = "b ~ .", data = df, weights = w)
-//
-// Deviance Residuals:
-// 1 2 3 4
-// 1.920 -1.358 -1.109 0.960
-//
-// Coefficients:
-// Estimate Std. Error t value Pr(>|t|)
-// (Intercept) 18.080 9.608 1.882 0.311
-// V1 6.080 5.556 1.094 0.471
-// V2 -0.600 1.960 -0.306 0.811
-//
-// (Dispersion parameter for gaussian family taken to be 7.68)
-//
-// Null deviance: 202.00 on 3 degrees of freedom
-// Residual deviance: 7.68 on 1 degrees of freedom
-// AIC: 18.783
-//
-// Number of Fisher Scoring iterations: 2
-// */
-//
-// val model = new LinearRegression()
-// .setWeightCol("weight")
-// .setSolver("normal")
-// .fit(datasetWithWeight)
-// val coefficientsR = Vectors.dense(Array(6.080, -0.600))
-// val interceptR = 18.080
-// val devianceResidualsR = Array(-1.358, 1.920)
-// val seCoefR = Array(5.556, 1.960, 9.608)
-// val tValsR = Array(1.094, -0.306, 1.882)
-// val pValsR = Array(0.471, 0.811, 0.311)
-//
-// assert(model.coefficients ~== coefficientsR absTol 1E-3)
-// assert(model.intercept ~== interceptR absTol 1E-3)
-// model.summary.devianceResiduals.zip(devianceResidualsR).foreach { x =>
-// assert(x._1 ~== x._2 absTol 1E-3) }
-// model.summary.coefficientStandardErrors.zip(seCoefR).foreach{ x =>
-// assert(x._1 ~== x._2 absTol 1E-3) }
-// model.summary.tValues.zip(tValsR).foreach{ x => assert(x._1 ~== x._2 absTol 1E-3) }
-// model.summary.pValues.zip(pValsR).foreach{ x => assert(x._1 ~== x._2 absTol 1E-3) }
-//
-// val modelWithL1 = new LinearRegression()
-// .setWeightCol("weight")
-// .setSolver("normal")
-// .setRegParam(0.5)
-// .setElasticNetParam(1.0)
-// .fit(datasetWithWeight)
-//
-// assert(modelWithL1.summary.objectiveHistory !== Array(0.0))
-// assert(
-// modelWithL1.summary
-// .objectiveHistory
-// .sliding(2)
-// .forall(x => x(0) >= x(1)))
-// }
-
-// test("linear regression summary with weighted samples and w/o intercept by normal solver") {
-// /*
-// R code:
-//
-// model <- glm(formula = "b ~ . -1", data = df, weights = w)
-// summary(model)
-//
-// Call:
-// glm(formula = "b ~ . -1", data = df, weights = w)
-//
-// Deviance Residuals:
-// 1 2 3 4
-// 1.950 2.344 -4.600 2.103
-//
-// Coefficients:
-// Estimate Std. Error t value Pr(>|t|)
-// V1 -3.7271 2.9032 -1.284 0.3279
-// V2 3.0100 0.6022 4.998 0.0378 *
-// ---
-// Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-//
-// (Dispersion parameter for gaussian family taken to be 17.4376)
-//
-// Null deviance: 5962.000 on 4 degrees of freedom
-// Residual deviance: 34.875 on 2 degrees of freedom
-// AIC: 22.835
-//
-// Number of Fisher Scoring iterations: 2
-// */
-//
-// val model = new LinearRegression()
-// .setWeightCol("weight")
-// .setSolver("normal")
-// .setFitIntercept(false)
-// .fit(datasetWithWeight)
-// val coefficientsR = Vectors.dense(Array(-3.7271, 3.0100))
-// val interceptR = 0.0
-// val devianceResidualsR = Array(-4.600, 2.344)
-// val seCoefR = Array(2.9032, 0.6022)
-// val tValsR = Array(-1.284, 4.998)
-// val pValsR = Array(0.3279, 0.0378)
-//
-// assert(model.coefficients ~== coefficientsR absTol 1E-3)
-// assert(model.intercept === interceptR)
-// model.summary.devianceResiduals.zip(devianceResidualsR).foreach { x =>
-// assert(x._1 ~== x._2 absTol 1E-3) }
-// model.summary.coefficientStandardErrors.zip(seCoefR).foreach{ x =>
-// assert(x._1 ~== x._2 absTol 1E-3) }
-// model.summary.tValues.zip(tValsR).foreach{ x => assert(x._1 ~== x._2 absTol 1E-3) }
-// model.summary.pValues.zip(pValsR).foreach{ x => assert(x._1 ~== x._2 absTol 1E-3) }
-// }
-
- test("read/write") {
- def checkModelData(model: LinearRegressionModel, model2: LinearRegressionModel): Unit = {
- assert(model.intercept === model2.intercept)
- assert(model.coefficients === model2.coefficients)
+ test("linear regression with weighted samples") {
+ val sqlContext = spark.sqlContext
+ import sqlContext.implicits._
+ val numClasses = 0
+ def modelEquals(m1: LinearRegressionModel, m2: LinearRegressionModel): Unit = {
+ assert(m1.coefficients ~== m2.coefficients relTol 0.01)
+ assert(m1.intercept ~== m2.intercept relTol 0.01)
+ }
+ val testParams = Seq(
+ // (elasticNetParam, regParam, fitIntercept, standardization)
+ (0.0, 0.21, true, true),
+ (0.0, 0.21, true, false),
+ (0.0, 0.21, false, false),
+ (1.0, 0.21, true, true)
+ )
+
+ // For squaredError loss
+ for (solver <- Seq("auto", "l-bfgs", "normal");
+ (elasticNetParam, regParam, fitIntercept, standardization) <- testParams) {
+ val estimator = new LinearRegression()
+ .setFitIntercept(fitIntercept)
+ .setStandardization(standardization)
+ .setRegParam(regParam)
+ .setElasticNetParam(elasticNetParam)
+ .setSolver(solver)
+ .setMaxIter(1)
+ MLTestingUtils.testArbitrarilyScaledWeights[LinearRegressionModel, LinearRegression](
+ datasetWithStrongNoise.as[LabeledPoint], estimator, modelEquals)
+ MLTestingUtils.testOutliersWithSmallWeights[LinearRegressionModel, LinearRegression](
+ datasetWithStrongNoise.as[LabeledPoint], estimator, numClasses, modelEquals,
+ outlierRatio = 3)
+ MLTestingUtils.testOversamplingVsWeighting[LinearRegressionModel, LinearRegression](
+ datasetWithStrongNoise.as[LabeledPoint], estimator, modelEquals, seed)
}
- val lr = new LinearRegression()
- testEstimatorAndModelReadWrite(lr, datasetWithWeight, LinearRegressionSuite.allParamSettings,
- LinearRegressionSuite.allParamSettings, checkModelData)
- }
-
-// test("pmml export") {
-// val lr = new LinearRegression()
-// val model = lr.fit(datasetWithWeight)
-// def checkModel(pmml: PMML): Unit = {
-// val dd = pmml.getDataDictionary
-// assert(dd.getNumberOfFields === 3)
-// val fields = dd.getDataFields.asScala
-// assert(fields(0).getName().toString === "field_0")
-// assert(fields(0).getOpType() == OpType.CONTINUOUS)
-// val pmmlRegressionModel = pmml.getModels().get(0).asInstanceOf[PMMLRegressionModel]
-// val pmmlPredictors = pmmlRegressionModel.getRegressionTables.get(0).getNumericPredictors
-// val pmmlWeights = pmmlPredictors.asScala.map(_.getCoefficient()).toList
-// assert(pmmlWeights(0) ~== model.coefficients(0) relTol 1E-3)
-// assert(pmmlWeights(1) ~== model.coefficients(1) relTol 1E-3)
-// }
-// testPMMLWrite(sc, model, checkModel)
-// }
-
-// test("should support all NumericType labels and weights, and not support other types") {
-// for (solver <- Seq("auto", "l-bfgs", "normal")) {
-// val lr = new LinearRegression().setMaxIter(1).setSolver(solver)
-// MLTestingUtils.checkNumericTypes[LinearRegressionModel, LinearRegression](
-// lr, spark, isClassification = false) { (expected, actual) =>
-// assert(expected.intercept === actual.intercept)
-// assert(expected.coefficients === actual.coefficients)
-// }
-// }
-// }
-
- test("linear regression (huber loss) with intercept without regularization") {
- val trainer1 = (new LinearRegression).setLoss("huber")
- .setFitIntercept(true).setStandardization(true)
- val trainer2 = (new LinearRegression).setLoss("huber")
- .setFitIntercept(true).setStandardization(false)
-
- val model1 = trainer1.fit(datasetWithOutlier)
- val model2 = trainer2.fit(datasetWithOutlier)
-
- /*
- Using the following Python code to load the data and train the model using
- scikit-learn package.
-
- import pandas as pd
- import numpy as np
- from sklearn.linear_model import HuberRegressor
- df = pd.read_csv("path", header = None)
- X = df[df.columns[1:3]]
- y = np.array(df[df.columns[0]])
- huber = HuberRegressor(fit_intercept=True, alpha=0.0, max_iter=100, epsilon=1.35)
- huber.fit(X, y)
-
- >>> huber.coef_
- array([ 4.68998007, 7.19429011])
- >>> huber.intercept_
- 6.3002404351083037
- >>> huber.scale_
- 0.077810159205220747
- */
- val coefficientsPy = Vectors.dense(4.68998007, 7.19429011)
- val interceptPy = 6.30024044
- val scalePy = 0.07781016
-
- assert(model1.coefficients ~= coefficientsPy relTol 1E-3)
- assert(model1.intercept ~== interceptPy relTol 1E-3)
- assert(model1.scale ~== scalePy relTol 1E-3)
-
- // Without regularization, with or without standardization will converge to the same solution.
- assert(model2.coefficients ~= coefficientsPy relTol 1E-3)
- assert(model2.intercept ~== interceptPy relTol 1E-3)
- assert(model2.scale ~== scalePy relTol 1E-3)
- }
-
- test("linear regression (huber loss) without intercept without regularization") {
- val trainer1 = (new LinearRegression).setLoss("huber")
- .setFitIntercept(false).setStandardization(true)
- val trainer2 = (new LinearRegression).setLoss("huber")
- .setFitIntercept(false).setStandardization(false)
-
- val model1 = trainer1.fit(datasetWithOutlier)
- val model2 = trainer2.fit(datasetWithOutlier)
-
- /*
- huber = HuberRegressor(fit_intercept=False, alpha=0.0, max_iter=100, epsilon=1.35)
- huber.fit(X, y)
-
- >>> huber.coef_
- array([ 6.71756703, 5.08873222])
- >>> huber.intercept_
- 0.0
- >>> huber.scale_
- 2.5560209922722317
- */
- val coefficientsPy = Vectors.dense(6.71756703, 5.08873222)
- val interceptPy = 0.0
- val scalePy = 2.55602099
-
- assert(model1.coefficients ~= coefficientsPy relTol 1E-3)
- assert(model1.intercept === interceptPy)
- assert(model1.scale ~== scalePy relTol 1E-3)
-
- // Without regularization, with or without standardization will converge to the same solution.
- assert(model2.coefficients ~= coefficientsPy relTol 1E-3)
- assert(model2.intercept === interceptPy)
- assert(model2.scale ~== scalePy relTol 1E-3)
- }
-
- test("linear regression (huber loss) with intercept with L2 regularization") {
- val trainer1 = (new LinearRegression).setLoss("huber")
- .setFitIntercept(true).setRegParam(0.21).setStandardization(true)
- val trainer2 = (new LinearRegression).setLoss("huber")
- .setFitIntercept(true).setRegParam(0.21).setStandardization(false)
-
- val model1 = trainer1.fit(datasetWithOutlier)
- val model2 = trainer2.fit(datasetWithOutlier)
-
- /*
- Since scikit-learn HuberRegressor does not support standardization,
- we do it manually out of the estimator.
-
- xStd = np.std(X, axis=0)
- scaledX = X / xStd
- huber = HuberRegressor(fit_intercept=True, alpha=210, max_iter=100, epsilon=1.35)
- huber.fit(scaledX, y)
-
- >>> np.array(huber.coef_ / xStd)
- array([ 1.97732633, 3.38816722])
- >>> huber.intercept_
- 3.7527581430531227
- >>> huber.scale_
- 3.787363673371801
- */
- val coefficientsPy1 = Vectors.dense(1.97732633, 3.38816722)
- val interceptPy1 = 3.75275814
- val scalePy1 = 3.78736367
-
- assert(model1.coefficients ~= coefficientsPy1 relTol 1E-2)
- assert(model1.intercept ~== interceptPy1 relTol 1E-2)
- assert(model1.scale ~== scalePy1 relTol 1E-2)
-
- /*
- huber = HuberRegressor(fit_intercept=True, alpha=210, max_iter=100, epsilon=1.35)
- huber.fit(X, y)
-
- >>> huber.coef_
- array([ 1.73346444, 3.63746999])
- >>> huber.intercept_
- 4.3017134790781739
- >>> huber.scale_
- 3.6472742809286793
- */
- val coefficientsPy2 = Vectors.dense(1.73346444, 3.63746999)
- val interceptPy2 = 4.30171347
- val scalePy2 = 3.64727428
-
- assert(model2.coefficients ~= coefficientsPy2 relTol 1E-3)
- assert(model2.intercept ~== interceptPy2 relTol 1E-3)
- assert(model2.scale ~== scalePy2 relTol 1E-3)
- }
-
- test("linear regression (huber loss) without intercept with L2 regularization") {
- val trainer1 = (new LinearRegression).setLoss("huber")
- .setFitIntercept(false).setRegParam(0.21).setStandardization(true)
- val trainer2 = (new LinearRegression).setLoss("huber")
- .setFitIntercept(false).setRegParam(0.21).setStandardization(false)
-
- val model1 = trainer1.fit(datasetWithOutlier)
- val model2 = trainer2.fit(datasetWithOutlier)
-
- /*
- Since scikit-learn HuberRegressor does not support standardization,
- we do it manually out of the estimator.
-
- xStd = np.std(X, axis=0)
- scaledX = X / xStd
- huber = HuberRegressor(fit_intercept=False, alpha=210, max_iter=100, epsilon=1.35)
- huber.fit(scaledX, y)
-
- >>> np.array(huber.coef_ / xStd)
- array([ 2.59679008, 2.26973102])
- >>> huber.intercept_
- 0.0
- >>> huber.scale_
- 4.5766311924091791
- */
- val coefficientsPy1 = Vectors.dense(2.59679008, 2.26973102)
- val interceptPy1 = 0.0
- val scalePy1 = 4.57663119
-
- assert(model1.coefficients ~= coefficientsPy1 relTol 1E-2)
- assert(model1.intercept === interceptPy1)
- assert(model1.scale ~== scalePy1 relTol 1E-2)
-
- /*
- huber = HuberRegressor(fit_intercept=False, alpha=210, max_iter=100, epsilon=1.35)
- huber.fit(X, y)
-
- >>> huber.coef_
- array([ 2.28423908, 2.25196887])
- >>> huber.intercept_
- 0.0
- >>> huber.scale_
- 4.5979643506051753
- */
- val coefficientsPy2 = Vectors.dense(2.28423908, 2.25196887)
- val interceptPy2 = 0.0
- val scalePy2 = 4.59796435
-
- assert(model2.coefficients ~= coefficientsPy2 relTol 1E-3)
- assert(model2.intercept === interceptPy2)
- assert(model2.scale ~== scalePy2 relTol 1E-3)
- }
-
- test("huber loss model match squared error for large epsilon") {
- val trainer1 = new LinearRegression().setLoss("huber").setEpsilon(1E5)
- val model1 = trainer1.fit(datasetWithOutlier)
- val trainer2 = new LinearRegression()
- val model2 = trainer2.fit(datasetWithOutlier)
- assert(model1.coefficients ~== model2.coefficients relTol 1E-3)
- assert(model1.intercept ~== model2.intercept relTol 1E-3)
- }
-
- test("oap-mllib-163: should support all kind of labelCol name and featuresCol name") {
- val df_1 = datasetWithDenseFeature.toDF("label_alias", "features_alias")
- val nb_1 = new LinearRegression().setLabelCol("label_alias")
- val e_1 = intercept[IllegalArgumentException](nb_1.fit(df_1)).getMessage
- assert(e_1.contains("features does not exist. Available: label_alias, features_alias"))
-
- val df_2 = datasetWithDenseFeature.toDF("label_alias", "features_alias")
- val nb_2 = new LinearRegression().setFeaturesCol("features_alias")
- val e_2 = intercept[IllegalArgumentException](nb_2.fit(df_2)).getMessage
- assert(e_2.contains("label does not exist. Available: label_alias, features_alias"))
- val df_3 = datasetWithDenseFeature.toDF("label_alias", "features_alias")
- val nb_3 = new LinearRegression().setLabelCol("label_alias").setFeaturesCol("features_alias")
- val model = nb_3.fit(df_3)
- assert(model.hasParent)
- assert(model.hasSummary)
+ // For huber loss
+ for ((_, regParam, fitIntercept, standardization) <- testParams) {
+ val estimator = new LinearRegression()
+ .setLoss("huber")
+ .setFitIntercept(fitIntercept)
+ .setStandardization(standardization)
+ .setRegParam(regParam)
+ .setMaxIter(1)
+ MLTestingUtils.testArbitrarilyScaledWeights[LinearRegressionModel, LinearRegression](
+ datasetWithOutlier.as[LabeledPoint], estimator, modelEquals)
+ MLTestingUtils.testOutliersWithSmallWeights[LinearRegressionModel, LinearRegression](
+ datasetWithOutlier.as[LabeledPoint], estimator, numClasses, modelEquals,
+ outlierRatio = 3)
+ MLTestingUtils.testOversamplingVsWeighting[LinearRegressionModel, LinearRegression](
+ datasetWithOutlier.as[LabeledPoint], estimator, modelEquals, seed)
+ }
}
- test("oap-mllib-163: should support column in any order") {
- val df_1 = datasetWithDenseFeature.select("label", "features").toDF()
- val nb_1 = new LinearRegression().setLabelCol("label").setFeaturesCol("features")
- val model_1 = nb_1.fit(df_1)
- assert(model_1.hasParent)
- assert(model_1.hasSummary)
-
- val df_2 = datasetWithDenseFeature.select("features", "label").toDF()
- val nb_2 = new LinearRegression().setLabelCol("label").setFeaturesCol("features")
- val model_2 = nb_2.fit(df_2)
- assert(model_2.hasParent)
- assert(model_2.hasSummary)
- }
}
object LinearRegressionSuite {
diff --git a/mllib-dal/test.sh b/mllib-dal/test.sh
index 07ef18366..0d509481b 100755
--- a/mllib-dal/test.sh
+++ b/mllib-dal/test.sh
@@ -52,11 +52,11 @@ MVN_NO_TRANSFER_PROGRESS=
print_usage() {
echo
- echo "Usage: ./test.sh [-p ] [-q] [-h] [-d] "
+ echo "Usage: ./test.sh [-p ] [-q] [-h] [-d] "
echo
echo "-p Supported Platform Profiles:"
- echo " CPU_ONLY_PROFILE"
- echo " CPU_GPU_PROFILE"
+ echo " CPU_GPU_PROFILE (Default)"
+ echo " CPU_ONLY_PROFILE"
echo
}
@@ -78,10 +78,12 @@ shift "$((OPTIND-1))"
SUITE=$*
print_usage
+echo "Testing suite(s) $SUITE ..."
+echo
-if [[ ! ($PLATFORM_PROFILE == CPU_ONLY_PROFILE || $PLATFORM_PROFILE == CPU_GPU_PROFILE) ]]; then
+if [[ ! ($PLATFORM_PROFILE == CPU_GPU_PROFILE || $PLATFORM_PROFILE == CPU_ONLY_PROFILE) ]]; then
echo
- echo Platform Profile should be CPU_ONLY_PROFILE or CPU_GPU_PROFILE, but \"$PLATFORM_PROFILE\" found!
+ echo Platform Profile should be CPU_GPU_PROFILE or CPU_ONLY_PROFILE, but \"$PLATFORM_PROFILE\" found!
echo
exit 1
fi
@@ -93,7 +95,7 @@ if [ "HOST" = $DEVICE_OPT ]; then
"RowAccessorTest" \
"com.intel.oap.mllib.ConvertHomogenTableSuite"
)
-else
+elif [ "CPU" = $DEVICE_OPT ]; then
suiteArray=(
"org.apache.spark.ml.clustering.MLlibKMeansSuite" \
"org.apache.spark.ml.feature.MLlibPCASuite" \
@@ -103,6 +105,13 @@ else
"org.apache.spark.ml.stat.MLlibCorrelationSuite" \
"org.apache.spark.ml.stat.MLlibSummarizerSuite"
)
+elif [ "GPU" = $DEVICE_OPT ]; then
+ suiteArray=(
+ "org.apache.spark.ml.regression.MLlibLinearRegressionSuite"
+ )
+else
+ echo Error: $DEVICE_OPT is not supported!
+ exit 1
fi
if [[ ! ${suiteArray[*]} =~ $SUITE ]]; then
@@ -114,18 +123,18 @@ fi
SCRIPT_DIR=$( cd $(dirname ${BASH_SOURCE[0]}) && pwd )
OAP_MLLIB_ROOT=$(cd $SCRIPT_DIR/.. && pwd)
source $OAP_MLLIB_ROOT/RELEASE
-
+# Set defaults from RELEASE envs
export SPARK_VERSION=${SPARK_OPT:-$SPARK_VERSION}
export PLATFORM_PROFILE=${PLATFORM_OPT:-$PLATFORM_PROFILE}
-echo "computeDevice : $DEVICE_OPT"
+echo "Using Compute Device: $DEVICE_OPT"
echo
echo === Testing Environments ===
-echo JAVA_HOME=$JAVA_HOME
-echo Maven Version: $(mvn -v | head -n 1 | cut -f3 -d" ")
-echo Spark Version: $SPARK_VERSION
echo Platform Profile: $PLATFORM_PROFILE
+echo Spark Version: $SPARK_VERSION
+echo Maven Version: $(mvn -v | head -n 1 | cut -f3 -d" ")
+echo JAVA_HOME=$JAVA_HOME
echo ============================
echo
@@ -153,21 +162,22 @@ else
SUBSUITE=$(echo $SUITE | tr "," "\n")
for suite in ${SUBSUITE[*]}
do
- if [[ $suite == *"com.intel.oap.mllib"* ]]; then
- echo
- echo Testing $suite ...
- echo
- mvn $MVN_NO_TRANSFER_PROGRESS -Dspark.version=$SPARK_VERSION -DcomputeDevice=$DEVICE_OPT -Dtest=none -DforkMode=never -Dmaven.test.failure.ignore=true -DfailIfNoTests=false -DwildcardSuites=$suite test
- elif [[ $suite == *"org.apache.spark.ml"* ]]; then
- echo
- echo Testing $suite ...
- echo
- mvn $MVN_NO_TRANSFER_PROGRESS -Dspark.version=$SPARK_VERSION -DcomputeDevice=$DEVICE_OPT -Dtest=none -DwildcardSuites=$suite test
- else
- echo
- echo Testing java $suite ...
- echo
- mvn $MVN_NO_TRANSFER_PROGRESS -Dspark.version=$SPARK_VERSION -DcomputeDevice=$DEVICE_OPT -DwildcardSuites=none -Dtest=$suite test
- fi
+ echo $suite
+ if [[ $suite == *"com.intel.oap.mllib"* ]]; then
+ echo
+ echo Testing $suite ...
+ echo
+ mvn $MVN_NO_TRANSFER_PROGRESS -Dspark.version=$SPARK_VERSION -Dtest=none -DcomputeDevice=$DEVICE_OPT -DforkMode=never -Dmaven.test.failure.ignore=true -DfailIfNoTests=false -DwildcardSuites=$suite test
+ elif [[ $suite == *"org.apache.spark.ml"* ]]; then
+ echo
+ echo Testing $suite ...
+ echo
+ mvn $MVN_NO_TRANSFER_PROGRESS -Dspark.version=$SPARK_VERSION -Dtest=none -DcomputeDevice=$DEVICE_OPT -DfailIfNoTests=false -DwildcardSuites=$suite test
+ else
+ echo
+ echo Testing Java $suite ...
+ echo
+ mvn $MVN_NO_TRANSFER_PROGRESS -Dspark.version=$SPARK_VERSION -Dtest=$suite -DcomputeDevice=$DEVICE_OPT -DwildcardSuites=none test
+ fi
done
fi