[ML-248][LinearRegression] Add Linear Regression GPU algorithm (#261)

* File init * Add linear regression framework * Get LinearRegressionDALImpl done * Scala almost done Spark MLlib linearRegression model doesn't match now. * Cpp code done * Fix typo * Add CPU/GPU macro * Remove spark313 * Prepare for review * Remove useless file * Remove other branch edited file * Clean files * Do native code link * Function complete * Format cpp * Fomat cpp clean up * restruct cpu code * Fix typo * Add cpu parameter * Active cpu even use CPU_GPU_PROFILE * Solve namespace problem Take CPU_ONLY_PROFILE and CPU_GPU_PROFILE as exclusive * Fix variable name mismatch * Fix typo * Format cpp * Clean up * Clean up debug log in scala * Fix typo * Revert kmeans change * Remove CPU_ONLY * Revert jni change * run.sh is used to run CPU mode * Add sampile run script for gpu * Update head file * Native code adapt new communicator * Change scala communicator api * Rename LR cpp file and generate new bridge file * Remove debug code * Patition problem will be fixed at data convertion * Revert useless change * Fix indentation * Fix space problem * Fix space * Revert "Fix space problem" This reverts commit 95563a1. * Format cpp * Make parameter simpler * Format cpp * Fix sparse data process * Refine error msg
oap-project · May 6, 2023 · 820f72d · 820f72d
1 parent 4e1ce4d
commit 820f72d
Show file tree

Hide file tree

Showing 10 changed files with 277 additions and 77 deletions.
diff --git a/examples/linear-regression/GetIntelGpuResources.sh b/examples/linear-regression/GetIntelGpuResources.sh
@@ -0,0 +1,20 @@
+#!/usr/bin/env bash
+
+# This script is a basic example script to get resource information about NVIDIA GPUs.
+# It assumes the drivers are properly installed and the nvidia-smi command is available.
+# It is not guaranteed to work on all setups so please test and customize as needed
+# for your environment. It can be passed into SPARK via the config
+# spark.{driver/executor}.resource.gpu.discoveryScript to allow the driver or executor to discover
+# the GPUs it was allocated. It assumes you are running within an isolated container where the
+# GPUs are allocated exclusively to that driver or executor.
+# It outputs a JSON formatted string that is expected by the
+# spark.{driver/executor}.resource.gpu.discoveryScript config.
+#
+# Example output: {"name": "gpu", "addresses":["0","1","2","3","4","5","6","7"]}
+
+#ADDRS=`nvidia-smi --query-gpu=index --format=csv,noheader | sed -e ':a' -e 'N' -e'$!ba' -e 's/\n/","/g'`
+#echo {\"name\": \"gpu\", \"addresses\":[\"$ADDRS\"]}
+#ADDRS="0","1","2","3","4","5","6","7"
+#echo {\"name\": \"gpu\", \"addresses\":[\"$ADDRS\"]}
+
+echo {\"name\": \"gpu\", \"addresses\":[\"0\",\"1\",\"2\",\"3\"]}
diff --git a/examples/linear-regression/IntelGpuResourceFile.json b/examples/linear-regression/IntelGpuResourceFile.json
@@ -0,0 +1 @@
+[{"id":{"componentName": "spark.worker","resourceName":"gpu"},"addresses":["0","1","2","3"]}]
diff --git a/examples/linear-regression/run-gpu-standalone.sh b/examples/linear-regression/run-gpu-standalone.sh
@@ -0,0 +1,42 @@
+#!/usr/bin/env bash
+
+source ../../conf/env.sh
+
+# Data file is from Spark Examples (data/mllib/sample_linear_regression_data.txt) and put in examples/data
+# The data file should be copied to $HDFS_ROOT before running examples
+DATA_FILE=$HDFS_ROOT/data/sample_linear_regression_data.txt
+
+APP_JAR=target/oap-mllib-examples-$OAP_MLLIB_VERSION.jar
+APP_CLASS=org.apache.spark.examples.ml.LinearRegressionExample
+
+DEVICE=GPU
+RESOURCE_FILE=$PWD/IntelGpuResourceFile.json
+WORKER_GPU_AMOUNT=4
+EXECUTOR_GPU_AMOUNT=1
+TASK_GPU_AMOUNT=1
+
+# Should run in standalone mode
+time $SPARK_HOME/bin/spark-submit --master $SPARK_MASTER \
+    --num-executors $SPARK_NUM_EXECUTORS \
+    --executor-cores $SPARK_EXECUTOR_CORES \
+    --total-executor-cores $SPARK_TOTAL_CORES \
+    --driver-memory $SPARK_DRIVER_MEMORY \
+    --executor-memory $SPARK_EXECUTOR_MEMORY \
+    --conf "spark.serializer=org.apache.spark.serializer.KryoSerializer" \
+    --conf "spark.default.parallelism=$SPARK_DEFAULT_PARALLELISM" \
+    --conf "spark.sql.shuffle.partitions=$SPARK_DEFAULT_PARALLELISM" \
+    --conf "spark.driver.extraClassPath=$SPARK_DRIVER_CLASSPATH" \
+    --conf "spark.executor.extraClassPath=$SPARK_EXECUTOR_CLASSPATH" \
+    --conf "spark.oap.mllib.device=$DEVICE" \
+    --conf "spark.worker.resourcesFile=$RESOURCE_FILE" \
+    --conf "spark.worker.resource.gpu.amount=$WORKER_GPU_AMOUNT" \
+    --conf "spark.executor.resource.gpu.amount=$EXECUTOR_GPU_AMOUNT" \
+    --conf "spark.task.resource.gpu.amount=$TASK_GPU_AMOUNT" \
+    --conf "spark.shuffle.reduceLocality.enabled=false" \
+    --conf "spark.network.timeout=1200s" \
+    --conf "spark.task.maxFailures=1" \
+    --jars $OAP_MLLIB_JAR \
+    --class $APP_CLASS \
+    $APP_JAR $DATA_FILE \
+    2>&1 | tee LinearRegression-$(date +%m%d_%H_%M_%S).log
+
diff --git a/examples/linear-regression/run.sh b/examples/linear-regression/run.sh
@@ -19,13 +19,14 @@ time $SPARK_HOME/bin/spark-submit --master $SPARK_MASTER \
     --conf "spark.serializer=org.apache.spark.serializer.KryoSerializer" \
     --conf "spark.default.parallelism=$SPARK_DEFAULT_PARALLELISM" \
     --conf "spark.sql.shuffle.partitions=$SPARK_DEFAULT_PARALLELISM" \
-    --conf "spark.oap.mllib.device=$DEVICE" \
     --conf "spark.driver.extraClassPath=$SPARK_DRIVER_CLASSPATH" \
     --conf "spark.executor.extraClassPath=$SPARK_EXECUTOR_CLASSPATH" \
+    --conf "spark.oap.mllib.device=$DEVICE" \
     --conf "spark.shuffle.reduceLocality.enabled=false" \
     --conf "spark.network.timeout=1200s" \
     --conf "spark.task.maxFailures=1" \
     --jars $OAP_MLLIB_JAR \
     --class $APP_CLASS \
     $APP_JAR $DATA_FILE \
     2>&1 | tee LinearRegression-$(date +%m%d_%H_%M_%S).log
+
diff --git a/...near-regression/src/main/scala/org/apache/spark/examples/ml/LinearRegressionExample.scala b/...near-regression/src/main/scala/org/apache/spark/examples/ml/LinearRegressionExample.scala
@@ -18,8 +18,10 @@
 // scalastyle:off println
 package org.apache.spark.examples.ml
 
+import org.apache.spark.sql.Row
+import org.apache.spark.ml.linalg.Vector
 import scopt.OptionParser
-
+import org.apache.spark.ml.feature.LabeledPoint
 import org.apache.spark.ml.regression.LinearRegression
 import org.apache.spark.sql.{DataFrame, SparkSession}
 
@@ -110,6 +112,14 @@ object LinearRegressionExample {
     val training = spark.read.format("libsvm")
       .load(params.input).toDF("label", "features")
 
+    training.select("label", "features").printSchema()
+    val featuresRDD = training
+      .select("label", "features").rdd.map{
+        case Row(label: Double, feature: Vector) => new LabeledPoint(label, feature.toDense)
+      }
+      import spark.implicits._
+      val df = featuresRDD.toDF("label", "features")
+      df.show(false)
     val lir = new LinearRegression()
       .setFeaturesCol("features")
       .setLabelCol("label")
@@ -120,7 +130,7 @@ object LinearRegressionExample {
 
     // Train the model
     val startTime = System.nanoTime()
-    val lirModel = lir.fit(training)
+    val lirModel = lir.fit(df)
     val elapsedTime = (System.nanoTime() - startTime) / 1e9
     println(s"Training time: $elapsedTime seconds")
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		[{"id":{"componentName": "spark.worker","resourceName":"gpu"},"addresses":["0","1","2","3"]}]