[ML-282] Refactor CPU & GPU examples (#306)
* First move

* Move device discovery for Scala

* Delete old gpu discovery

* Add run-all-gpu

* Add clean up

* Add tmp utils file

* Add exe

* Rename run script

* Scala gpu done

* Scala cpu done

* For ci

* pyspark ci

* Rename scala

* Rename scala file in scripts

* Pyspark unit done

* Update pyspark utils

* Update ci

* Remove tmp utils

* Rename utils

* Change absolute path, rm als gpu.sh

* Scala absolute path

* Change sanity check

* Rename ci

* Split random_forest

* Fix name change in ci

* Fix path typo

* Fix typo
argentea authored Jun 27, 2023
1 parent f4e6b49 commit e19ab80
Showing 64 changed files with 212 additions and 90 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/ci-tests.yml
@@ -72,7 +72,7 @@ jobs:
${{ runner.os }}-
- name: Cluster Test
run: |
-${{github.workspace}}/dev/ci/ci-yarn-test.sh
+${{github.workspace}}/dev/ci/ci-yarn-test-cpu.sh
standalone-test:
name: Standalone CPU_GPU_PROFILE Test for Examples (CPU)
runs-on: ubuntu-20.04
@@ -95,4 +95,4 @@ jobs:
${{ runner.os }}-
- name: Cluster Test
run: |
-${{github.workspace}}/dev/ci/ci-standalone-test.sh
+${{github.workspace}}/dev/ci/ci-standalone-test-cpu.sh
5 changes: 2 additions & 3 deletions README.md
@@ -131,9 +131,8 @@ Edit related variables in "`Minimum Settings`" of `env.sh`
#### Run K-means

```bash
-$ cd examples/kmeans
-$ ./build.sh
-$ ./run.sh
+$ cd examples/python/kmeans-pyspark
+$ ./run-cpu.sh
```

### PySpark Support
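The README change reflects the refactor's new layout: examples now live under `examples/python/` and `examples/scala/`, each with per-device run scripts instead of a single `run.sh`. A partial sketch of the resulting tree, inferred from the renames in this commit (only a few examples shown):

```bash
# Layout after this commit (partial, inferred from the diff):
# examples/
#   python/kmeans-pyspark/   kmeans-pyspark.py  run-cpu.sh  run-gpu.sh
#   python/pca-pyspark/      pca-pyspark.py     run-cpu.sh  run-gpu.sh
#   scala/kmeans-scala/      build.sh           run-cpu.sh  run-gpu.sh
ls examples/python examples/scala   # quick way to verify after checkout
```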
File renamed without changes.
File renamed without changes.
@@ -33,5 +33,5 @@ echo "========================================="

# Build and run all examples
./build-all-scala.sh
-./run-all-scala.sh
-./run-all-pyspark.sh
+./run-all-scala-cpu.sh
+./run-all-pyspark-cpu.sh
4 changes: 2 additions & 2 deletions dev/ci/ci-yarn-test.sh → dev/ci/ci-yarn-test-cpu.sh
@@ -36,5 +36,5 @@ echo "========================================="

# Build and run all examples
./build-all-scala.sh
-./run-all-scala.sh
-./run-all-pyspark.sh
+./run-all-scala-cpu.sh
+./run-all-pyspark-cpu.sh
7 changes: 6 additions & 1 deletion examples/build-all-scala.sh
@@ -1,6 +1,9 @@
#!/usr/bin/env bash

-exampleDirs=(kmeans pca als naive-bayes linear-regression correlation summarizer)
+exampleDirs=(kmeans-scala pca-scala als-scala naive-bayes-scala \
+             linear-regression-scala correlation-scala summarizer-scala)

+cd scala

for dir in ${exampleDirs[*]}
do
@@ -13,3 +16,5 @@ do
./build.sh
cd ..
done

+cd ..
20 changes: 20 additions & 0 deletions examples/clean-all-scala-targets.sh
@@ -0,0 +1,20 @@
#!/usr/bin/env bash

exampleDirs=(kmeans-scala pca-scala als-scala naive-bayes-scala \
linear-regression-scala correlation-scala summarizer-scala)

cd scala

for dir in ${exampleDirs[*]}
do
cd $dir
echo
echo ==========================
echo Cleaning $dir ...
echo ==========================
echo
rm -rf ./target/
cd ..
done

cd ..
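Since the new clean-up script `cd`s into `scala/` relative to the working directory, it has to be invoked from `examples/`. A minimal usage sketch:

```bash
# Run from the repository's examples/ directory; the script iterates
# over each *-scala example and deletes its build output under target/.
cd examples
./clean-all-scala-targets.sh
```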
1 change: 0 additions & 1 deletion examples/kmeans/IntelGpuResourceFile.json

This file was deleted.

20 changes: 0 additions & 20 deletions examples/linear-regression/GetIntelGpuResources.sh

This file was deleted.

1 change: 0 additions & 1 deletion examples/linear-regression/IntelGpuResourceFile.json

This file was deleted.

20 changes: 0 additions & 20 deletions examples/pca/GetIntelGpuResources.sh

This file was deleted.

1 change: 0 additions & 1 deletion examples/pca/IntelGpuResourceFile.json

This file was deleted.

File renamed without changes.
@@ -1,6 +1,7 @@
#!/usr/bin/env bash

-source ../../conf/env.sh
+CONF_PATH=$PWD/../../../conf
+source $CONF_PATH/env.sh

# Data file is converted from oneDAL examples ($DAALROOT/examples/daal/data/batch/implicit_als_csr.csv)
# The data file should be copied to $HDFS_ROOT before running examples
File renamed without changes.
@@ -1,6 +1,7 @@
#!/usr/bin/env bash

-source ../../conf/env.sh
+CONF_PATH=$PWD/../../../conf
+source $CONF_PATH/env.sh

# Data file is from Spark Examples (data/mllib/sample_kmeans_data.txt) and put in examples/data
# The data file should be copied to $HDFS_ROOT before running examples
40 changes: 40 additions & 0 deletions examples/python/kmeans-pyspark/run-gpu.sh
@@ -0,0 +1,40 @@
#!/usr/bin/env bash

CONF_PATH=$PWD/../../../conf
source $CONF_PATH/env.sh

# Data file is from Spark Examples (data/mllib/sample_kmeans_data.txt) and put in examples/data
# The data file should be copied to $HDFS_ROOT before running examples
DATA_FILE=$HDFS_ROOT/data/sample_kmeans_data.txt

DEVICE=GPU
RESOURCE_FILE=$CONF_PATH/IntelGpuResourceFile.json
WORKER_GPU_AMOUNT=4
EXECUTOR_GPU_AMOUNT=1
TASK_GPU_AMOUNT=1
APP_PY=kmeans-pyspark.py


# Should run in standalone mode
time $SPARK_HOME/bin/spark-submit --master $SPARK_MASTER \
--num-executors $SPARK_NUM_EXECUTORS \
--executor-cores $SPARK_EXECUTOR_CORES \
--total-executor-cores $SPARK_TOTAL_CORES \
--driver-memory $SPARK_DRIVER_MEMORY \
--executor-memory $SPARK_EXECUTOR_MEMORY \
--conf "spark.serializer=org.apache.spark.serializer.KryoSerializer" \
--conf "spark.default.parallelism=$SPARK_DEFAULT_PARALLELISM" \
--conf "spark.sql.shuffle.partitions=$SPARK_DEFAULT_PARALLELISM" \
--conf "spark.driver.extraClassPath=$SPARK_DRIVER_CLASSPATH" \
--conf "spark.executor.extraClassPath=$SPARK_EXECUTOR_CLASSPATH" \
--conf "spark.oap.mllib.device=$DEVICE" \
--conf "spark.worker.resourcesFile=$RESOURCE_FILE" \
--conf "spark.worker.resource.gpu.amount=$WORKER_GPU_AMOUNT" \
--conf "spark.executor.resource.gpu.amount=$EXECUTOR_GPU_AMOUNT" \
--conf "spark.task.resource.gpu.amount=$TASK_GPU_AMOUNT" \
--conf "spark.shuffle.reduceLocality.enabled=false" \
--conf "spark.network.timeout=1200s" \
--conf "spark.task.maxFailures=1" \
--jars $OAP_MLLIB_JAR \
$APP_PY $DATA_FILE \
2>&1 | tee KMeans-$(date +%m%d_%H_%M_%S).log
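The GPU script points `spark.worker.resourcesFile` at `conf/IntelGpuResourceFile.json`, replacing the per-example copies deleted above. That file is not shown in this diff; as a sketch, Spark's standalone worker expects a JSON list of ResourceAllocation entries, so a file consistent with `WORKER_GPU_AMOUNT=4` would plausibly be written like this (the addresses are illustrative, and the committed file may differ):

```bash
# Hypothetical reconstruction of conf/IntelGpuResourceFile.json -- not
# part of this diff. Format follows Spark's standalone resourcesFile spec.
cat > conf/IntelGpuResourceFile.json <<'EOF'
[{"id": {"componentName": "spark.worker", "resourceName": "gpu"},
  "addresses": ["0", "1", "2", "3"]}]
EOF
```

With 4 GPUs per worker, `EXECUTOR_GPU_AMOUNT=1` and `TASK_GPU_AMOUNT=1` let a worker host up to four single-GPU executors, each running one GPU task at a time. The pca-pyspark and random-forest `run-gpu.sh` scripts below reuse the same file.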
File renamed without changes.
@@ -1,6 +1,7 @@
#!/usr/bin/env bash

-source ../../conf/env.sh
+CONF_PATH=$PWD/../../../conf
+source $CONF_PATH/env.sh

# CSV data is the same as in Spark example "ml/pca_example.py"
# The data file should be copied to $HDFS_ROOT before running examples
40 changes: 40 additions & 0 deletions examples/python/pca-pyspark/run-gpu.sh
@@ -0,0 +1,40 @@
#!/usr/bin/env bash

CONF_PATH=$PWD/../../../conf
source $CONF_PATH/env.sh

# CSV data is the same as in Spark example "ml/pca_example.py"
# The data file should be copied to $HDFS_ROOT before running examples
DATA_FILE=$HDFS_ROOT/data/pca_data.csv

DEVICE=GPU
RESOURCE_FILE=$CONF_PATH/IntelGpuResourceFile.json
WORKER_GPU_AMOUNT=4
EXECUTOR_GPU_AMOUNT=1
TASK_GPU_AMOUNT=1
APP_PY=pca-pyspark.py


# Should run in standalone mode
time $SPARK_HOME/bin/spark-submit --master $SPARK_MASTER \
--num-executors $SPARK_NUM_EXECUTORS \
--executor-cores $SPARK_EXECUTOR_CORES \
--total-executor-cores $SPARK_TOTAL_CORES \
--driver-memory $SPARK_DRIVER_MEMORY \
--executor-memory $SPARK_EXECUTOR_MEMORY \
--conf "spark.serializer=org.apache.spark.serializer.KryoSerializer" \
--conf "spark.default.parallelism=$SPARK_DEFAULT_PARALLELISM" \
--conf "spark.sql.shuffle.partitions=$SPARK_DEFAULT_PARALLELISM" \
--conf "spark.driver.extraClassPath=$SPARK_DRIVER_CLASSPATH" \
--conf "spark.executor.extraClassPath=$SPARK_EXECUTOR_CLASSPATH" \
--conf "spark.oap.mllib.device=$DEVICE" \
--conf "spark.worker.resourcesFile=$RESOURCE_FILE" \
--conf "spark.worker.resource.gpu.amount=$WORKER_GPU_AMOUNT" \
--conf "spark.executor.resource.gpu.amount=$EXECUTOR_GPU_AMOUNT" \
--conf "spark.task.resource.gpu.amount=$TASK_GPU_AMOUNT" \
--conf "spark.shuffle.reduceLocality.enabled=false" \
--conf "spark.network.timeout=1200s" \
--conf "spark.task.maxFailures=1" \
--jars $OAP_MLLIB_JAR \
$APP_PY $DATA_FILE \
2>&1 | tee PCA-$(date +%m%d_%H_%M_%S).log
@@ -1,19 +1,21 @@
#!/usr/bin/env bash

-source ../../conf/env.sh
+CONF_PATH=$PWD/../../../conf
+source $CONF_PATH/env.sh

# Data file is from Spark Examples (data/mllib/sample_libsvm_data.txt)
# The data file should be copied to $HDFS_ROOT before running examples
DATA_FILE=$HDFS_ROOT/data/sample_libsvm_data.txt

DEVICE=GPU
-RESOURCE_FILE=$PWD/IntelGpuResourceFile.json
+RESOURCE_FILE=$CONF_PATH/IntelGpuResourceFile.json
WORKER_GPU_AMOUNT=4
EXECUTOR_GPU_AMOUNT=1
TASK_GPU_AMOUNT=1
APP_PY=random_forest_classifier_example.py


# Should run in standalone mode
time $SPARK_HOME/bin/spark-submit --master $SPARK_MASTER \
--num-executors $SPARK_NUM_EXECUTORS \
--executor-cores $SPARK_EXECUTOR_CORES \
@@ -34,5 +36,5 @@ time $SPARK_HOME/bin/spark-submit --master $SPARK_MASTER \
--conf "spark.network.timeout=1200s" \
--conf "spark.task.maxFailures=1" \
--jars $OAP_MLLIB_JAR \
-$APP_PY DATA_FILE \
+$APP_PY $DATA_FILE \
2>&1 | tee random_forest_classifier-$(date +%m%d_%H_%M_%S).log
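The last hunk is a genuine bug fix rather than part of the rename: without the `$`, spark-submit received the literal word `DATA_FILE` as the application argument instead of the HDFS path. A minimal shell illustration (the path is illustrative):

```bash
# bash only substitutes variables referenced with '$':
DATA_FILE=hdfs://namenode/data/sample_libsvm_data.txt
echo app.py DATA_FILE    # prints: app.py DATA_FILE   <- the old bug
echo app.py $DATA_FILE   # prints: app.py hdfs://namenode/data/sample_libsvm_data.txt
```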
@@ -1,19 +1,21 @@
#!/usr/bin/env bash

-source ../../conf/env.sh
+CONF_PATH=$PWD/../../../conf
+source $CONF_PATH/env.sh

# Data file is from Spark Examples (data/mllib/sample_libsvm_data.txt)
# The data file should be copied to $HDFS_ROOT before running examples
DATA_FILE=$HDFS_ROOT/data/sample_libsvm_data.txt

DEVICE=GPU
-RESOURCE_FILE=$PWD/IntelGpuResourceFile.json
+RESOURCE_FILE=$CONF_PATH/IntelGpuResourceFile.json
WORKER_GPU_AMOUNT=4
EXECUTOR_GPU_AMOUNT=1
TASK_GPU_AMOUNT=1
APP_PY=random_forest_regressor_example.py


# Should run in standalone mode
time $SPARK_HOME/bin/spark-submit --master $SPARK_MASTER \
--num-executors $SPARK_NUM_EXECUTORS \
--executor-cores $SPARK_EXECUTOR_CORES \
@@ -34,5 +36,5 @@ time $SPARK_HOME/bin/spark-submit --master $SPARK_MASTER \
--conf "spark.network.timeout=1200s" \
--conf "spark.task.maxFailures=1" \
--jars $OAP_MLLIB_JAR \
-$APP_PY DATA_FILE \
+$APP_PY $DATA_FILE \
2>&1 | tee random_forest_regressor-$(date +%m%d_%H_%M_%S).log
1 change: 0 additions & 1 deletion examples/random-forest-pyspark/IntelGpuResourceFile.json

This file was deleted.

@@ -2,6 +2,8 @@

exampleDirs=(kmeans-pyspark pca-pyspark als-pyspark)

+cd python

for dir in ${exampleDirs[*]}
do
cd $dir
@@ -10,6 +12,8 @@ do
echo Running $dir ...
echo ==========================
echo
-./run.sh
+./run-cpu.sh
cd ..
done

+cd ..
20 changes: 20 additions & 0 deletions examples/run-all-pyspark-gpu.sh
@@ -0,0 +1,20 @@
#!/usr/bin/env bash

exampleDirs=(kmeans-pyspark pca-pyspark als-pyspark \
random-forest-regressor-pyspark random-forest-classifier-pyspark)

cd python

for dir in ${exampleDirs[*]}
do
cd $dir
echo
echo ==========================
echo Running $dir ...
echo ==========================
echo
./run-gpu.sh
cd ..
done

cd ..
20 changes: 20 additions & 0 deletions examples/run-all-scala-cpu.sh
@@ -0,0 +1,20 @@
#!/usr/bin/env bash

exampleDirs=(kmeans-scala pca-scala als-scala naive-bayes-scala \
linear-regression-scala correlation-scala summarizer-scala)

cd scala

for dir in ${exampleDirs[*]}
do
cd $dir
echo
echo ==========================
echo Running $dir ...
echo ==========================
echo
./run-cpu.sh
cd ..
done

cd ..
8 changes: 6 additions & 2 deletions examples/run-all-scala.sh → examples/run-all-scala-gpu.sh
@@ -1,6 +1,8 @@
#!/usr/bin/env bash

-exampleDirs=(kmeans pca als naive-bayes linear-regression correlation summarizer)
+exampleDirs=(kmeans-scala pca-scala linear-regression-scala correlation-scala summarizer-scala)

+cd scala

for dir in ${exampleDirs[*]}
do
@@ -10,6 +12,8 @@ do
echo Running $dir ...
echo ==========================
echo
-./run.sh
+./run-gpu.sh
cd ..
done

+cd ..
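Note that the GPU runner's list omits `als-scala` and `naive-bayes-scala`, which appear only in the CPU runner at this point. Mirroring what the CI scripts above do for the CPU profile, a plausible end-to-end GPU run, assuming a standalone cluster with GPUs and a configured `conf/env.sh`, would be:

```bash
cd examples
./build-all-scala.sh        # build every Scala example once
./run-all-scala-gpu.sh      # run the GPU-enabled Scala examples
./run-all-pyspark-gpu.sh    # run the GPU-enabled PySpark examples
```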
File renamed without changes.
File renamed without changes.
3 changes: 2 additions & 1 deletion examples/als/run.sh → examples/scala/als-scala/run-cpu.sh
@@ -1,6 +1,7 @@
#!/usr/bin/env bash

-source ../../conf/env.sh
+CONF_PATH=$PWD/../../../conf
+source $CONF_PATH/env.sh

# Data file is converted from oneDAL examples ($DAALROOT/examples/daal/data/batch/implicit_als_csr.csv)
# The data file should be copied to $HDFS_ROOT before running examples
File renamed without changes.
File renamed without changes.