From e920b9834b49f77b925e82aa9b148cc5c992d18d Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Tue, 30 Mar 2021 12:29:41 +0800 Subject: [PATCH 01/62] remove hibench examples --- examples/als-hibench/build.sh | 3 - examples/als-hibench/pom.xml | 100 ---------------- examples/als-hibench/run-hibench-oap-mllib.sh | 73 ------------ examples/als-hibench/run-hibench-vanilla.sh | 61 ---------- .../hibench/sparkbench/ml/ALSExample.scala | 111 ------------------ examples/kmeans-hibench/build.sh | 3 - examples/kmeans-hibench/pom.xml | 99 ---------------- .../kmeans-hibench/run-hibench-oap-mllib.sh | 86 -------------- .../kmeans-hibench/run-hibench-vanilla.sh | 58 --------- .../hibench/sparkbench/ml/DenseKMeansDS.scala | 107 ----------------- 10 files changed, 701 deletions(-) delete mode 100755 examples/als-hibench/build.sh delete mode 100644 examples/als-hibench/pom.xml delete mode 100755 examples/als-hibench/run-hibench-oap-mllib.sh delete mode 100755 examples/als-hibench/run-hibench-vanilla.sh delete mode 100644 examples/als-hibench/src/main/scala/com/intel/hibench/sparkbench/ml/ALSExample.scala delete mode 100755 examples/kmeans-hibench/build.sh delete mode 100644 examples/kmeans-hibench/pom.xml delete mode 100755 examples/kmeans-hibench/run-hibench-oap-mllib.sh delete mode 100755 examples/kmeans-hibench/run-hibench-vanilla.sh delete mode 100644 examples/kmeans-hibench/src/main/scala/com/intel/hibench/sparkbench/ml/DenseKMeansDS.scala diff --git a/examples/als-hibench/build.sh b/examples/als-hibench/build.sh deleted file mode 100755 index 8cbc692be..000000000 --- a/examples/als-hibench/build.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/usr/bin/env bash - -mvn clean package \ No newline at end of file diff --git a/examples/als-hibench/pom.xml b/examples/als-hibench/pom.xml deleted file mode 100644 index 68e02c256..000000000 --- a/examples/als-hibench/pom.xml +++ /dev/null @@ -1,100 +0,0 @@ - - 4.0.0 - - com.intel.oap - oap-mllib-examples - 0.9.0-with-spark-3.0.0 - jar - - ALSHiBenchExample - https://github.com/Intel-bigdata/OAP - - - UTF-8 - 2.12.10 - 2.12 - 3.0.0 - - - - - - org.scala-lang - scala-library - 2.12.10 - - - - com.github.scopt - scopt_2.12 - 3.7.0 - - - - - - - - - - - org.apache.spark - spark-sql_2.12 - ${spark.version} - provided - - - - org.apache.spark - spark-mllib_2.12 - ${spark.version} - provided - - - - - - - - org.scala-tools - maven-scala-plugin - 2.15.2 - - - - compile - testCompile - - - - - ${scala.version} - - -target:jvm-1.8 - - - - - maven-assembly-plugin - 3.0.0 - - false - - jar-with-dependencies - - - - - assembly - package - - single - - - - - - - - diff --git a/examples/als-hibench/run-hibench-oap-mllib.sh b/examples/als-hibench/run-hibench-oap-mllib.sh deleted file mode 100755 index 050b80558..000000000 --- a/examples/als-hibench/run-hibench-oap-mllib.sh +++ /dev/null @@ -1,73 +0,0 @@ -#!/usr/bin/env bash - -export HDFS_ROOT=hdfs://sr591:8020 -export OAP_MLLIB_ROOT=/home/xiaochang/Works/OAP-xwu99-als/oap-mllib - -SPARK_MASTER=yarn -SPARK_DRIVER_MEMORY=16G -SPARK_NUM_EXECUTORS=6 -SPARK_EXECUTOR_CORES=28 -SPARK_EXECUTOR_MEMORY_OVERHEAD=25G -SPARK_EXECUTOR_MEMORY=100G - -SPARK_DEFAULT_PARALLELISM=$(expr $SPARK_NUM_EXECUTORS '*' $SPARK_EXECUTOR_CORES '*' 2) -#SPARK_DEFAULT_PARALLELISM=$(expr $SPARK_NUM_EXECUTORS '*' $SPARK_EXECUTOR_CORES) - -# ======================================================= # - -# for log suffix -SUFFIX=$( basename -s .sh "${BASH_SOURCE[0]}" ) - -# Check envs -if [[ -z $SPARK_HOME ]]; then - echo SPARK_HOME not defined! 
- exit 1 -fi - -if [[ -z $HADOOP_HOME ]]; then - echo HADOOP_HOME not defined! - exit 1 -fi - -export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop - -# Target jar built -OAP_MLLIB_JAR_NAME=oap-mllib-0.9.0-with-spark-3.0.0.jar -OAP_MLLIB_JAR=$OAP_MLLIB_ROOT/mllib-dal/target/$OAP_MLLIB_JAR_NAME - -# Use absolute path -SPARK_DRIVER_CLASSPATH=$OAP_MLLIB_JAR -# Use relative path -SPARK_EXECUTOR_CLASSPATH=./$OAP_MLLIB_JAR_NAME - -APP_JAR=target/oap-mllib-examples-0.9.0-with-spark-3.0.0.jar -APP_CLASS=com.intel.hibench.sparkbench.ml.ALSExample - -HDFS_INPUT=hdfs://sr591:8020/HiBench/ALS/Input -RANK=10 -NUM_ITERATIONS=1 -LAMBDA=0.1 -IMPLICIT=true - -/usr/bin/time -p $SPARK_HOME/bin/spark-submit --master $SPARK_MASTER -v \ - --num-executors $SPARK_NUM_EXECUTORS \ - --driver-memory $SPARK_DRIVER_MEMORY \ - --executor-cores $SPARK_EXECUTOR_CORES \ - --executor-memory $SPARK_EXECUTOR_MEMORY \ - --conf "spark.serializer=org.apache.spark.serializer.KryoSerializer" \ - --conf "spark.default.parallelism=$SPARK_DEFAULT_PARALLELISM" \ - --conf "spark.sql.shuffle.partitions=$SPARK_DEFAULT_PARALLELISM" \ - --conf "spark.driver.extraClassPath=$SPARK_DRIVER_CLASSPATH" \ - --conf "spark.executor.extraClassPath=$SPARK_EXECUTOR_CLASSPATH" \ - --conf "spark.shuffle.reduceLocality.enabled=false" \ - --conf "spark.executor.memoryOverhead=$SPARK_EXECUTOR_MEMORY_OVERHEAD" \ - --conf "spark.network.timeout=1200s" \ - --conf "spark.task.maxFailures=1" \ - --jars $OAP_MLLIB_JAR \ - --class $APP_CLASS \ - $APP_JAR \ - --rank $RANK --numIterations $NUM_ITERATIONS --implicitPrefs $IMPLICIT --lambda $LAMBDA \ - --numProductBlocks $SPARK_DEFAULT_PARALLELISM --numUserBlocks $SPARK_DEFAULT_PARALLELISM \ - $HDFS_INPUT \ - 2>&1 | tee ALS-$SUFFIX-$(date +%m%d_%H_%M_%S).log - diff --git a/examples/als-hibench/run-hibench-vanilla.sh b/examples/als-hibench/run-hibench-vanilla.sh deleted file mode 100755 index 6cb6b3ae7..000000000 --- a/examples/als-hibench/run-hibench-vanilla.sh +++ /dev/null @@ -1,61 +0,0 @@ -#!/usr/bin/env bash - -export HDFS_ROOT=hdfs://sr591:8020 - -SPARK_MASTER=yarn -SPARK_DRIVER_MEMORY=16G -SPARK_NUM_EXECUTORS=6 -SPARK_EXECUTOR_CORES=28 -SPARK_EXECUTOR_MEMORY_OVERHEAD=25G -SPARK_EXECUTOR_MEMORY=100G - -SPARK_DEFAULT_PARALLELISM=$(expr $SPARK_NUM_EXECUTORS '*' $SPARK_EXECUTOR_CORES '*' 2) - -# ======================================================= # - -# for log suffix -SUFFIX=$( basename -s .sh "${BASH_SOURCE[0]}" ) - -# Check envs -if [[ -z $SPARK_HOME ]]; then - echo SPARK_HOME not defined! - exit 1 -fi - -if [[ -z $HADOOP_HOME ]]; then - echo HADOOP_HOME not defined! 
- exit 1 -fi - -export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop - -APP_JAR=target/oap-mllib-examples-0.9.0-with-spark-3.0.0.jar -APP_CLASS=com.intel.hibench.sparkbench.ml.ALSExample - -HDFS_INPUT=hdfs://sr591:8020/HiBench/ALS/Input -RANK=10 -NUM_ITERATIONS=1 -LAMBDA=0.1 -IMPLICIT=true - -/usr/bin/time -p $SPARK_HOME/bin/spark-submit --master $SPARK_MASTER -v \ - --num-executors $SPARK_NUM_EXECUTORS \ - --driver-memory $SPARK_DRIVER_MEMORY \ - --executor-cores $SPARK_EXECUTOR_CORES \ - --executor-memory $SPARK_EXECUTOR_MEMORY \ - --conf "spark.serializer=org.apache.spark.serializer.KryoSerializer" \ - --conf "spark.default.parallelism=$SPARK_DEFAULT_PARALLELISM" \ - --conf "spark.sql.shuffle.partitions=$SPARK_DEFAULT_PARALLELISM" \ - --conf "spark.driver.extraClassPath=$SPARK_DRIVER_CLASSPATH" \ - --conf "spark.executor.extraClassPath=$SPARK_EXECUTOR_CLASSPATH" \ - --conf "spark.shuffle.reduceLocality.enabled=false" \ - --conf "spark.executor.memoryOverhead=$SPARK_EXECUTOR_MEMORY_OVERHEAD" \ - --conf "spark.network.timeout=1200s" \ - --conf "spark.task.maxFailures=1" \ - --class $APP_CLASS \ - $APP_JAR \ - --rank $RANK --numIterations $NUM_ITERATIONS --implicitPrefs $IMPLICIT --lambda $LAMBDA \ - --numProductBlocks $SPARK_DEFAULT_PARALLELISM --numUserBlocks $SPARK_DEFAULT_PARALLELISM \ - $HDFS_INPUT \ - 2>&1 | tee ALS-$SUFFIX-$(date +%m%d_%H_%M_%S).log - diff --git a/examples/als-hibench/src/main/scala/com/intel/hibench/sparkbench/ml/ALSExample.scala b/examples/als-hibench/src/main/scala/com/intel/hibench/sparkbench/ml/ALSExample.scala deleted file mode 100644 index 5a29bcc80..000000000 --- a/examples/als-hibench/src/main/scala/com/intel/hibench/sparkbench/ml/ALSExample.scala +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package com.intel.hibench.sparkbench.ml - -import org.apache.spark.ml.evaluation.RegressionEvaluator -import org.apache.spark.ml.recommendation.ALS -import org.apache.spark.ml.recommendation.ALS.Rating -import org.apache.spark.sql.SparkSession -import scopt.OptionParser - -object ALSExample { - - case class Params( - dataPath: String = null, - numIterations: Int = 10, - lambda: Double = 0.1, - rank: Int = 10, - numUserBlocks: Int = 10, - numItemBlocks: Int = 10, - implicitPrefs: Boolean = false) - - def main(args: Array[String]) { - val defaultParams = Params() - - val parser = new OptionParser[Params]("ALS") { - head("ALS: an example app for ALS on User-Item data.") - opt[Int]("rank") - .text(s"rank, default: ${defaultParams.rank}") - .action((x, c) => c.copy(rank = x)) - opt[Int]("numIterations") - .text(s"number of iterations, default: ${defaultParams.numIterations}") - .action((x, c) => c.copy(numIterations = x)) - opt[Double]("lambda") - .text(s"regularization parameter, default: ${defaultParams.lambda}") - .action((x, c) => c.copy(lambda = x)) - opt[Int]("numUserBlocks") - .text(s"number of user blocks, default: ${defaultParams.numUserBlocks}") - .action((x, c) => c.copy(numUserBlocks = x)) - opt[Int]("numProductBlocks") - .text(s"number of product blocks, default: ${defaultParams.numItemBlocks}") - .action((x, c) => c.copy(numItemBlocks = x)) - opt[Boolean]("implicitPrefs") - .text("implicit preference, default: ${defaultParams.implicitPrefs}") - .action((x, c) => c.copy(implicitPrefs = x)) - arg[String]("") - .required() - .text("Input paths to a User-Product dataset of ratings") - .action((x, c) => c.copy(dataPath = x)) - } - parser.parse(args, defaultParams) match { - case Some(params) => run(params) - case _ => sys.exit(1) - } - } - - def run(params: Params): Unit = { - val spark = SparkSession - .builder - .appName(s"ALS with $params") - .getOrCreate() - val sc = spark.sparkContext - - import spark.implicits._ - - val ratings = sc.objectFile[Rating[Int]](params.dataPath).toDF() - - val Array(training, test) = ratings.randomSplit(Array(0.8, 0.2), 1L) - - // Build the recommendation model using ALS on the training data - val als = new ALS() - .setRank(params.rank) - .setMaxIter(params.numIterations) - .setRegParam(params.lambda) - .setImplicitPrefs(params.implicitPrefs) - .setNumUserBlocks(params.numUserBlocks) - .setNumItemBlocks(params.numItemBlocks) - .setUserCol("user") - .setItemCol("item") - .setRatingCol("rating") - val model = als.fit(training) - - // Evaluate the model by computing the RMSE on the test data - // Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics - model.setColdStartStrategy("drop") - val predictions = model.transform(test) - - val evaluator = new RegressionEvaluator() - .setMetricName("rmse") - .setLabelCol("rating") - .setPredictionCol("prediction") - val rmse = evaluator.evaluate(predictions) - println(s"Root-mean-square error = $rmse") - - spark.stop() - } -} diff --git a/examples/kmeans-hibench/build.sh b/examples/kmeans-hibench/build.sh deleted file mode 100755 index da373645b..000000000 --- a/examples/kmeans-hibench/build.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/usr/bin/env bash - -mvn clean package diff --git a/examples/kmeans-hibench/pom.xml b/examples/kmeans-hibench/pom.xml deleted file mode 100644 index 3f5b56e29..000000000 --- a/examples/kmeans-hibench/pom.xml +++ /dev/null @@ -1,99 +0,0 @@ - - 4.0.0 - - com.intel.oap - oap-mllib-examples - 1.1.0-with-spark-3.0.0 - jar - - KMeansHiBenchExample - 
https://github.com/oap-project/oap-mllib.git - - - UTF-8 - 2.12.10 - 2.12 - 3.0.0 - - - - - - org.scala-lang - scala-library - 2.12.10 - - - - com.github.scopt - scopt_2.12 - 3.7.0 - - - - org.apache.mahout - mahout-hdfs - 14.1 - - - - org.apache.spark - spark-sql_2.12 - ${spark.version} - provided - - - - org.apache.spark - spark-mllib_2.12 - ${spark.version} - provided - - - - - - - - org.scala-tools - maven-scala-plugin - 2.15.2 - - - - compile - testCompile - - - - - ${scala.version} - - -target:jvm-1.8 - - - - - maven-assembly-plugin - 3.0.0 - - false - - jar-with-dependencies - - - - - assembly - package - - single - - - - - - - - diff --git a/examples/kmeans-hibench/run-hibench-oap-mllib.sh b/examples/kmeans-hibench/run-hibench-oap-mllib.sh deleted file mode 100755 index caa42584f..000000000 --- a/examples/kmeans-hibench/run-hibench-oap-mllib.sh +++ /dev/null @@ -1,86 +0,0 @@ -#!/usr/bin/env bash - -# == User to customize the following environments ======= # - -# Set user Spark and Hadoop home directory -export SPARK_HOME=/path/to/your/spark/home -export HADOOP_HOME=/path/to/your/hadoop/home -# Set user HDFS Root -export HDFS_ROOT=hdfs://your_hostname:8020 -# Set user Intel MLlib Root directory -export OAP_MLLIB_ROOT=/path/to/your/OAP/oap-mllib -# Set IP and Port for oneCCL KVS, you can select any one of the worker nodes and set CCL_KVS_IP_PORT to its IP and Port -# IP can be got with `hostname -I`, if multiple IPs are returned, the first IP should be used. Port can be any available port. -# For example, if one of the worker IP is 192.168.0.1 and an available port is 51234. -# CCL_KVS_IP_PORT can be set in the format of 192.168.0.1_51234 -# Incorrectly setting this value will result in hanging when oneCCL initialize -export CCL_KVS_IP_PORT=192.168.0.1_51234 - -# == User to customize Spark executor cores and memory == # - -# User should check the requested resources are acturally allocated by cluster manager or Intel MLlib will behave incorrectly -SPARK_MASTER=yarn -SPARK_DRIVER_MEMORY=8G -SPARK_NUM_EXECUTORS=6 -SPARK_EXECUTOR_CORES=15 -SPARK_EXECUTOR_MEMORY_OVERHEAD=25G -SPARK_EXECUTOR_MEMORY=50G - -SPARK_DEFAULT_PARALLELISM=$(expr $SPARK_NUM_EXECUTORS '*' $SPARK_EXECUTOR_CORES '*' 2) - -# ======================================================= # - -# for log suffix -SUFFIX=$( basename -s .sh "${BASH_SOURCE[0]}" ) - -# Check envs -if [[ -z $SPARK_HOME ]]; then - echo SPARK_HOME not defined! - exit 1 -fi - -if [[ -z $HADOOP_HOME ]]; then - echo HADOOP_HOME not defined! 
- exit 1 -fi - -export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop - -# Target jar built -OAP_MLLIB_JAR_NAME=oap-mllib-0.9.0-with-spark-3.0.0.jar -OAP_MLLIB_JAR=$OAP_MLLIB_ROOT/mllib-dal/target/$OAP_MLLIB_JAR_NAME - -# Use absolute path -SPARK_DRIVER_CLASSPATH=$OAP_MLLIB_JAR -# Use relative path -SPARK_EXECUTOR_CLASSPATH=./$OAP_MLLIB_JAR_NAME - -APP_JAR=target/oap-mllib-examples-0.9.0-with-spark-3.0.0.jar -APP_CLASS=com.intel.hibench.sparkbench.ml.DenseKMeansDS - -K=200 -INIT_MODE=Random -MAX_ITERATION=20 -INPUT_HDFS=$HDFS_ROOT/HiBench/Kmeans/Input/samples - -/usr/bin/time -p $SPARK_HOME/bin/spark-submit --master $SPARK_MASTER -v \ - --num-executors $SPARK_NUM_EXECUTORS \ - --driver-memory $SPARK_DRIVER_MEMORY \ - --executor-cores $SPARK_EXECUTOR_CORES \ - --executor-memory $SPARK_EXECUTOR_MEMORY \ - --conf "spark.serializer=org.apache.spark.serializer.KryoSerializer" \ - --conf "spark.default.parallelism=$SPARK_DEFAULT_PARALLELISM" \ - --conf "spark.sql.shuffle.partitions=$SPARK_DEFAULT_PARALLELISM" \ - --conf "spark.driver.extraClassPath=$SPARK_DRIVER_CLASSPATH" \ - --conf "spark.executor.extraClassPath=$SPARK_EXECUTOR_CLASSPATH" \ - --conf "spark.executorEnv.CCL_KVS_IP_PORT=$CCL_KVS_IP_PORT" \ - --conf "spark.shuffle.reduceLocality.enabled=false" \ - --conf "spark.executor.memoryOverhead=$SPARK_EXECUTOR_MEMORY_OVERHEAD" \ - --conf "spark.memory.fraction=0.8" \ - --conf "spark.network.timeout=1200s" \ - --conf "spark.task.maxFailures=1" \ - --jars $OAP_MLLIB_JAR \ - --class $APP_CLASS \ - $APP_JAR \ - -k $K --initMode $INIT_MODE --numIterations $MAX_ITERATION $INPUT_HDFS \ - 2>&1 | tee KMeansHiBench-$SUFFIX-$(date +%m%d_%H_%M_%S).log diff --git a/examples/kmeans-hibench/run-hibench-vanilla.sh b/examples/kmeans-hibench/run-hibench-vanilla.sh deleted file mode 100755 index 475c25aff..000000000 --- a/examples/kmeans-hibench/run-hibench-vanilla.sh +++ /dev/null @@ -1,58 +0,0 @@ -#!/usr/bin/env bash - -# == User to customize the following environments ======= # - -# Set user Spark and Hadoop home directory -export SPARK_HOME=/path/to/your/spark/home -export HADOOP_HOME=/path/to/your/hadoop/home -# Set user HDFS Root -export HDFS_ROOT=hdfs://your_hostname:8020 - -# == User to customize Spark executor cores and memory == # - -SPARK_MASTER=yarn -SPARK_DRIVER_MEMORY=8G -SPARK_NUM_EXECUTORS=6 -SPARK_EXECUTOR_CORES=15 -SPARK_EXECUTOR_MEMORY=75G - -SPARK_DEFAULT_PARALLELISM=$(expr $SPARK_NUM_EXECUTORS '*' $SPARK_EXECUTOR_CORES '*' 2) - -# ======================================================= # - -# for log suffix -SUFFIX=$( basename -s .sh "${BASH_SOURCE[0]}" ) - -# Check envs -if [[ -z $SPARK_HOME ]]; then - echo SPARK_HOME not defined! - exit 1 -fi - -if [[ -z $HADOOP_HOME ]]; then - echo HADOOP_HOME not defined! 
- exit 1 -fi - -export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop - -APP_JAR=target/oap-mllib-examples-0.9.0-with-spark-3.0.0.jar -APP_CLASS=com.intel.hibench.sparkbench.ml.DenseKMeansDS - -K=200 -INIT_MODE=Random -MAX_ITERATION=20 -INPUT_HDFS=$HDFS_ROOT/HiBench/Kmeans/Input/samples - -/usr/bin/time -p $SPARK_HOME/bin/spark-submit --master $SPARK_MASTER -v \ - --num-executors $SPARK_NUM_EXECUTORS \ - --driver-memory $SPARK_DRIVER_MEMORY \ - --executor-cores $SPARK_EXECUTOR_CORES \ - --executor-memory $SPARK_EXECUTOR_MEMORY \ - --conf "spark.serializer=org.apache.spark.serializer.KryoSerializer" \ - --conf "spark.default.parallelism=$SPARK_DEFAULT_PARALLELISM" \ - --conf "spark.sql.shuffle.partitions=$SPARK_DEFAULT_PARALLELISM" \ - --class $APP_CLASS \ - $APP_JAR \ - -k $K --initMode $INIT_MODE --numIterations $MAX_ITERATION $INPUT_HDFS \ - 2>&1 | tee KMeansHiBench-$SUFFIX-$(date +%m%d_%H_%M_%S).log diff --git a/examples/kmeans-hibench/src/main/scala/com/intel/hibench/sparkbench/ml/DenseKMeansDS.scala b/examples/kmeans-hibench/src/main/scala/com/intel/hibench/sparkbench/ml/DenseKMeansDS.scala deleted file mode 100644 index 3a949bb1c..000000000 --- a/examples/kmeans-hibench/src/main/scala/com/intel/hibench/sparkbench/ml/DenseKMeansDS.scala +++ /dev/null @@ -1,107 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package com.intel.hibench.sparkbench.ml - -import breeze.linalg.DenseVector -import org.apache.hadoop.io.LongWritable -import org.apache.mahout.math.VectorWritable -import org.apache.spark.ml.clustering.KMeans -import org.apache.spark.ml.evaluation.ClusteringEvaluator -import org.apache.spark.ml.linalg.{Vector, Vectors} -import org.apache.spark.{SparkConf, SparkContext} -import org.apache.spark.sql._ -import scopt.OptionParser -import org.apache.spark.sql.SparkSession - -object DenseKMeansDS { - - object InitializationMode extends Enumeration { - type InitializationMode = Value - val Random, Parallel = Value - } - - import com.intel.hibench.sparkbench.ml.DenseKMeansDS.InitializationMode._ - - case class Params(input: String = null, - k: Int = -1, - numIterations: Int = 10, - initializationMode: InitializationMode = Random) - - def main(args: Array[String]) { - val defaultParams = Params() - - val parser = new OptionParser[Params]("DenseKMeans") { - head("DenseKMeans: an example k-means app for dense data.") - opt[Int]('k', "k") - .required() - .text(s"number of clusters, required") - .action((x, c) => c.copy(k = x)) - opt[Int]("numIterations") - .text(s"number of iterations, default; ${defaultParams.numIterations}") - .action((x, c) => c.copy(numIterations = x)) - opt[String]("initMode") - .text(s"initialization mode (${InitializationMode.values.mkString(",")}), " + - s"default: ${defaultParams.initializationMode}") - .action((x, c) => c.copy(initializationMode = InitializationMode.withName(x))) - arg[String]("") - .text("input paths to examples") - .required() - .action((x, c) => c.copy(input = x)) - } - - parser.parse(args, defaultParams).map { params => - run(params) - }.getOrElse { - sys.exit(1) - } - } - - def run(params: Params) { - val spark = SparkSession - .builder - .appName(s"DenseKMeansDS with $params") - .getOrCreate() - import spark.implicits._ - - val sc = spark.sparkContext - - val data = sc.sequenceFile[LongWritable, VectorWritable](params.input) - - // Should use Tuple1 to warp around for calling toDF - val dataset = data.map { case (k, v) => - var vector: Array[Double] = new Array[Double](v.get().size) - for (i <- 0 until v.get().size) vector(i) = v.get().get(i) - Tuple1(Vectors.dense(vector)) - }.toDF("features") - - val initMode = params.initializationMode match { - case Random => "random" - case Parallel => "k-means||" - } - - val model = new KMeans() - .setInitMode(initMode) - .setK(params.k) - .setMaxIter(params.numIterations) - .setSeed(1L) - .fit(dataset) - - spark.stop() - } -} - From bce123d3e24631d8f5a4edf878c54d7443b01ecf Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Wed, 31 Mar 2021 11:43:35 +0800 Subject: [PATCH 02/62] Fix tbb linking --- mllib-dal/src/main/native/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllib-dal/src/main/native/Makefile b/mllib-dal/src/main/native/Makefile index 23222e646..0a81dd05a 100644 --- a/mllib-dal/src/main/native/Makefile +++ b/mllib-dal/src/main/native/Makefile @@ -33,7 +33,7 @@ INCS := -I $(JAVA_HOME)/include \ LIBS := -L${CCL_ROOT}/lib -lccl \ -L$(DAALROOT)/lib/intel64 -l:libdaal_core.a -l:libdaal_thread.a \ - -L$(TBBROOT)/lib -ltbb -ltbbmalloc + -L$(TBBROOT)/lib/intel64/gcc4.8 -ltbb -ltbbmalloc # TODO: Add signal chaining support, should fix linking, package so and loading # -L$(JAVA_HOME)/jre/lib/amd64 -ljsig From 04097cb5365b01f4d602d5c5dbaeeb17019baccc Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Wed, 31 Mar 2021 11:44:23 +0800 Subject: [PATCH 03/62] Add data --- 
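
Note (placed below the ---, so it stays out of the commit message): the three
data files added here match the input formats the example runners in this
series expect. onedal_als_csr_ratings.txt is converted from oneDAL's
implicit_als_csr.csv and uses the user::item::rating layout read by the ALS
examples; sample_kmeans_data.txt is in LIBSVM format (label index:value ...);
pca_data.csv is a plain dense CSV. The data/ directory has to be staged to
HDFS before any example runs. A minimal sketch, assuming HDFS is already up
and configured as in conf/env.sh.template, mirroring what
dev/test-cluster/test-cluster.sh does later in this series (PATCH 09/62):

    # Stage the example datasets into the HDFS home directory
    cd $OAP_MLLIB_ROOT/examples
    hadoop fs -copyFromLocal data
    # Verify the upload before running any example
    hadoop fs -ls data
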
examples/data/onedal_als_csr_ratings.txt | 167 +++++++++++++++++++++++ examples/data/pca_data.csv | 3 + examples/data/sample_kmeans_data.txt | 6 + 3 files changed, 176 insertions(+) create mode 100644 examples/data/onedal_als_csr_ratings.txt create mode 100644 examples/data/pca_data.csv create mode 100644 examples/data/sample_kmeans_data.txt diff --git a/examples/data/onedal_als_csr_ratings.txt b/examples/data/onedal_als_csr_ratings.txt new file mode 100644 index 000000000..e3f5d3fae --- /dev/null +++ b/examples/data/onedal_als_csr_ratings.txt @@ -0,0 +1,167 @@ +0::17::0.938283 +1::1::0.124207 +1::8::0.411504 +1::13::0.746992 +1::16::0.169437 +2::6::0.42021 +2::12::0.322446 +2::17::0.561519 +3::2::0.760795 +4::6::0.252532 +4::8::0.155777 +5::1::0.593943 +5::2::0.5289 +5::9::0.827276 +5::16::0.291741 +6::1::0.642664 +6::9::0.843839 +6::18::0.801948 +7::17::0.162832 +8::15::0.764005 +8::19::0.545399 +9::4::0.871051 +9::8::0.2094 +9::9::0.900738 +9::17::0.998866 +9::19::0.154139 +10::0::0.59703 +10::6::0.727774 +10::15::0.877197 +11::3::0.598636 +11::5::0.999655 +11::7::0.23638 +11::14::0.463678 +11::17::0.802767 +11::18::0.828629 +12::3::0.449008 +12::4::0.108126 +12::5::0.17944 +12::11::0.14992 +12::15::0.645085 +12::17::0.356908 +13::7::0.54838 +13::13::0.719667 +14::1::0.144589 +14::2::0.956232 +14::4::0.410129 +14::5::0.237406 +14::9::0.701227 +15::5::0.598455 +15::13::0.534545 +15::18::0.85741 +16::0::0.08512 +16::1::0.306062 +17::2::0.87395 +17::6::0.680554 +17::15::0.383043 +17::20::0.16813 +18::2::0.641488 +18::4::0.542261 +18::10::0.69714 +18::11::0.776203 +18::17::0.498716 +18::18::0.788093 +18::20::0.52406 +19::4::0.10402 +19::7::0.276732 +20::1::0.666263 +20::6::0.280048 +21::5::0.898574 +21::6::0.892768 +21::9::0.061185 +21::18::0.691028 +22::7::0.813807 +22::16::0.293614 +23::10::0.217541 +23::14::0.98958 +23::15::0.20269 +24::7::0.67432 +24::8::0.520428 +24::10::0.138665 +24::13::0.364809 +24::14::0.970167 +24::19::0.68381 +25::0::0.166145 +25::1::0.194913 +25::2::0.265607 +25::18::0.740052 +25::19::0.209377 +26::2::0.122306 +26::8::0.742562 +26::11::0.405206 +26::16::0.442783 +27::12::0.010994 +27::16::0.632512 +27::17::0.421555 +28::1::0.854519 +28::3::0.843519 +28::7::0.388753 +28::12::0.020689 +28::13::0.071531 +28::14::0.537579 +28::16::0.079456 +29::17::0.548573 +30::1::0.959732 +30::3::0.913432 +30::4::0.88553 +31::0::0.653987 +31::13::0.736684 +31::20::0.629751 +32::2::0.420538 +32::6::0.110444 +32::12::0.55993 +32::13::0.730668 +32::17::0.588223 +32::18::0.188579 +33::8::0.717314 +33::9::0.249797 +33::10::0.404286 +33::18::0.83197 +34::4::0.364628 +34::6::0.023655 +34::10::0.94169 +35::3::0.015393 +35::11::0.356229 +35::18::0.328241 +36::0::0.03866 +36::1::0.21685 +36::5::0.725101 +36::8::0.191972 +36::9::0.658415 +36::12::0.592436 +37::2::0.812225 +37::4::0.411506 +37::6::0.613151 +37::9::0.345352 +37::10::0.89008 +37::12::0.139664 +37::17::0.7633 +37::20::0.488679 +38::0::0.594923 +38::1::0.441561 +38::13::0.467085 +39::0::0.949957 +39::7::0.360488 +39::12::0.354949 +39::15::0.976556 +39::17::0.024024 +40::1::0.121904 +40::4::0.871203 +40::8::0.102956 +41::9::0.593112 +42::3::0.542693 +42::4::0.340404 +42::6::0.997438 +42::7::0.335679 +42::10::0.657767 +42::11::0.382666 +42::14::0.621782 +42::17::0.150028 +43::16::0.318803 +43::17::0.83869 +44::10::0.460685 +45::0::0.926797 +45::4::0.257822 +45::8::0.714351 +45::17::0.333358 +45::18::0.134587 diff --git a/examples/data/pca_data.csv b/examples/data/pca_data.csv new file mode 100644 index 000000000..2b8fac3c4 --- 
/dev/null +++ b/examples/data/pca_data.csv @@ -0,0 +1,3 @@ +0.0,1.0,0.0,7.0,0.0 +2.0,0.0,3.0,4.0,5.0 +4.0,0.0,0.0,6.0,7.0 \ No newline at end of file diff --git a/examples/data/sample_kmeans_data.txt b/examples/data/sample_kmeans_data.txt new file mode 100644 index 000000000..50013776b --- /dev/null +++ b/examples/data/sample_kmeans_data.txt @@ -0,0 +1,6 @@ +0 1:0.0 2:0.0 3:0.0 +1 1:0.1 2:0.1 3:0.1 +2 1:0.2 2:0.2 3:0.2 +3 1:9.0 2:9.0 3:9.0 +4 1:9.1 2:9.1 3:9.1 +5 1:9.2 2:9.2 3:9.2 From b7a3073df1155c79380e50c244e5bda881bf7499 Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Wed, 31 Mar 2021 11:44:51 +0800 Subject: [PATCH 04/62] Add env.sh.template --- conf/env.sh.template | 46 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 conf/env.sh.template diff --git a/conf/env.sh.template b/conf/env.sh.template new file mode 100644 index 000000000..2ff4f447f --- /dev/null +++ b/conf/env.sh.template @@ -0,0 +1,46 @@ +# == OAP MLlib users to customize the following environments for running examples ======= # + +# ============== Minimum Settings ============= # + +# Set OAP MLlib version (e.g. 1.1.0) +OAP_MLLIB_VERSION=x.x.x +# Set Spark master +SPARK_MASTER=yarn +# Set Hadoop home path +export HADOOP_HOME=/path/to/your/hadoop/home +# Set Spark home path +export SPARK_HOME=/path/to/your/spark/home +# Set HDFS Root +export HDFS_ROOT=hdfs://localhost:8020 +# Set OAP MLlib source code root directory +export OAP_MLLIB_ROOT=/path/to/oap-mllib/home + +# ============================================= # + +# Set HADOOP_CONF_DIR for Spark +export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop + +# Set JAR name & path +OAP_MLLIB_JAR_NAME=oap-mllib-$OAP_MLLIB_VERSION.jar +OAP_MLLIB_JAR=$OAP_MLLIB_ROOT/mllib-dal/target/$OAP_MLLIB_JAR_NAME +# Set Spark driver & executor classpaths, +# absolute path for driver, relative path for executor +SPARK_DRIVER_CLASSPATH=$OAP_MLLIB_JAR +SPARK_EXECUTOR_CLASSPATH=./$OAP_MLLIB_JAR_NAME + +# Set Spark resources, can be overwritten in example +SPARK_DRIVER_MEMORY=1G +SPARK_NUM_EXECUTORS=2 +SPARK_EXECUTOR_CORES=1 +SPARK_EXECUTOR_MEMORY=1G +SPARK_DEFAULT_PARALLELISM=$(expr $SPARK_NUM_EXECUTORS '*' $SPARK_EXECUTOR_CORES '*' 2) + +# Checks + +for dir in $SPARK_HOME $HADOOP_HOME $OAP_MLLIB_JAR +do + if [[ ! -e $dir ]]; then + echo $dir does not exist! 
+ exit 1 + fi +done From bdbc4c724373d56fc4a8bafea9d5ab8c36013a72 Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Wed, 31 Mar 2021 11:46:44 +0800 Subject: [PATCH 05/62] Revise examples --- examples/als-pyspark/als-pyspark.py | 2 +- examples/als-pyspark/run.sh | 57 ++---------------------- examples/kmeans-pyspark/run.sh | 57 ++---------------------- examples/kmeans/pom.xml | 3 +- examples/kmeans/run.sh | 60 +++----------------------- examples/pca-pyspark/data/pca_data.csv | 3 -- examples/pca-pyspark/run.sh | 50 ++------------------- examples/pca/build.sh | 3 ++ examples/pca/pom.xml | 3 +- examples/pca/run.sh | 23 +++++++++- 10 files changed, 47 insertions(+), 214 deletions(-) delete mode 100644 examples/pca-pyspark/data/pca_data.csv create mode 100755 examples/pca/build.sh diff --git a/examples/als-pyspark/als-pyspark.py b/examples/als-pyspark/als-pyspark.py index 8847ca2b9..65d257c56 100644 --- a/examples/als-pyspark/als-pyspark.py +++ b/examples/als-pyspark/als-pyspark.py @@ -55,7 +55,7 @@ print("\nALS training with implicitPrefs={}, rank={}, maxIter={}, regParam={}, alpha={}, seed={}\n".format( als.getImplicitPrefs(), als.getRank(), als.getMaxIter(), als.getRegParam(), als.getAlpha(), als.getSeed() )) - model = als.fit(ratings) + model = als.fit(ratings) # Evaluate the model by computing the RMSE on the test data # predictions = model.transform(test) diff --git a/examples/als-pyspark/run.sh b/examples/als-pyspark/run.sh index b3ba1b6d2..044c857f6 100755 --- a/examples/als-pyspark/run.sh +++ b/examples/als-pyspark/run.sh @@ -1,62 +1,14 @@ #!/usr/bin/env bash -# == User to customize the following environments ======= # +source ../../conf/env.sh -# Set user Spark and Hadoop home directory -#export SPARK_HOME=/path/to/your/spark/home -#export HADOOP_HOME=/path/to/your/hadoop/home -# Set user HDFS Root -export HDFS_ROOT=hdfs://sr549:8020 -# Set user Intel MLlib Root directory -export OAP_MLLIB_ROOT=/home/xiaochang/Works/OAP-xwu99-als/oap-mllib -# Set IP and Port for oneCCL KVS, you can select any one of the worker nodes and set CCL_KVS_IP_PORT to its IP and Port -# IP can be got with `hostname -I`, if multiple IPs are returned, the first IP should be used. Port can be any available port. -# For example, if one of the worker IP is 192.168.0.1 and an available port is 51234. -# CCL_KVS_IP_PORT can be set in the format of 192.168.0.1_51234 -# Incorrectly setting this value will result in hanging when oneCCL initialize -export CCL_KVS_IP_PORT=10.0.2.149_51234 - -# Data file is from Spark Examples (data/mllib/sample_kmeans_data.txt), the data file should be copied to HDFS +# Data file is converted from oneDAL examples ($DAALROOT/examples/daal/data/batch/implicit_als_csr.csv) +# The data file should be copied to $HDFS_ROOT before running examples DATA_FILE=data/onedal_als_csr_ratings.txt -# == User to customize Spark executor cores and memory == # - -# User should check the requested resources are acturally allocated by cluster manager or Intel MLlib will behave incorrectly -SPARK_MASTER=yarn -SPARK_DRIVER_MEMORY=1G -SPARK_NUM_EXECUTORS=2 -SPARK_EXECUTOR_CORES=1 -SPARK_EXECUTOR_MEMORY=1G - -SPARK_DEFAULT_PARALLELISM=$(expr $SPARK_NUM_EXECUTORS '*' $SPARK_EXECUTOR_CORES '*' 2) - -# ======================================================= # - -# Check env -if [[ -z $SPARK_HOME ]]; then - echo SPARK_HOME not defined! - exit 1 -fi - -if [[ -z $HADOOP_HOME ]]; then - echo HADOOP_HOME not defined! 
- exit 1 -fi - -export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop - -# Target jar built -OAP_MLLIB_JAR_NAME=oap-mllib-0.9.0-with-spark-3.0.0.jar -OAP_MLLIB_JAR=$OAP_MLLIB_ROOT/mllib-dal/target/$OAP_MLLIB_JAR_NAME - -# Use absolute path -SPARK_DRIVER_CLASSPATH=$OAP_MLLIB_JAR -# Use relative path -SPARK_EXECUTOR_CLASSPATH=./$OAP_MLLIB_JAR_NAME - APP_PY=als-pyspark.py -/usr/bin/time -p $SPARK_HOME/bin/spark-submit --master $SPARK_MASTER -v \ +time $SPARK_HOME/bin/spark-submit --master $SPARK_MASTER -v \ --num-executors $SPARK_NUM_EXECUTORS \ --driver-memory $SPARK_DRIVER_MEMORY \ --executor-cores $SPARK_EXECUTOR_CORES \ @@ -66,7 +18,6 @@ APP_PY=als-pyspark.py --conf "spark.sql.shuffle.partitions=$SPARK_DEFAULT_PARALLELISM" \ --conf "spark.driver.extraClassPath=$SPARK_DRIVER_CLASSPATH" \ --conf "spark.executor.extraClassPath=$SPARK_EXECUTOR_CLASSPATH" \ - --conf "spark.executorEnv.CCL_KVS_IP_PORT=$CCL_KVS_IP_PORT" \ --conf "spark.shuffle.reduceLocality.enabled=false" \ --conf "spark.network.timeout=1200s" \ --conf "spark.task.maxFailures=1" \ diff --git a/examples/kmeans-pyspark/run.sh b/examples/kmeans-pyspark/run.sh index d029bf294..0fa2a7bcb 100755 --- a/examples/kmeans-pyspark/run.sh +++ b/examples/kmeans-pyspark/run.sh @@ -1,62 +1,14 @@ #!/usr/bin/env bash -# == User to customize the following environments ======= # +source ../../conf/env.sh -# Set user Spark and Hadoop home directory -export SPARK_HOME=/path/to/your/spark/home -export HADOOP_HOME=/path/to/your/hadoop/home -# Set user HDFS Root -export HDFS_ROOT=hdfs://your_hostname:8020 -# Set user Intel MLlib Root directory -export OAP_MLLIB_ROOT=/path/to/your/OAP/oap-mllib -# Set IP and Port for oneCCL KVS, you can select any one of the worker nodes and set CCL_KVS_IP_PORT to its IP and Port -# IP can be got with `hostname -I`, if multiple IPs are returned, the first IP should be used. Port can be any available port. -# For example, if one of the worker IP is 192.168.0.1 and an available port is 51234. -# CCL_KVS_IP_PORT can be set in the format of 192.168.0.1_51234 -# Incorrectly setting this value will result in hanging when oneCCL initialize -export CCL_KVS_IP_PORT=192.168.0.1_51234 - -# Data file is from Spark Examples (data/mllib/sample_kmeans_data.txt), the data file should be copied to HDFS +# Data file is from Spark Examples (data/mllib/sample_kmeans_data.txt) and put in examples/data +# The data file should be copied to $HDFS_ROOT before running examples DATA_FILE=data/sample_kmeans_data.txt -# == User to customize Spark executor cores and memory == # - -# User should check the requested resources are acturally allocated by cluster manager or Intel MLlib will behave incorrectly -SPARK_MASTER=yarn -SPARK_DRIVER_MEMORY=1G -SPARK_NUM_EXECUTORS=2 -SPARK_EXECUTOR_CORES=1 -SPARK_EXECUTOR_MEMORY=1G - -SPARK_DEFAULT_PARALLELISM=$(expr $SPARK_NUM_EXECUTORS '*' $SPARK_EXECUTOR_CORES '*' 2) - -# ======================================================= # - -# Check env -if [[ -z $SPARK_HOME ]]; then - echo SPARK_HOME not defined! - exit 1 -fi - -if [[ -z $HADOOP_HOME ]]; then - echo HADOOP_HOME not defined! 
- exit 1 -fi - -export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop - -# Target jar built -OAP_MLLIB_JAR_NAME=oap-mllib-0.9.0-with-spark-3.0.0.jar -OAP_MLLIB_JAR=$OAP_MLLIB_ROOT/mllib-dal/target/$OAP_MLLIB_JAR_NAME - -# Use absolute path -SPARK_DRIVER_CLASSPATH=$OAP_MLLIB_JAR -# Use relative path -SPARK_EXECUTOR_CLASSPATH=./$OAP_MLLIB_JAR_NAME - APP_PY=kmeans-pyspark.py -/usr/bin/time -p $SPARK_HOME/bin/spark-submit --master $SPARK_MASTER -v \ +time $SPARK_HOME/bin/spark-submit --master $SPARK_MASTER -v \ --num-executors $SPARK_NUM_EXECUTORS \ --driver-memory $SPARK_DRIVER_MEMORY \ --executor-cores $SPARK_EXECUTOR_CORES \ @@ -66,7 +18,6 @@ APP_PY=kmeans-pyspark.py --conf "spark.sql.shuffle.partitions=$SPARK_DEFAULT_PARALLELISM" \ --conf "spark.driver.extraClassPath=$SPARK_DRIVER_CLASSPATH" \ --conf "spark.executor.extraClassPath=$SPARK_EXECUTOR_CLASSPATH" \ - --conf "spark.executorEnv.CCL_KVS_IP_PORT=$CCL_KVS_IP_PORT" \ --conf "spark.shuffle.reduceLocality.enabled=false" \ --conf "spark.network.timeout=1200s" \ --conf "spark.task.maxFailures=1" \ diff --git a/examples/kmeans/pom.xml b/examples/kmeans/pom.xml index 71c4ea6d7..476a98ce6 100644 --- a/examples/kmeans/pom.xml +++ b/examples/kmeans/pom.xml @@ -4,7 +4,7 @@ com.intel.oap oap-mllib-examples - 1.1.0-with-spark-3.0.0 + ${oap.version}-with-spark-${spark.version} jar KMeansExample @@ -12,6 +12,7 @@ UTF-8 + 1.1.0 2.12.10 2.12 3.0.0 diff --git a/examples/kmeans/run.sh b/examples/kmeans/run.sh index bb0f9ac78..00b782464 100755 --- a/examples/kmeans/run.sh +++ b/examples/kmeans/run.sh @@ -1,64 +1,15 @@ #!/usr/bin/env bash -# == User to customize the following environments ======= # +source ../../conf/env.sh -# Set user Spark and Hadoop home directory -export SPARK_HOME=/path/to/your/spark/home -export HADOOP_HOME=/path/to/your/hadoop/home -# Set user HDFS Root -export HDFS_ROOT=hdfs://your_hostname:8020 -# Set user Intel MLlib Root directory -export OAP_MLLIB_ROOT=/path/to/your/OAP/oap-mllib -# Set IP and Port for oneCCL KVS, you can select any one of the worker nodes and set CCL_KVS_IP_PORT to its IP and Port -# IP can be got with `hostname -I`, if multiple IPs are returned, the first IP should be used. Port can be any available port. -# For example, if one of the worker IP is 192.168.0.1 and an available port is 51234. -# CCL_KVS_IP_PORT can be set in the format of 192.168.0.1_51234 -# Incorrectly setting this value will result in hanging when oneCCL initialize -export CCL_KVS_IP_PORT=192.168.0.1_51234 - -# Data file is from Spark examples' data (data/mllib/sample_kmeans_data.txt) -# This data file should be copied to HDFS hdfs://your_hostname:8020/user//data/sample_kmeans_data.txt +# Data file is from Spark Examples (data/mllib/sample_kmeans_data.txt) and put in examples/data +# The data file should be copied to $HDFS_ROOT before running examples DATA_FILE=data/sample_kmeans_data.txt -# == User to customize Spark executor cores and memory == # - -# User should check the requested resources are acturally allocated by cluster manager or Intel MLlib will behave incorrectly -SPARK_MASTER=yarn -SPARK_DRIVER_MEMORY=1G -SPARK_NUM_EXECUTORS=2 -SPARK_EXECUTOR_CORES=1 -SPARK_EXECUTOR_MEMORY=1G - -SPARK_DEFAULT_PARALLELISM=$(expr $SPARK_NUM_EXECUTORS '*' $SPARK_EXECUTOR_CORES '*' 2) - -# ======================================================= # - -# Check envs -if [[ -z $SPARK_HOME ]]; then - echo SPARK_HOME not defined! - exit 1 -fi - -if [[ -z $HADOOP_HOME ]]; then - echo HADOOP_HOME not defined! 
- exit 1 -fi - -export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop - -# Target jar built -OAP_MLLIB_JAR_NAME=oap-mllib-0.9.0-with-spark-3.0.0.jar -OAP_MLLIB_JAR=$OAP_MLLIB_ROOT/mllib-dal/target/$OAP_MLLIB_JAR_NAME - -# Use absolute path -SPARK_DRIVER_CLASSPATH=$OAP_MLLIB_JAR -# Use relative path -SPARK_EXECUTOR_CLASSPATH=./$OAP_MLLIB_JAR_NAME - -APP_JAR=target/oap-mllib-examples-0.9.0-with-spark-3.0.0.jar +APP_JAR=target/oap-mllib-examples-$OAP_MLLIB_VERSION-with-spark-3.0.0.jar APP_CLASS=org.apache.spark.examples.ml.KMeansExample -/usr/bin/time -p $SPARK_HOME/bin/spark-submit --master $SPARK_MASTER -v \ +time $SPARK_HOME/bin/spark-submit --master $SPARK_MASTER -v \ --num-executors $SPARK_NUM_EXECUTORS \ --driver-memory $SPARK_DRIVER_MEMORY \ --executor-cores $SPARK_EXECUTOR_CORES \ @@ -68,7 +19,6 @@ APP_CLASS=org.apache.spark.examples.ml.KMeansExample --conf "spark.sql.shuffle.partitions=$SPARK_DEFAULT_PARALLELISM" \ --conf "spark.driver.extraClassPath=$SPARK_DRIVER_CLASSPATH" \ --conf "spark.executor.extraClassPath=$SPARK_EXECUTOR_CLASSPATH" \ - --conf "spark.executorEnv.CCL_KVS_IP_PORT=$CCL_KVS_IP_PORT" \ --conf "spark.shuffle.reduceLocality.enabled=false" \ --conf "spark.network.timeout=1200s" \ --conf "spark.task.maxFailures=1" \ diff --git a/examples/pca-pyspark/data/pca_data.csv b/examples/pca-pyspark/data/pca_data.csv deleted file mode 100644 index 2b8fac3c4..000000000 --- a/examples/pca-pyspark/data/pca_data.csv +++ /dev/null @@ -1,3 +0,0 @@ -0.0,1.0,0.0,7.0,0.0 -2.0,0.0,3.0,4.0,5.0 -4.0,0.0,0.0,6.0,7.0 \ No newline at end of file diff --git a/examples/pca-pyspark/run.sh b/examples/pca-pyspark/run.sh index 20010ce6b..b3776e877 100755 --- a/examples/pca-pyspark/run.sh +++ b/examples/pca-pyspark/run.sh @@ -1,57 +1,15 @@ #!/usr/bin/env bash -# == User to customize the following environments ======= # +source ../../conf/env.sh -# Set user Spark and Hadoop home directory -export SPARK_HOME=/path/to/your/spark/home -export HADOOP_HOME=/path/to/your/hadoop/home -# Set user HDFS Root -export HDFS_ROOT=hdfs://your_hostname:8020 -# Set user Intel MLlib Root directory -export OAP_MLLIB_ROOT=/path/to/your/OAP/oap-mllib - -# CSV data is the same as in Spark example "ml/pca_example.py", the data file should be copied to HDFS +# CSV data is the same as in Spark example "ml/pca_example.py" +# The data file should be copied to $HDFS_ROOT before running examples DATA_FILE=data/pca_data.csv -# == User to customize Spark executor cores and memory == # - -# User should check the requested resources are acturally allocated by cluster manager or Intel MLlib will behave incorrectly -SPARK_MASTER=yarn -SPARK_DRIVER_MEMORY=1G -SPARK_NUM_EXECUTORS=2 -SPARK_EXECUTOR_CORES=1 -SPARK_EXECUTOR_MEMORY=1G - -SPARK_DEFAULT_PARALLELISM=$(expr $SPARK_NUM_EXECUTORS '*' $SPARK_EXECUTOR_CORES '*' 2) - -# ======================================================= # - -# Check env -if [[ -z $SPARK_HOME ]]; then - echo SPARK_HOME not defined! - exit 1 -fi - -if [[ -z $HADOOP_HOME ]]; then - echo HADOOP_HOME not defined! 
- exit 1 -fi - -export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop - -# Target jar built -OAP_MLLIB_JAR_NAME=oap-mllib-0.9.0-with-spark-3.0.0.jar -OAP_MLLIB_JAR=$OAP_MLLIB_ROOT/mllib-dal/target/$OAP_MLLIB_JAR_NAME - -# Use absolute path -SPARK_DRIVER_CLASSPATH=$OAP_MLLIB_JAR -# Use relative path -SPARK_EXECUTOR_CLASSPATH=./$OAP_MLLIB_JAR_NAME - APP_PY=pca-pyspark.py K=3 -/usr/bin/time -p $SPARK_HOME/bin/spark-submit --master $SPARK_MASTER -v \ +time $SPARK_HOME/bin/spark-submit --master $SPARK_MASTER -v \ --num-executors $SPARK_NUM_EXECUTORS \ --driver-memory $SPARK_DRIVER_MEMORY \ --executor-cores $SPARK_EXECUTOR_CORES \ diff --git a/examples/pca/build.sh b/examples/pca/build.sh new file mode 100755 index 000000000..da373645b --- /dev/null +++ b/examples/pca/build.sh @@ -0,0 +1,3 @@ +#!/usr/bin/env bash + +mvn clean package diff --git a/examples/pca/pom.xml b/examples/pca/pom.xml index 06cf2343c..75641da81 100644 --- a/examples/pca/pom.xml +++ b/examples/pca/pom.xml @@ -4,7 +4,7 @@ com.intel.oap oap-mllib-examples - 1.1.0-with-spark-3.0.0 + ${oap.version}-with-spark-${spark.version} jar PCAExample @@ -12,6 +12,7 @@ UTF-8 + 1.1.0 2.12.10 2.12 3.0.0 diff --git a/examples/pca/run.sh b/examples/pca/run.sh index da373645b..64d22c6fe 100755 --- a/examples/pca/run.sh +++ b/examples/pca/run.sh @@ -1,3 +1,24 @@ #!/usr/bin/env bash -mvn clean package +source ../../conf/env.sh + +APP_JAR=target/oap-mllib-examples-$OAP_MLLIB_VERSION-with-spark-3.0.0.jar +APP_CLASS=org.apache.spark.examples.ml.PCAExample + +time $SPARK_HOME/bin/spark-submit --master $SPARK_MASTER -v \ + --num-executors $SPARK_NUM_EXECUTORS \ + --driver-memory $SPARK_DRIVER_MEMORY \ + --executor-cores $SPARK_EXECUTOR_CORES \ + --executor-memory $SPARK_EXECUTOR_MEMORY \ + --conf "spark.serializer=org.apache.spark.serializer.KryoSerializer" \ + --conf "spark.default.parallelism=$SPARK_DEFAULT_PARALLELISM" \ + --conf "spark.sql.shuffle.partitions=$SPARK_DEFAULT_PARALLELISM" \ + --conf "spark.driver.extraClassPath=$SPARK_DRIVER_CLASSPATH" \ + --conf "spark.executor.extraClassPath=$SPARK_EXECUTOR_CLASSPATH" \ + --conf "spark.shuffle.reduceLocality.enabled=false" \ + --conf "spark.network.timeout=1200s" \ + --conf "spark.task.maxFailures=1" \ + --jars $OAP_MLLIB_JAR \ + --class $APP_CLASS \ + $APP_JAR $DATA_FILE $K \ + 2>&1 | tee PCA-$(date +%m%d_%H_%M_%S).log From 723a05af19a5bc1df7e39586f036bf53a28aa3b0 Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Wed, 31 Mar 2021 12:08:47 +0800 Subject: [PATCH 06/62] Add ALS scala and modify als-pyspark.py --- examples/als-pyspark/als-pyspark.py | 18 ++-- examples/als/build.sh | 4 + examples/als/pom.xml | 94 +++++++++++++++++++ examples/als/run.sh | 28 ++++++ .../apache/spark/examples/ml/ALSExample.scala | 91 ++++++++++++++++++ 5 files changed, 226 insertions(+), 9 deletions(-) create mode 100755 examples/als/build.sh create mode 100644 examples/als/pom.xml create mode 100755 examples/als/run.sh create mode 100644 examples/als/src/main/scala/org/apache/spark/examples/ml/ALSExample.scala diff --git a/examples/als-pyspark/als-pyspark.py b/examples/als-pyspark/als-pyspark.py index 65d257c56..12622a4fd 100644 --- a/examples/als-pyspark/als-pyspark.py +++ b/examples/als-pyspark/als-pyspark.py @@ -44,8 +44,8 @@ parts = lines.map(lambda row: row.value.split("::")) ratingsRDD = parts.map(lambda p: Row(userId=int(p[0]), movieId=int(p[1]), rating=float(p[2]))) - ratings = spark.createDataFrame(ratingsRDD) - # (training, test) = ratings.randomSplit([0.8, 0.2]) + ratings = spark.createDataFrame(ratingsRDD) + 
(training, test) = ratings.randomSplit([0.8, 0.2]) # Build the recommendation model using ALS on the training data # Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics @@ -55,13 +55,13 @@ print("\nALS training with implicitPrefs={}, rank={}, maxIter={}, regParam={}, alpha={}, seed={}\n".format( als.getImplicitPrefs(), als.getRank(), als.getMaxIter(), als.getRegParam(), als.getAlpha(), als.getSeed() )) - model = als.fit(ratings) + model = als.fit(training) - # Evaluate the model by computing the RMSE on the test data - # predictions = model.transform(test) - # evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", - # predictionCol="prediction") - # rmse = evaluator.evaluate(predictions) - # print("Root-mean-square error = " + str(rmse)) + # Evaluate the model by computing the RMSE on the test data + predictions = model.transform(test) + evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", + predictionCol="prediction") + rmse = evaluator.evaluate(predictions) + print("Root-mean-square error = " + str(rmse)) spark.stop() diff --git a/examples/als/build.sh b/examples/als/build.sh new file mode 100755 index 000000000..3c01d1689 --- /dev/null +++ b/examples/als/build.sh @@ -0,0 +1,4 @@ +#!/usr/bin/env bash + +mvn clean package + diff --git a/examples/als/pom.xml b/examples/als/pom.xml new file mode 100644 index 000000000..94a72189a --- /dev/null +++ b/examples/als/pom.xml @@ -0,0 +1,94 @@ + + 4.0.0 + + com.intel.oap + oap-mllib-examples + ${oap.version}-with-spark-${spark.version} + jar + + ALSExample + https://github.com/oap-project/oap-mllib.git + + + UTF-8 + 1.1.0 + 2.12.10 + 2.12 + 3.0.0 + + + + + + org.scala-lang + scala-library + 2.12.10 + + + + com.github.scopt + scopt_2.12 + 3.7.0 + + + + org.apache.spark + spark-sql_2.12 + ${spark.version} + provided + + + + org.apache.spark + spark-mllib_2.12 + ${spark.version} + provided + + + + + + + + org.scala-tools + maven-scala-plugin + 2.15.2 + + + + compile + testCompile + + + + + ${scala.version} + + -target:jvm-1.8 + + + + + maven-assembly-plugin + 3.0.0 + + false + + jar-with-dependencies + + + + + assembly + package + + single + + + + + + + + diff --git a/examples/als/run.sh b/examples/als/run.sh new file mode 100755 index 000000000..8a317dbc7 --- /dev/null +++ b/examples/als/run.sh @@ -0,0 +1,28 @@ +#!/usr/bin/env bash + +source ../../conf/env.sh + +# Data file is converted from oneDAL examples ($DAALROOT/examples/daal/data/batch/implicit_als_csr.csv) +# The data file should be copied to $HDFS_ROOT before running examples +DATA_FILE=data/onedal_als_csr_ratings.txt + +APP_JAR=target/oap-mllib-examples-$OAP_MLLIB_VERSION-with-spark-3.0.0.jar +APP_CLASS=org.apache.spark.examples.ml.ALSExample + +time $SPARK_HOME/bin/spark-submit --master $SPARK_MASTER -v \ + --num-executors $SPARK_NUM_EXECUTORS \ + --driver-memory $SPARK_DRIVER_MEMORY \ + --executor-cores $SPARK_EXECUTOR_CORES \ + --executor-memory $SPARK_EXECUTOR_MEMORY \ + --conf "spark.serializer=org.apache.spark.serializer.KryoSerializer" \ + --conf "spark.default.parallelism=$SPARK_DEFAULT_PARALLELISM" \ + --conf "spark.sql.shuffle.partitions=$SPARK_DEFAULT_PARALLELISM" \ + --conf "spark.driver.extraClassPath=$SPARK_DRIVER_CLASSPATH" \ + --conf "spark.executor.extraClassPath=$SPARK_EXECUTOR_CLASSPATH" \ + --conf "spark.shuffle.reduceLocality.enabled=false" \ + --conf "spark.network.timeout=1200s" \ + --conf "spark.task.maxFailures=1" \ + --jars $OAP_MLLIB_JAR \ + --class $APP_CLASS \ + $APP_JAR $DATA_FILE \ + 2>&1 | 
tee ALS-$(date +%m%d_%H_%M_%S).log diff --git a/examples/als/src/main/scala/org/apache/spark/examples/ml/ALSExample.scala b/examples/als/src/main/scala/org/apache/spark/examples/ml/ALSExample.scala new file mode 100644 index 000000000..1071e906b --- /dev/null +++ b/examples/als/src/main/scala/org/apache/spark/examples/ml/ALSExample.scala @@ -0,0 +1,91 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// scalastyle:off println +package org.apache.spark.examples.ml + +// $example on$ +import org.apache.spark.ml.evaluation.RegressionEvaluator +import org.apache.spark.ml.recommendation.ALS +// $example off$ +import org.apache.spark.sql.SparkSession + +/** + * An example demonstrating ALS. + * Run with + * {{{ + * bin/run-example ml.ALSExample + * }}} + */ +object ALSExample { + + // $example on$ + case class Rating(userId: Int, movieId: Int, rating: Float) + def parseRating(str: String): Rating = { + val fields = str.split("::") + assert(fields.size == 3) + Rating(fields(0).toInt, fields(1).toInt, fields(2).toFloat) + } + // $example off$ + + def main(args: Array[String]): Unit = { + val spark = SparkSession + .builder + .appName("ALSExample") + .getOrCreate() + import spark.implicits._ + + if (args.length != 1) { + println("Require data file path as input parameter") + sys.exit(1) + } + + // $example on$ + val ratings = spark.read.textFile(args(0)) + .map(parseRating) + .toDF() + val Array(training, test) = ratings.randomSplit(Array(0.8, 0.2)) + + // Build the recommendation model using ALS on the training data + val als = new ALS() + .setImplicitPrefs(true) + .setRank(10) + .setMaxIter(5) + .setRegParam(0.01) + .setAlpha(40.0) + .setUserCol("userId") + .setItemCol("movieId") + .setRatingCol("rating") + val model = als.fit(training) + + // Evaluate the model by computing the RMSE on the test data + // Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics + model.setColdStartStrategy("drop") + val predictions = model.transform(test) + + val evaluator = new RegressionEvaluator() + .setMetricName("rmse") + .setLabelCol("rating") + .setPredictionCol("prediction") + val rmse = evaluator.evaluate(predictions) + println(s"Root-mean-square error = $rmse") + + spark.stop() + } +} +// scalastyle:on println + From bf00ffdecf7641b5d98afc48c81a9850a0432fc9 Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Wed, 31 Mar 2021 12:20:59 +0800 Subject: [PATCH 07/62] nit --- conf/env.sh.template | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/env.sh.template b/conf/env.sh.template index 2ff4f447f..ddcc84665 100644 --- a/conf/env.sh.template +++ b/conf/env.sh.template @@ -10,7 +10,7 @@ SPARK_MASTER=yarn export HADOOP_HOME=/path/to/your/hadoop/home # Set Spark home path export 
SPARK_HOME=/path/to/your/spark/home -# Set HDFS Root +# Set HDFS Root, should be hdfs://xxx or file://xxx export HDFS_ROOT=hdfs://localhost:8020 # Set OAP MLlib source code root directory export OAP_MLLIB_ROOT=/path/to/oap-mllib/home From 05681cc4845d82c4ee899b80f56aece36198d389 Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Wed, 31 Mar 2021 13:36:30 +0800 Subject: [PATCH 08/62] Add build-all & run-all --- examples/build-all.sh | 8 ++++++++ examples/run-all-pyspark.sh | 8 ++++++++ examples/run-all-scala.sh | 8 ++++++++ 3 files changed, 24 insertions(+) create mode 100755 examples/build-all.sh create mode 100755 examples/run-all-pyspark.sh create mode 100755 examples/run-all-scala.sh diff --git a/examples/build-all.sh b/examples/build-all.sh new file mode 100755 index 000000000..62a95d255 --- /dev/null +++ b/examples/build-all.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash + +for dir in kmeans pca als +do + cd $dir + ./build.sh + cd .. +done diff --git a/examples/run-all-pyspark.sh b/examples/run-all-pyspark.sh new file mode 100755 index 000000000..06decd242 --- /dev/null +++ b/examples/run-all-pyspark.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash + +for dir in kmeans-pyspark pca-pyspark als-pyspark +do + cd $dir + ./run.sh + cd .. +done diff --git a/examples/run-all-scala.sh b/examples/run-all-scala.sh new file mode 100755 index 000000000..65636c250 --- /dev/null +++ b/examples/run-all-scala.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash + +for dir in kmeans pca als +do + cd $dir + ./run.sh + cd .. +done From 8717d3a57d3231dd3d9d1aa5254c958b140802e6 Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Wed, 31 Mar 2021 15:45:48 +0800 Subject: [PATCH 09/62] remove test-cluster/workloads and use examples for validation --- .github/workflows/oap-mllib-ci.yml | 1 + dev/test-cluster/env.sh | 46 ++++++++++++ dev/test-cluster/envs.sh | 22 ------ dev/test-cluster/test-cluster.sh | 14 ++++ dev/test-cluster/workloads/kmeans-pyspark.py | 70 ------------------- .../workloads/run-kmeans-pyspark.sh | 48 ------------- 6 files changed, 61 insertions(+), 140 deletions(-) create mode 100644 dev/test-cluster/env.sh delete mode 100644 dev/test-cluster/envs.sh create mode 100755 dev/test-cluster/test-cluster.sh delete mode 100644 dev/test-cluster/workloads/kmeans-pyspark.py delete mode 100755 dev/test-cluster/workloads/run-kmeans-pyspark.sh diff --git a/.github/workflows/oap-mllib-ci.yml b/.github/workflows/oap-mllib-ci.yml index 2c6973321..1eea5265d 100644 --- a/.github/workflows/oap-mllib-ci.yml +++ b/.github/workflows/oap-mllib-ci.yml @@ -40,3 +40,4 @@ jobs: source /tmp/oneCCL/build/_install/env/setvars.sh # temp disable and will enable for new release of oneCCL #./build.sh + ${{github.workspace}}/dev/test-cluster.sh diff --git a/dev/test-cluster/env.sh b/dev/test-cluster/env.sh new file mode 100644 index 000000000..f2c2afa57 --- /dev/null +++ b/dev/test-cluster/env.sh @@ -0,0 +1,46 @@ +# == OAP MLlib users to customize the following environments for running examples ======= # + +# ============== Minimum Settings ============= # + +# Set OAP MLlib version (e.g. 
1.1.0) +OAP_MLLIB_VERSION=1.1.0 +# Set Spark master +SPARK_MASTER=yarn +# Set Hadoop home path +export HADOOP_HOME=$HADOOP_HOME +# Set Spark home path +export SPARK_HOME=$SPARK_HOME +# Set HDFS Root, should be hdfs://xxx or file://xxx +export HDFS_ROOT=hdfs://localhost:8020 +# Set OAP MLlib source code root directory +export OAP_MLLIB_ROOT=/home/xiaochang/Works/oap-mllib-xwu99-refactor-examples + +# ============================================= # + +# Set HADOOP_CONF_DIR for Spark +export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop + +# Set JAR name & path +OAP_MLLIB_JAR_NAME=oap-mllib-$OAP_MLLIB_VERSION.jar +OAP_MLLIB_JAR=$OAP_MLLIB_ROOT/mllib-dal/target/$OAP_MLLIB_JAR_NAME +# Set Spark driver & executor classpaths, +# absolute path for driver, relative path for executor +SPARK_DRIVER_CLASSPATH=$OAP_MLLIB_JAR +SPARK_EXECUTOR_CLASSPATH=./$OAP_MLLIB_JAR_NAME + +# Set Spark resources, can be overwritten in example +SPARK_DRIVER_MEMORY=1G +SPARK_NUM_EXECUTORS=2 +SPARK_EXECUTOR_CORES=1 +SPARK_EXECUTOR_MEMORY=1G +SPARK_DEFAULT_PARALLELISM=$(expr $SPARK_NUM_EXECUTORS '*' $SPARK_EXECUTOR_CORES '*' 2) + +# Checks + +for dir in $SPARK_HOME $HADOOP_HOME $OAP_MLLIB_JAR +do + if [[ ! -e $dir ]]; then + echo $dir does not exist! + exit 1 + fi +done diff --git a/dev/test-cluster/envs.sh b/dev/test-cluster/envs.sh deleted file mode 100644 index 71e8506e6..000000000 --- a/dev/test-cluster/envs.sh +++ /dev/null @@ -1,22 +0,0 @@ -# Set user Spark and Hadoop home directory -export HADOOP_HOME=~/opt/hadoop-2.7.7 -export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop -export SPARK_HOME=~/opt/spark-3.0.0-bin-hadoop2.7 - -export PYTHONPATH=$SPARK_HOME/python:$PYTHONPATH -export PYSPARK_PYTHON=python3 - -# Set user HDFS Root -export HDFS_ROOT=hdfs://localhost:8020 -export OAP_MLLIB_DATA_ROOT=OAPMLlib/Data -# Set user Intel MLlib Root directory -export OAP_MLLIB_ROOT=${GITHUB_WORKSPACE} - -# Target jar built -OAP_MLLIB_JAR_NAME=oap-mllib-1.1.0.jar -OAP_MLLIB_JAR=$OAP_MLLIB_ROOT/mllib-dal/target/$OAP_MLLIB_JAR_NAME - -# Use absolute path -SPARK_DRIVER_CLASSPATH=$OAP_MLLIB_JAR -# Use relative path -SPARK_EXECUTOR_CLASSPATH=./$OAP_MLLIB_JAR_NAME diff --git a/dev/test-cluster/test-cluster.sh b/dev/test-cluster/test-cluster.sh new file mode 100755 index 000000000..3152cccda --- /dev/null +++ b/dev/test-cluster/test-cluster.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env bash + +cd $GITHUB_WORKSPACE + +source ./env.sh + +cd examples + +hadoop fs -copyFromLocal data +hadoop fs -ls data + +./build.sh +./run-all-scala.sh +./run-all-pyspark.sh diff --git a/dev/test-cluster/workloads/kmeans-pyspark.py b/dev/test-cluster/workloads/kmeans-pyspark.py deleted file mode 100644 index cf93e6034..000000000 --- a/dev/test-cluster/workloads/kmeans-pyspark.py +++ /dev/null @@ -1,70 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# - -""" -An example demonstrating k-means clustering. -Run with: - bin/spark-submit examples/src/main/python/ml/kmeans_example.py - -This example requires NumPy (http://www.numpy.org/). -""" -from __future__ import print_function -import sys - -# $example on$ -from pyspark.ml.clustering import KMeans -from pyspark.ml.evaluation import ClusteringEvaluator -# $example off$ - -from pyspark.sql import SparkSession - -if __name__ == "__main__": - spark = SparkSession\ - .builder\ - .appName("KMeansExample")\ - .getOrCreate() - - if (len(sys.argv) != 2) : - println("Require data file path as input parameter") - sys.exit(1) - - # $example on$ - # Loads data. - dataset = spark.read.format("libsvm").load(sys.argv[1]) - - # Trains a k-means model. - kmeans = KMeans().setK(2).setSeed(1) - model = kmeans.fit(dataset) - - # Make predictions - predictions = model.transform(dataset) - - # Evaluate clustering by computing Silhouette score - evaluator = ClusteringEvaluator() - - silhouette = evaluator.evaluate(predictions) - print("Silhouette with squared euclidean distance = " + str(silhouette)) - - # Shows the result. - centers = model.clusterCenters() - print("Cluster Centers: ") - for center in centers: - print(center) - # $example off$ - - spark.stop() - diff --git a/dev/test-cluster/workloads/run-kmeans-pyspark.sh b/dev/test-cluster/workloads/run-kmeans-pyspark.sh deleted file mode 100755 index e07f3f7b6..000000000 --- a/dev/test-cluster/workloads/run-kmeans-pyspark.sh +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/env bash - -source ../envs.sh - -# Data file is from Spark Examples (data/mllib/sample_kmeans_data.txt), the data file should be copied to HDFS -$HADOOP_HOME/bin/hadoop fs -mkdir -p $OAP_MLLIB_DATA_ROOT -$HADOOP_HOME/bin/hadoop fs -copyFromLocal $SPARK_HOME/data/mllib/sample_kmeans_data.txt $OAP_MLLIB_DATA_ROOT - -# User should check the requested resources are acturally allocated by cluster manager or Intel MLlib will behave incorrectly -SPARK_MASTER=yarn -SPARK_DRIVER_MEMORY=1G -SPARK_NUM_EXECUTORS=2 -SPARK_EXECUTOR_CORES=1 -SPARK_EXECUTOR_MEMORY=1G - -SPARK_DEFAULT_PARALLELISM=$(expr $SPARK_NUM_EXECUTORS '*' $SPARK_EXECUTOR_CORES '*' 2) - -# ======================================================= # - -# Check env -if [[ -z $SPARK_HOME ]]; then - echo SPARK_HOME not defined! - exit 1 -fi - -if [[ -z $HADOOP_HOME ]]; then - echo HADOOP_HOME not defined! 
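
The one-variable-at-a-time guards seen throughout these scripts (`if [[ -z $SPARK_HOME ]] ... exit 1`, repeated for HADOOP_HOME and others) are copy-pasted into almost every runner in the series. A small helper keeps the checks in one place; this is a sketch, and `require_env` is a made-up name, not something the series defines.

    #!/usr/bin/env bash
    # Fail fast when any required environment variable is unset or empty.
    require_env() {
        local name missing=0
        for name in "$@"; do
            if [[ -z "${!name:-}" ]]; then   # ${!name} is bash indirect expansion
                echo "$name not defined!" >&2
                missing=1
            fi
        done
        (( missing == 0 )) || exit 1
    }

    require_env SPARK_HOME HADOOP_HOME
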
- exit 1 -fi - -APP_PY="$OAP_MLLIB_ROOT/dev/test-cluster/workloads/kmeans-pyspark.py" -DATA_FILE=$OAP_MLLIB_DATA_ROOT/sample_kmeans_data.txt - -$SPARK_HOME/bin/spark-submit --master $SPARK_MASTER -v \ - --num-executors $SPARK_NUM_EXECUTORS \ - --driver-memory $SPARK_DRIVER_MEMORY \ - --executor-cores $SPARK_EXECUTOR_CORES \ - --executor-memory $SPARK_EXECUTOR_MEMORY \ - --conf "spark.serializer=org.apache.spark.serializer.KryoSerializer" \ - --conf "spark.default.parallelism=$SPARK_DEFAULT_PARALLELISM" \ - --conf "spark.sql.shuffle.partitions=$SPARK_DEFAULT_PARALLELISM" \ - --conf "spark.driver.extraClassPath=$SPARK_DRIVER_CLASSPATH" \ - --conf "spark.executor.extraClassPath=$SPARK_EXECUTOR_CLASSPATH" \ - --conf "spark.shuffle.reduceLocality.enabled=false" \ - --conf "spark.network.timeout=1200s" \ - --conf "spark.task.maxFailures=1" \ - --jars $OAP_MLLIB_JAR \ - $APP_PY $DATA_FILE From 1eb3eb3c5445ac8e3c231b85c8acf1ac74ec111b Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Wed, 31 Mar 2021 15:59:53 +0800 Subject: [PATCH 10/62] add setup-python3 and setup-cluster, fix paths --- .github/workflows/oap-mllib-ci.yml | 2 +- dev/test-cluster/test-cluster.sh | 12 ++++++++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/.github/workflows/oap-mllib-ci.yml b/.github/workflows/oap-mllib-ci.yml index 1eea5265d..bbe59f335 100644 --- a/.github/workflows/oap-mllib-ci.yml +++ b/.github/workflows/oap-mllib-ci.yml @@ -40,4 +40,4 @@ jobs: source /tmp/oneCCL/build/_install/env/setvars.sh # temp disable and will enable for new release of oneCCL #./build.sh - ${{github.workspace}}/dev/test-cluster.sh + ${{github.workspace}}/dev/test-cluster/test-cluster.sh diff --git a/dev/test-cluster/test-cluster.sh b/dev/test-cluster/test-cluster.sh index 3152cccda..6ad9bdfa0 100755 --- a/dev/test-cluster/test-cluster.sh +++ b/dev/test-cluster/test-cluster.sh @@ -1,14 +1,18 @@ #!/usr/bin/env bash -cd $GITHUB_WORKSPACE +# Setup Python3 and Spark cluster +cd $GITHUB_WORKSPACE/dev/test-cluster +./setup-python3-env.sh +./setup-cluster.sh -source ./env.sh +# Build and run all examples +source $GITHUB_WORKSPACE/dev/test-cluster/env.sh -cd examples +cd $GITHUB_WORKSPACE/examples hadoop fs -copyFromLocal data hadoop fs -ls data -./build.sh +./build-all.sh ./run-all-scala.sh ./run-all-pyspark.sh From 6310e2898a38279bab017858fbf34918f9c74abc Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Wed, 31 Mar 2021 16:34:58 +0800 Subject: [PATCH 11/62] fix java home --- dev/test-cluster/hadoop-env.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/dev/test-cluster/hadoop-env.sh b/dev/test-cluster/hadoop-env.sh index bee6c1f69..f60b65a0b 100755 --- a/dev/test-cluster/hadoop-env.sh +++ b/dev/test-cluster/hadoop-env.sh @@ -22,8 +22,7 @@ # remote nodes. # The java implementation to use. -# export JAVA_HOME=${JAVA_HOME} -export JAVA_HOME=/usr/local/lib/jvm/openjdk8 +export JAVA_HOME=${JAVA_HOME} # The jsvc implementation to use. 
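
The hadoop-env.sh change in this patch stops hardcoding JAVA_HOME and inherits it from the caller's environment instead. When the environment does not export it, one portable fallback is to derive it from whatever `java` is on the PATH; a sketch, assuming `java` resolves to a real JDK and GNU `readlink -f` is available (true on the Ubuntu runners this CI targets), not something the patch itself adds.

    # Fallback when JAVA_HOME is not exported: resolve the java launcher's
    # real path and strip the trailing /bin/java.
    if [[ -z "${JAVA_HOME:-}" ]]; then
        JAVA_HOME="$(dirname "$(dirname "$(readlink -f "$(command -v java)")")")"
        export JAVA_HOME
    fi
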
Jsvc is required to run secure datanodes # that bind to privileged ports to provide authentication of data transfer From e904f38956694ffe631c2d329e66bd64d351fc65 Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Wed, 31 Mar 2021 20:22:16 +0800 Subject: [PATCH 12/62] add config ssh --- dev/test-cluster/test-cluster.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/dev/test-cluster/test-cluster.sh b/dev/test-cluster/test-cluster.sh index 6ad9bdfa0..37892de37 100755 --- a/dev/test-cluster/test-cluster.sh +++ b/dev/test-cluster/test-cluster.sh @@ -3,6 +3,7 @@ # Setup Python3 and Spark cluster cd $GITHUB_WORKSPACE/dev/test-cluster ./setup-python3-env.sh +./config-ssh.sh ./setup-cluster.sh # Build and run all examples From b118779bfee5d71e593dfee5404bea8ca9492732 Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Wed, 31 Mar 2021 21:00:35 +0800 Subject: [PATCH 13/62] add config ssh --- dev/test-cluster/test-cluster.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/dev/test-cluster/test-cluster.sh b/dev/test-cluster/test-cluster.sh index 37892de37..0320f38b2 100755 --- a/dev/test-cluster/test-cluster.sh +++ b/dev/test-cluster/test-cluster.sh @@ -1,10 +1,12 @@ #!/usr/bin/env bash +set -x + # Setup Python3 and Spark cluster cd $GITHUB_WORKSPACE/dev/test-cluster -./setup-python3-env.sh ./config-ssh.sh ./setup-cluster.sh +./setup-python3-env.sh # Build and run all examples source $GITHUB_WORKSPACE/dev/test-cluster/env.sh From f1310dbf8c6ee01d99ff271cbfb0ce9114903b06 Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Wed, 31 Mar 2021 21:13:13 +0800 Subject: [PATCH 14/62] add config ssh --- .github/workflows/oap-mllib-ci.yml | 30 +++++++++++++++--------------- dev/test-cluster/config-ssh.sh | 1 + dev/test-cluster/setup-cluster.sh | 4 ++-- 3 files changed, 18 insertions(+), 17 deletions(-) diff --git a/.github/workflows/oap-mllib-ci.yml b/.github/workflows/oap-mllib-ci.yml index bbe59f335..16264c9a4 100644 --- a/.github/workflows/oap-mllib-ci.yml +++ b/.github/workflows/oap-mllib-ci.yml @@ -23,21 +23,21 @@ jobs: key: ${{ runner.os }}-${{ hashFiles('**/pom.xml', '{{github.workspace}}/dev/install-build-deps-ubuntu.sh') }} restore-keys: | ${{ runner.os }}- - - name: Set up dependencies - run: | - [ -d ~/downloads ] || mkdir ~/downloads - cd ~/downloads - [ -f spark-3.0.0-bin-hadoop2.7.tgz ] || wget http://archive.apache.org/dist/spark/spark-3.0.0/spark-3.0.0-bin-hadoop2.7.tgz - [ -d spark-3.0.0-bin-hadoop2.7 ] || cd ~ && tar -zxf downloads/spark-3.0.0-bin-hadoop2.7.tgz - export SPARK_HOME=~/spark-3.0.0-bin-hadoop2.7 - ${{github.workspace}}/dev/install-build-deps-ubuntu.sh + # - name: Set up dependencies + # run: | + # [ -d ~/downloads ] || mkdir ~/downloads + # cd ~/downloads + # [ -f spark-3.0.0-bin-hadoop2.7.tgz ] || wget http://archive.apache.org/dist/spark/spark-3.0.0/spark-3.0.0-bin-hadoop2.7.tgz + # [ -d spark-3.0.0-bin-hadoop2.7 ] || cd ~ && tar -zxf downloads/spark-3.0.0-bin-hadoop2.7.tgz + # export SPARK_HOME=~/spark-3.0.0-bin-hadoop2.7 + # ${{github.workspace}}/dev/install-build-deps-ubuntu.sh - name: Build and Test run: | - cd ${{github.workspace}}/mllib-dal - export ONEAPI_ROOT=/opt/intel/oneapi - source /opt/intel/oneapi/dal/latest/env/vars.sh - source /opt/intel/oneapi/tbb/latest/env/vars.sh - source /tmp/oneCCL/build/_install/env/setvars.sh - # temp disable and will enable for new release of oneCCL - #./build.sh + # cd ${{github.workspace}}/mllib-dal + # export ONEAPI_ROOT=/opt/intel/oneapi + # source /opt/intel/oneapi/dal/latest/env/vars.sh + # source 
/opt/intel/oneapi/tbb/latest/env/vars.sh + # source /tmp/oneCCL/build/_install/env/setvars.sh + # # temp disable and will enable for new release of oneCCL + # #./build.sh ${{github.workspace}}/dev/test-cluster/test-cluster.sh diff --git a/dev/test-cluster/config-ssh.sh b/dev/test-cluster/config-ssh.sh index d093fa17a..70ada1027 100755 --- a/dev/test-cluster/config-ssh.sh +++ b/dev/test-cluster/config-ssh.sh @@ -1,5 +1,6 @@ #!/usr/bin/env bash +mkdir ~/.ssh ssh-keygen -q -N "" -t rsa -f ~/.ssh/id_rsa cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys echo " StrictHostKeyChecking no " | sudo tee -a /etc/ssh/ssh_config diff --git a/dev/test-cluster/setup-cluster.sh b/dev/test-cluster/setup-cluster.sh index eea058f80..357912e77 100755 --- a/dev/test-cluster/setup-cluster.sh +++ b/dev/test-cluster/setup-cluster.sh @@ -8,9 +8,9 @@ echo JAVA_HOME is $JAVA_HOME mkdir ~/opt cd ~/opt -wget https://archive.apache.org/dist/spark/spark-3.0.0/spark-3.0.0-bin-hadoop2.7.tgz +wget --no-verbose https://archive.apache.org/dist/spark/spark-3.0.0/spark-3.0.0-bin-hadoop2.7.tgz tar -xzf spark-3.0.0-bin-hadoop2.7.tgz -wget https://archive.apache.org/dist/hadoop/core/hadoop-2.7.7/hadoop-2.7.7.tar.gz +wget --no-verbose https://archive.apache.org/dist/hadoop/core/hadoop-2.7.7/hadoop-2.7.7.tar.gz tar -xzf hadoop-2.7.7.tar.gz cd $WORK_DIR From ef919c236baa9a890ed10a702acbcb07f4021a17 Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Wed, 31 Mar 2021 21:20:46 +0800 Subject: [PATCH 15/62] add config ssh --- dev/test-cluster/config-ssh.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/dev/test-cluster/config-ssh.sh b/dev/test-cluster/config-ssh.sh index 70ada1027..09debed06 100755 --- a/dev/test-cluster/config-ssh.sh +++ b/dev/test-cluster/config-ssh.sh @@ -2,6 +2,9 @@ mkdir ~/.ssh ssh-keygen -q -N "" -t rsa -f ~/.ssh/id_rsa +cat ~/.ssh/id_rsa cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys +cat ~/.ssh/authorized_keys echo " StrictHostKeyChecking no " | sudo tee -a /etc/ssh/ssh_config +cat /etc/ssh/ssh_config sudo service ssh restart From 2aed68fdcd7c0879c69e0e3044deb12cb126b6b4 Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Wed, 31 Mar 2021 21:29:14 +0800 Subject: [PATCH 16/62] fix config-ssh --- dev/test-cluster/config-ssh.sh | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/dev/test-cluster/config-ssh.sh b/dev/test-cluster/config-ssh.sh index 09debed06..2f5d098ed 100755 --- a/dev/test-cluster/config-ssh.sh +++ b/dev/test-cluster/config-ssh.sh @@ -1,10 +1,8 @@ #!/usr/bin/env bash -mkdir ~/.ssh ssh-keygen -q -N "" -t rsa -f ~/.ssh/id_rsa -cat ~/.ssh/id_rsa +chmod 600 ~/.ssh/id_rsa cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys -cat ~/.ssh/authorized_keys +chmod 600 ~/.ssh/authorized_keys echo " StrictHostKeyChecking no " | sudo tee -a /etc/ssh/ssh_config -cat /etc/ssh/ssh_config sudo service ssh restart From c8594fe43955ef0573a7d15ab7e076f2feae80e2 Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Wed, 31 Mar 2021 21:34:12 +0800 Subject: [PATCH 17/62] fix config-ssh --- .github/workflows/oap-mllib-ci.yml | 22 +++++++++++----------- dev/test-cluster/config-ssh.sh | 6 ++++++ dev/test-cluster/test-cluster.sh | 20 ++++++++++---------- 3 files changed, 27 insertions(+), 21 deletions(-) diff --git a/.github/workflows/oap-mllib-ci.yml b/.github/workflows/oap-mllib-ci.yml index 16264c9a4..8093ad35c 100644 --- a/.github/workflows/oap-mllib-ci.yml +++ b/.github/workflows/oap-mllib-ci.yml @@ -12,17 +12,17 @@ jobs: uses: actions/setup-java@v1 with: java-version: 1.8 - - name: Restore cached 
dependencies - uses: actions/cache@v2 - with: - path: | - ~/.m2/repository - ~/downloads - /opt/intel/inteloneapi - /opt/intel/oneapi - key: ${{ runner.os }}-${{ hashFiles('**/pom.xml', '{{github.workspace}}/dev/install-build-deps-ubuntu.sh') }} - restore-keys: | - ${{ runner.os }}- + # - name: Restore cached dependencies + # uses: actions/cache@v2 + # with: + # path: | + # ~/.m2/repository + # ~/downloads + # /opt/intel/inteloneapi + # /opt/intel/oneapi + # key: ${{ runner.os }}-${{ hashFiles('**/pom.xml', '{{github.workspace}}/dev/install-build-deps-ubuntu.sh') }} + # restore-keys: | + # ${{ runner.os }}- # - name: Set up dependencies # run: | # [ -d ~/downloads ] || mkdir ~/downloads diff --git a/dev/test-cluster/config-ssh.sh b/dev/test-cluster/config-ssh.sh index 2f5d098ed..f08d4a2ac 100755 --- a/dev/test-cluster/config-ssh.sh +++ b/dev/test-cluster/config-ssh.sh @@ -1,8 +1,14 @@ #!/usr/bin/env bash +set -x + ssh-keygen -q -N "" -t rsa -f ~/.ssh/id_rsa chmod 600 ~/.ssh/id_rsa cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys chmod 600 ~/.ssh/authorized_keys echo " StrictHostKeyChecking no " | sudo tee -a /etc/ssh/ssh_config + +ls -l ~/.ssh sudo service ssh restart + +ssh localhost \ No newline at end of file diff --git a/dev/test-cluster/test-cluster.sh b/dev/test-cluster/test-cluster.sh index 0320f38b2..69f7d65fb 100755 --- a/dev/test-cluster/test-cluster.sh +++ b/dev/test-cluster/test-cluster.sh @@ -5,17 +5,17 @@ set -x # Setup Python3 and Spark cluster cd $GITHUB_WORKSPACE/dev/test-cluster ./config-ssh.sh -./setup-cluster.sh -./setup-python3-env.sh +# ./setup-cluster.sh +# ./setup-python3-env.sh -# Build and run all examples -source $GITHUB_WORKSPACE/dev/test-cluster/env.sh +# # Build and run all examples +# source $GITHUB_WORKSPACE/dev/test-cluster/env.sh -cd $GITHUB_WORKSPACE/examples +# cd $GITHUB_WORKSPACE/examples -hadoop fs -copyFromLocal data -hadoop fs -ls data +# hadoop fs -copyFromLocal data +# hadoop fs -ls data -./build-all.sh -./run-all-scala.sh -./run-all-pyspark.sh +# ./build-all.sh +# ./run-all-scala.sh +# ./run-all-pyspark.sh From 7b6aafcc8b087eea1a55b1f2f11cd48436f0bd95 Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Wed, 31 Mar 2021 21:44:13 +0800 Subject: [PATCH 18/62] fix config-ssh --- dev/test-cluster/config-ssh.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dev/test-cluster/config-ssh.sh b/dev/test-cluster/config-ssh.sh index f08d4a2ac..49b14f79b 100755 --- a/dev/test-cluster/config-ssh.sh +++ b/dev/test-cluster/config-ssh.sh @@ -7,8 +7,9 @@ chmod 600 ~/.ssh/id_rsa cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys chmod 600 ~/.ssh/authorized_keys echo " StrictHostKeyChecking no " | sudo tee -a /etc/ssh/ssh_config +echo " PasswordAuthentication no " | sudo tee -a /etc/ssh/ssh_config ls -l ~/.ssh sudo service ssh restart -ssh localhost \ No newline at end of file +ssh -vvv localhost \ No newline at end of file From e7d65e6eb85d1dcfaecceef601ea2d86b60e8a1c Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Wed, 31 Mar 2021 22:10:19 +0800 Subject: [PATCH 19/62] fix config-ssh --- dev/test-cluster/config-ssh.sh | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/dev/test-cluster/config-ssh.sh b/dev/test-cluster/config-ssh.sh index 49b14f79b..5ec2ae471 100755 --- a/dev/test-cluster/config-ssh.sh +++ b/dev/test-cluster/config-ssh.sh @@ -2,14 +2,20 @@ set -x +mkdir ~/.ssh +chmod 0700 ~/.ssh + ssh-keygen -q -N "" -t rsa -f ~/.ssh/id_rsa chmod 600 ~/.ssh/id_rsa + cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys chmod 
600 ~/.ssh/authorized_keys + echo " StrictHostKeyChecking no " | sudo tee -a /etc/ssh/ssh_config -echo " PasswordAuthentication no " | sudo tee -a /etc/ssh/ssh_config +ls -ld ~/.ssh ls -l ~/.ssh -sudo service ssh restart + +sudo systemctl restart sshd ssh -vvv localhost \ No newline at end of file From 1042a150a50325f4d4f1c0279cb27b3adf155484 Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Wed, 31 Mar 2021 22:19:33 +0800 Subject: [PATCH 20/62] fix config-ssh --- dev/test-cluster/config-ssh.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/dev/test-cluster/config-ssh.sh b/dev/test-cluster/config-ssh.sh index 5ec2ae471..e5e7e875c 100755 --- a/dev/test-cluster/config-ssh.sh +++ b/dev/test-cluster/config-ssh.sh @@ -3,7 +3,7 @@ set -x mkdir ~/.ssh -chmod 0700 ~/.ssh +chmod 700 ~/.ssh ssh-keygen -q -N "" -t rsa -f ~/.ssh/id_rsa chmod 600 ~/.ssh/id_rsa @@ -18,4 +18,5 @@ ls -l ~/.ssh sudo systemctl restart sshd -ssh -vvv localhost \ No newline at end of file +# ssh -vvv localhost +ssh -vvv `hostname` \ No newline at end of file From 356e16f347d6c32dca2de9c027b60d364231de83 Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Wed, 31 Mar 2021 22:30:29 +0800 Subject: [PATCH 21/62] fix config-ssh --- .github/workflows/oap-mllib-ci.yml | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/.github/workflows/oap-mllib-ci.yml b/.github/workflows/oap-mllib-ci.yml index 8093ad35c..2647e36f4 100644 --- a/.github/workflows/oap-mllib-ci.yml +++ b/.github/workflows/oap-mllib-ci.yml @@ -12,6 +12,8 @@ jobs: uses: actions/setup-java@v1 with: java-version: 1.8 + - name: Setup tmate session + uses: mxschmitt/action-tmate@v3 # - name: Restore cached dependencies # uses: actions/cache@v2 # with: @@ -31,13 +33,13 @@ jobs: # [ -d spark-3.0.0-bin-hadoop2.7 ] || cd ~ && tar -zxf downloads/spark-3.0.0-bin-hadoop2.7.tgz # export SPARK_HOME=~/spark-3.0.0-bin-hadoop2.7 # ${{github.workspace}}/dev/install-build-deps-ubuntu.sh - - name: Build and Test - run: | - # cd ${{github.workspace}}/mllib-dal - # export ONEAPI_ROOT=/opt/intel/oneapi - # source /opt/intel/oneapi/dal/latest/env/vars.sh - # source /opt/intel/oneapi/tbb/latest/env/vars.sh - # source /tmp/oneCCL/build/_install/env/setvars.sh - # # temp disable and will enable for new release of oneCCL - # #./build.sh - ${{github.workspace}}/dev/test-cluster/test-cluster.sh + # - name: Build and Test + # run: | + # # cd ${{github.workspace}}/mllib-dal + # # export ONEAPI_ROOT=/opt/intel/oneapi + # # source /opt/intel/oneapi/dal/latest/env/vars.sh + # # source /opt/intel/oneapi/tbb/latest/env/vars.sh + # # source /tmp/oneCCL/build/_install/env/setvars.sh + # # # temp disable and will enable for new release of oneCCL + # # #./build.sh + # ${{github.workspace}}/dev/test-cluster/test-cluster.sh From bd7eab25cc12aa0d06fb06bd1a254d8a2cfdfe56 Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Thu, 1 Apr 2021 10:10:08 +0800 Subject: [PATCH 22/62] set strict modes no --- .github/workflows/oap-mllib-ci.yml | 62 +++++++++++++++--------------- dev/test-cluster/config-ssh.sh | 15 +++----- 2 files changed, 35 insertions(+), 42 deletions(-) diff --git a/.github/workflows/oap-mllib-ci.yml b/.github/workflows/oap-mllib-ci.yml index 2647e36f4..d50589f14 100644 --- a/.github/workflows/oap-mllib-ci.yml +++ b/.github/workflows/oap-mllib-ci.yml @@ -11,35 +11,33 @@ jobs: - name: Set up JDK 1.8 uses: actions/setup-java@v1 with: - java-version: 1.8 - - name: Setup tmate session - uses: mxschmitt/action-tmate@v3 - # - name: Restore 
cached dependencies - # uses: actions/cache@v2 - # with: - # path: | - # ~/.m2/repository - # ~/downloads - # /opt/intel/inteloneapi - # /opt/intel/oneapi - # key: ${{ runner.os }}-${{ hashFiles('**/pom.xml', '{{github.workspace}}/dev/install-build-deps-ubuntu.sh') }} - # restore-keys: | - # ${{ runner.os }}- - # - name: Set up dependencies - # run: | - # [ -d ~/downloads ] || mkdir ~/downloads - # cd ~/downloads - # [ -f spark-3.0.0-bin-hadoop2.7.tgz ] || wget http://archive.apache.org/dist/spark/spark-3.0.0/spark-3.0.0-bin-hadoop2.7.tgz - # [ -d spark-3.0.0-bin-hadoop2.7 ] || cd ~ && tar -zxf downloads/spark-3.0.0-bin-hadoop2.7.tgz - # export SPARK_HOME=~/spark-3.0.0-bin-hadoop2.7 - # ${{github.workspace}}/dev/install-build-deps-ubuntu.sh - # - name: Build and Test - # run: | - # # cd ${{github.workspace}}/mllib-dal - # # export ONEAPI_ROOT=/opt/intel/oneapi - # # source /opt/intel/oneapi/dal/latest/env/vars.sh - # # source /opt/intel/oneapi/tbb/latest/env/vars.sh - # # source /tmp/oneCCL/build/_install/env/setvars.sh - # # # temp disable and will enable for new release of oneCCL - # # #./build.sh - # ${{github.workspace}}/dev/test-cluster/test-cluster.sh + java-version: 1.8 + - name: Restore cached dependencies + uses: actions/cache@v2 + with: + path: | + ~/.m2/repository + ~/downloads + /opt/intel/inteloneapi + /opt/intel/oneapi + key: ${{ runner.os }}-${{ hashFiles('**/pom.xml', '{{github.workspace}}/dev/install-build-deps-ubuntu.sh') }} + restore-keys: | + ${{ runner.os }}- + - name: Set up dependencies + run: | + [ -d ~/downloads ] || mkdir ~/downloads + cd ~/downloads + [ -f spark-3.0.0-bin-hadoop2.7.tgz ] || wget http://archive.apache.org/dist/spark/spark-3.0.0/spark-3.0.0-bin-hadoop2.7.tgz + [ -d spark-3.0.0-bin-hadoop2.7 ] || cd ~ && tar -zxf downloads/spark-3.0.0-bin-hadoop2.7.tgz + export SPARK_HOME=~/spark-3.0.0-bin-hadoop2.7 + ${{github.workspace}}/dev/install-build-deps-ubuntu.sh + - name: Build and Test + run: | + # cd ${{github.workspace}}/mllib-dal + # export ONEAPI_ROOT=/opt/intel/oneapi + # source /opt/intel/oneapi/dal/latest/env/vars.sh + # source /opt/intel/oneapi/tbb/latest/env/vars.sh + # source /tmp/oneCCL/build/_install/env/setvars.sh + # # temp disable and will enable for new release of oneCCL + # #./build.sh + ${{github.workspace}}/dev/test-cluster/test-cluster.sh diff --git a/dev/test-cluster/config-ssh.sh b/dev/test-cluster/config-ssh.sh index e5e7e875c..6e8e283e6 100755 --- a/dev/test-cluster/config-ssh.sh +++ b/dev/test-cluster/config-ssh.sh @@ -1,22 +1,17 @@ #!/usr/bin/env bash -set -x - -mkdir ~/.ssh -chmod 700 ~/.ssh - ssh-keygen -q -N "" -t rsa -f ~/.ssh/id_rsa -chmod 600 ~/.ssh/id_rsa - cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys chmod 600 ~/.ssh/authorized_keys +# Disable strict host key checking echo " StrictHostKeyChecking no " | sudo tee -a /etc/ssh/ssh_config +# Disable strict modes for less strict permission checking +echo "StrictModes no" | sudo tee -a /etc/ssh/sshd_config ls -ld ~/.ssh ls -l ~/.ssh -sudo systemctl restart sshd +sudo systemctl restart ssh -# ssh -vvv localhost -ssh -vvv `hostname` \ No newline at end of file +ssh localhost \ No newline at end of file From 6174a26d40415307e5bd3a6afb8a13f3598dedd7 Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Thu, 1 Apr 2021 10:21:05 +0800 Subject: [PATCH 23/62] clear out comments --- .github/workflows/oap-mllib-ci.yml | 10 +++++----- dev/test-cluster/config-ssh.sh | 2 -- dev/test-cluster/test-cluster.sh | 20 ++++++++++---------- 3 files changed, 15 insertions(+), 17 deletions(-) diff 
--git a/.github/workflows/oap-mllib-ci.yml b/.github/workflows/oap-mllib-ci.yml index d50589f14..81f4743f6 100644 --- a/.github/workflows/oap-mllib-ci.yml +++ b/.github/workflows/oap-mllib-ci.yml @@ -33,11 +33,11 @@ jobs: ${{github.workspace}}/dev/install-build-deps-ubuntu.sh - name: Build and Test run: | - # cd ${{github.workspace}}/mllib-dal - # export ONEAPI_ROOT=/opt/intel/oneapi - # source /opt/intel/oneapi/dal/latest/env/vars.sh - # source /opt/intel/oneapi/tbb/latest/env/vars.sh - # source /tmp/oneCCL/build/_install/env/setvars.sh + cd ${{github.workspace}}/mllib-dal + export ONEAPI_ROOT=/opt/intel/oneapi + source /opt/intel/oneapi/dal/latest/env/vars.sh + source /opt/intel/oneapi/tbb/latest/env/vars.sh + source /tmp/oneCCL/build/_install/env/setvars.sh # # temp disable and will enable for new release of oneCCL # #./build.sh ${{github.workspace}}/dev/test-cluster/test-cluster.sh diff --git a/dev/test-cluster/config-ssh.sh b/dev/test-cluster/config-ssh.sh index 6e8e283e6..a6fc2699e 100755 --- a/dev/test-cluster/config-ssh.sh +++ b/dev/test-cluster/config-ssh.sh @@ -13,5 +13,3 @@ ls -ld ~/.ssh ls -l ~/.ssh sudo systemctl restart ssh - -ssh localhost \ No newline at end of file diff --git a/dev/test-cluster/test-cluster.sh b/dev/test-cluster/test-cluster.sh index 69f7d65fb..0320f38b2 100755 --- a/dev/test-cluster/test-cluster.sh +++ b/dev/test-cluster/test-cluster.sh @@ -5,17 +5,17 @@ set -x # Setup Python3 and Spark cluster cd $GITHUB_WORKSPACE/dev/test-cluster ./config-ssh.sh -# ./setup-cluster.sh -# ./setup-python3-env.sh +./setup-cluster.sh +./setup-python3-env.sh -# # Build and run all examples -# source $GITHUB_WORKSPACE/dev/test-cluster/env.sh +# Build and run all examples +source $GITHUB_WORKSPACE/dev/test-cluster/env.sh -# cd $GITHUB_WORKSPACE/examples +cd $GITHUB_WORKSPACE/examples -# hadoop fs -copyFromLocal data -# hadoop fs -ls data +hadoop fs -copyFromLocal data +hadoop fs -ls data -# ./build-all.sh -# ./run-all-scala.sh -# ./run-all-pyspark.sh +./build-all.sh +./run-all-scala.sh +./run-all-pyspark.sh From b4cd1696a2e66e609481d32d7f983a4da4f76fe1 Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Mon, 12 Apr 2021 15:18:07 +0800 Subject: [PATCH 24/62] Update oneCCL and oneDAL to oneAPI 2021.2.0, don't build oneCCL from source --- .github/workflows/oap-mllib-ci.yml | 8 ++--- dev/install-build-deps-centos.sh | 14 ++------ dev/install-build-deps-ubuntu.sh | 13 ++----- mllib-dal/pom.xml | 56 ++++++++++++++++++----------- mllib-dal/src/assembly/assembly.xml | 16 ++++----- mllib-dal/src/main/native/Makefile | 6 ++-- 6 files changed, 54 insertions(+), 59 deletions(-) diff --git a/.github/workflows/oap-mllib-ci.yml b/.github/workflows/oap-mllib-ci.yml index 81f4743f6..04eb7e074 100644 --- a/.github/workflows/oap-mllib-ci.yml +++ b/.github/workflows/oap-mllib-ci.yml @@ -37,7 +37,7 @@ jobs: export ONEAPI_ROOT=/opt/intel/oneapi source /opt/intel/oneapi/dal/latest/env/vars.sh source /opt/intel/oneapi/tbb/latest/env/vars.sh - source /tmp/oneCCL/build/_install/env/setvars.sh - # # temp disable and will enable for new release of oneCCL - # #./build.sh - ${{github.workspace}}/dev/test-cluster/test-cluster.sh + source /opt/intel/oneapi/ccl/latest/env/vars.sh + ./build.sh + ./test.sh + ./test-cluster.sh diff --git a/dev/install-build-deps-centos.sh b/dev/install-build-deps-centos.sh index 8a347fdef..e7e20b4c6 100755 --- a/dev/install-build-deps-centos.sh +++ b/dev/install-build-deps-centos.sh @@ -13,26 +13,16 @@ repo_gpgcheck=1 
gpgkey=https://yum.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS-2023.PUB EOF sudo mv /tmp/oneAPI.repo /etc/yum.repos.d - sudo yum install -y intel-oneapi-dal-devel-2021.1.1 intel-oneapi-tbb-devel-2021.1.1 + sudo yum install -y intel-oneapi-dal-devel-2021.2.0 intel-oneapi-tbb-devel-2021.2.0 intel-oneapi-ccl-devel-2021.2.0 else echo "oneAPI components already installed!" fi -echo "Building oneCCL ..." -cd /tmp -rm -rf oneCCL -git clone https://github.com/oneapi-src/oneCCL -cd oneCCL -git checkout 2021.1 -mkdir -p build && cd build -cmake .. -make -j 2 install - # # Setup building environments manually: # # export ONEAPI_ROOT=/opt/intel/oneapi # source /opt/intel/oneapi/dal/latest/env/vars.sh # source /opt/intel/oneapi/tbb/latest/env/vars.sh -# source /tmp/oneCCL/build/_install/env/setvars.sh +# source /opt/intel/oneapi/ccl/latest/env/vars.sh # diff --git a/dev/install-build-deps-ubuntu.sh b/dev/install-build-deps-ubuntu.sh index d43e35b89..284a8139f 100755 --- a/dev/install-build-deps-ubuntu.sh +++ b/dev/install-build-deps-ubuntu.sh @@ -8,25 +8,16 @@ if [ ! -f /opt/intel/oneapi ]; then rm GPG-PUB-KEY-INTEL-SW-PRODUCTS-2023.PUB echo "deb https://apt.repos.intel.com/oneapi all main" | sudo tee /etc/apt/sources.list.d/oneAPI.list sudo apt-get update - sudo apt-get install intel-oneapi-dal-devel-2021.1.1 intel-oneapi-tbb-devel-2021.1.1 + sudo yum install -y intel-oneapi-dal-devel-2021.2.0 intel-oneapi-tbb-devel-2021.2.0 intel-oneapi-ccl-devel-2021.2.0 else echo "oneAPI components already installed!" fi -echo "Building oneCCL ..." -cd /tmp -git clone https://github.com/oneapi-src/oneCCL -cd oneCCL -git checkout 2021.1 -mkdir build && cd build -cmake .. -make -j 2 install - # # Setup building environments manually: # # export ONEAPI_ROOT=/opt/intel/oneapi # source /opt/intel/oneapi/dal/latest/env/vars.sh # source /opt/intel/oneapi/tbb/latest/env/vars.sh -# source /tmp/oneCCL/build/_install/env/setvars.sh +# source /opt/intel/oneapi/ccl/latest/env/vars.sh # diff --git a/mllib-dal/pom.xml b/mllib-dal/pom.xml index 4e51f9157..83b71d9d6 100644 --- a/mllib-dal/pom.xml +++ b/mllib-dal/pom.xml @@ -17,6 +17,13 @@ 2.12.10 2.12 3.0.0 + 2021.2.0 + libtbb.so.12.2 + libtbbmalloc.so.2.2 + libJavaAPI.so.1.1 + libccl.so.1.0 + libfabric.so.1 + libmpi.so.12.0.0 @@ -55,11 +62,11 @@ - com.intel.daal - daal - 2021.1 + com.intel.onedal + onedal + ${oneapi.version} system - ${env.ONEAPI_ROOT}/dal/latest/lib/onedal.jar + ${env.DAALROOT}/lib/onedal.jar @@ -216,18 +223,25 @@ ${project.build.testOutputDirectory}/lib - ${env.CCL_ROOT}/lib + ${env.CCL_ROOT}/lib/cpu_icc - - - libmpi.so.12.0.0 - libfabric.so.1 - libccl.so - + ${ccl.lib} - ${env.CCL_ROOT}/lib/prov + ${env.I_MPI_ROOT}/lib/release_mt + + ${ccl.mpi.lib} + + + + ${env.I_MPI_ROOT}/libfabric/lib + + ${ccl.fabric.lib} + + + + ${env.I_MPI_ROOT}/libfabric/lib/prov libsockets-fi.so @@ -236,8 +250,8 @@ ${env.TBBROOT}/lib/intel64/gcc4.8 - libtbb.so.12.1 - libtbbmalloc.so.2.1 + ${tbb.lib} + ${tbb.malloc.lib} @@ -265,21 +279,21 @@ - ${project.build.testOutputDirectory}/lib/libtbb.so.12.1 + ${project.build.testOutputDirectory}/lib/${ccl.lib} + ${project.build.testOutputDirectory}/lib/libccl.so + + + ${project.build.testOutputDirectory}/lib/${tbb.lib} ${project.build.testOutputDirectory}/lib/libtbb.so.2 - ${project.build.testOutputDirectory}/lib/libtbbmalloc.so.2.1 + ${project.build.testOutputDirectory}/lib/${tbb.malloc.lib} ${project.build.testOutputDirectory}/lib/libtbbmalloc.so.2 - ${project.build.testOutputDirectory}/lib/libmpi.so.12.0.0 + 
${project.build.testOutputDirectory}/lib/${ccl.mpi.lib} ${project.build.testOutputDirectory}/lib/libmpi.so.12 - - - - diff --git a/mllib-dal/src/assembly/assembly.xml b/mllib-dal/src/assembly/assembly.xml index 498b90e02..50749db06 100644 --- a/mllib-dal/src/assembly/assembly.xml +++ b/mllib-dal/src/assembly/assembly.xml @@ -41,38 +41,38 @@ - ${env.TBBROOT}/lib/intel64/gcc4.8/libtbb.so.12.1 + ${env.TBBROOT}/lib/intel64/gcc4.8/${tbb.lib} lib libtbb.so.2 - ${env.TBBROOT}/lib/intel64/gcc4.8/libtbbmalloc.so.2.1 + ${env.TBBROOT}/lib/intel64/gcc4.8/${tbb.malloc.lib} lib libtbbmalloc.so.2 - ${env.DAALROOT}/lib/intel64/libJavaAPI.so.1.0 + ${env.DAALROOT}/lib/intel64/${dal.java.lib} lib libJavaAPI.so - ${env.CCL_ROOT}/lib/libfabric.so.1 + ${env.I_MPI_ROOT}/libfabric/lib/${ccl.fabric.lib} lib - ${env.CCL_ROOT}/lib/libmpi.so.12.0.0 + ${env.I_MPI_ROOT}/lib/release_mt/${ccl.mpi.lib} lib libmpi.so.12 - ${env.CCL_ROOT}/lib/libccl.so + ${env.CCL_ROOT}/lib/cpu_icc/libccl.so lib - ${env.CCL_ROOT}/lib/prov/libsockets-fi.so + ${env.I_MPI_ROOT}/libfabric/lib/prov/libsockets-fi.so lib - \ No newline at end of file + diff --git a/mllib-dal/src/main/native/Makefile b/mllib-dal/src/main/native/Makefile index 0a81dd05a..dccaf71ef 100644 --- a/mllib-dal/src/main/native/Makefile +++ b/mllib-dal/src/main/native/Makefile @@ -12,11 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -CC := gcc -CXX := g++ +CC := clang +CXX := clang++ RM := rm -rf -CFLAGS := -g -Wall -fPIC -std=c++11 +CFLAGS := -g -Wall -Wno-deprecated-declarations -fsycl -fPIC -std=c++11 # The following paths setting works for self-built libs from source code # https://github.com/oneapi-src/oneCCL. If oneCCL package in oneAPI Toolkit is used, From 5d4197e280b0bf663d9c1631c91e1b631fcdbfd6 Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Mon, 12 Apr 2021 15:23:04 +0800 Subject: [PATCH 25/62] nit --- .github/workflows/oap-mllib-ci.yml | 2 +- dev/install-build-deps-ubuntu.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/oap-mllib-ci.yml b/.github/workflows/oap-mllib-ci.yml index 04eb7e074..259ed6da1 100644 --- a/.github/workflows/oap-mllib-ci.yml +++ b/.github/workflows/oap-mllib-ci.yml @@ -27,7 +27,7 @@ jobs: run: | [ -d ~/downloads ] || mkdir ~/downloads cd ~/downloads - [ -f spark-3.0.0-bin-hadoop2.7.tgz ] || wget http://archive.apache.org/dist/spark/spark-3.0.0/spark-3.0.0-bin-hadoop2.7.tgz + [ -f spark-3.0.0-bin-hadoop2.7.tgz ] || wget --no-verbose http://archive.apache.org/dist/spark/spark-3.0.0/spark-3.0.0-bin-hadoop2.7.tgz [ -d spark-3.0.0-bin-hadoop2.7 ] || cd ~ && tar -zxf downloads/spark-3.0.0-bin-hadoop2.7.tgz export SPARK_HOME=~/spark-3.0.0-bin-hadoop2.7 ${{github.workspace}}/dev/install-build-deps-ubuntu.sh diff --git a/dev/install-build-deps-ubuntu.sh b/dev/install-build-deps-ubuntu.sh index 284a8139f..d5a337c00 100755 --- a/dev/install-build-deps-ubuntu.sh +++ b/dev/install-build-deps-ubuntu.sh @@ -8,7 +8,7 @@ if [ ! -f /opt/intel/oneapi ]; then rm GPG-PUB-KEY-INTEL-SW-PRODUCTS-2023.PUB echo "deb https://apt.repos.intel.com/oneapi all main" | sudo tee /etc/apt/sources.list.d/oneAPI.list sudo apt-get update - sudo yum install -y intel-oneapi-dal-devel-2021.2.0 intel-oneapi-tbb-devel-2021.2.0 intel-oneapi-ccl-devel-2021.2.0 + sudo apt-get install -y intel-oneapi-dal-devel-2021.2.0 intel-oneapi-tbb-devel-2021.2.0 intel-oneapi-ccl-devel-2021.2.0 else echo "oneAPI components already installed!" 
fi From 04aa72ccadc72f3b5d0e54e6ee0298c151d43cb1 Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Mon, 12 Apr 2021 15:35:44 +0800 Subject: [PATCH 26/62] Fix install oneapi and source setvars --- .github/workflows/oap-mllib-ci.yml | 9 +++------ dev/install-build-deps-centos.sh | 11 +---------- dev/install-build-deps-ubuntu.sh | 11 +---------- 3 files changed, 5 insertions(+), 26 deletions(-) diff --git a/.github/workflows/oap-mllib-ci.yml b/.github/workflows/oap-mllib-ci.yml index 259ed6da1..73974d48a 100644 --- a/.github/workflows/oap-mllib-ci.yml +++ b/.github/workflows/oap-mllib-ci.yml @@ -32,12 +32,9 @@ jobs: export SPARK_HOME=~/spark-3.0.0-bin-hadoop2.7 ${{github.workspace}}/dev/install-build-deps-ubuntu.sh - name: Build and Test - run: | - cd ${{github.workspace}}/mllib-dal - export ONEAPI_ROOT=/opt/intel/oneapi - source /opt/intel/oneapi/dal/latest/env/vars.sh - source /opt/intel/oneapi/tbb/latest/env/vars.sh - source /opt/intel/oneapi/ccl/latest/env/vars.sh + run: | + source /opt/intel/oneapi/setvars.sh + cd ${{github.workspace}}/mllib-dal ./build.sh ./test.sh ./test-cluster.sh diff --git a/dev/install-build-deps-centos.sh b/dev/install-build-deps-centos.sh index e7e20b4c6..c5447f811 100755 --- a/dev/install-build-deps-centos.sh +++ b/dev/install-build-deps-centos.sh @@ -13,16 +13,7 @@ repo_gpgcheck=1 gpgkey=https://yum.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS-2023.PUB EOF sudo mv /tmp/oneAPI.repo /etc/yum.repos.d - sudo yum install -y intel-oneapi-dal-devel-2021.2.0 intel-oneapi-tbb-devel-2021.2.0 intel-oneapi-ccl-devel-2021.2.0 + sudo yum install -y intel-basekit-2021.2.0 else echo "oneAPI components already installed!" fi - -# -# Setup building environments manually: -# -# export ONEAPI_ROOT=/opt/intel/oneapi -# source /opt/intel/oneapi/dal/latest/env/vars.sh -# source /opt/intel/oneapi/tbb/latest/env/vars.sh -# source /opt/intel/oneapi/ccl/latest/env/vars.sh -# diff --git a/dev/install-build-deps-ubuntu.sh b/dev/install-build-deps-ubuntu.sh index d5a337c00..7fa05a42d 100755 --- a/dev/install-build-deps-ubuntu.sh +++ b/dev/install-build-deps-ubuntu.sh @@ -8,16 +8,7 @@ if [ ! -f /opt/intel/oneapi ]; then rm GPG-PUB-KEY-INTEL-SW-PRODUCTS-2023.PUB echo "deb https://apt.repos.intel.com/oneapi all main" | sudo tee /etc/apt/sources.list.d/oneAPI.list sudo apt-get update - sudo apt-get install -y intel-oneapi-dal-devel-2021.2.0 intel-oneapi-tbb-devel-2021.2.0 intel-oneapi-ccl-devel-2021.2.0 + sudo apt-get install -y intel-basekit-2021.2.0 else echo "oneAPI components already installed!" 
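
One detail worth flagging in the install scripts above: the guard `if [ ! -f /opt/intel/oneapi ]` tests for a regular file, but that path is a directory, so the check never succeeds and the packages are reinstalled on every run. A later patch in this series (PATCH 31) switches it to `-d`; the intended idempotent shape, with the package names used above, is:

    # -d tests for a directory; -f (regular file) never matches /opt/intel/oneapi.
    if [ ! -d /opt/intel/oneapi ]; then
        echo "Installing oneAPI components ..."
        sudo apt-get update
        sudo apt-get install -y intel-oneapi-dal-devel-2021.2.0 intel-oneapi-tbb-devel-2021.2.0
    else
        echo "oneAPI components already installed!"
    fi
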
fi - -# -# Setup building environments manually: -# -# export ONEAPI_ROOT=/opt/intel/oneapi -# source /opt/intel/oneapi/dal/latest/env/vars.sh -# source /opt/intel/oneapi/tbb/latest/env/vars.sh -# source /opt/intel/oneapi/ccl/latest/env/vars.sh -# From 59cddfc7d3ca6a1b05f420983a04aea583be4e2e Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Mon, 12 Apr 2021 16:04:30 +0800 Subject: [PATCH 27/62] nit --- .github/workflows/oap-mllib-ci.yml | 1 + dev/install-build-deps-centos.sh | 2 +- dev/install-build-deps-ubuntu.sh | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/oap-mllib-ci.yml b/.github/workflows/oap-mllib-ci.yml index 73974d48a..fbb237d01 100644 --- a/.github/workflows/oap-mllib-ci.yml +++ b/.github/workflows/oap-mllib-ci.yml @@ -34,6 +34,7 @@ jobs: - name: Build and Test run: | source /opt/intel/oneapi/setvars.sh + export SPARK_LOCAL=localhost cd ${{github.workspace}}/mllib-dal ./build.sh ./test.sh diff --git a/dev/install-build-deps-centos.sh b/dev/install-build-deps-centos.sh index c5447f811..b7b57683e 100755 --- a/dev/install-build-deps-centos.sh +++ b/dev/install-build-deps-centos.sh @@ -13,7 +13,7 @@ repo_gpgcheck=1 gpgkey=https://yum.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS-2023.PUB EOF sudo mv /tmp/oneAPI.repo /etc/yum.repos.d - sudo yum install -y intel-basekit-2021.2.0 + sudo yum install -y intel-oneapi-dpcpp-cpp-2021.2.0 intel-oneapi-dal-devel-2021.2.0 intel-oneapi-tbb-devel-2021.2.0 intel-oneapi-ccl-devel-2021.2.0 else echo "oneAPI components already installed!" fi diff --git a/dev/install-build-deps-ubuntu.sh b/dev/install-build-deps-ubuntu.sh index 7fa05a42d..eacab40cd 100755 --- a/dev/install-build-deps-ubuntu.sh +++ b/dev/install-build-deps-ubuntu.sh @@ -8,7 +8,7 @@ if [ ! -f /opt/intel/oneapi ]; then rm GPG-PUB-KEY-INTEL-SW-PRODUCTS-2023.PUB echo "deb https://apt.repos.intel.com/oneapi all main" | sudo tee /etc/apt/sources.list.d/oneAPI.list sudo apt-get update - sudo apt-get install -y intel-basekit-2021.2.0 + sudo apt-get install -y intel-oneapi-dpcpp-cpp-2021.2.0 intel-oneapi-dal-devel-2021.2.0 intel-oneapi-tbb-devel-2021.2.0 intel-oneapi-ccl-devel-2021.2.0 else echo "oneAPI components already installed!" 
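
Both environment-setup styles appear in this series: the single bundled /opt/intel/oneapi/setvars.sh (which the CI sources from this patch onward) and the per-component vars.sh files used earlier. A hedged sketch that prefers the bundle and falls back to the components these scripts name:

    # Prefer the all-in-one environment script; fall back to per-component ones.
    if [ -r /opt/intel/oneapi/setvars.sh ]; then
        source /opt/intel/oneapi/setvars.sh
    else
        source /opt/intel/oneapi/dal/latest/env/vars.sh
        source /opt/intel/oneapi/tbb/latest/env/vars.sh
        source /opt/intel/oneapi/ccl/latest/env/vars.sh
    fi
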
fi From c34e78e2d82a5586a56cf5d3dd09596139bc23b1 Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Mon, 12 Apr 2021 16:56:57 +0800 Subject: [PATCH 28/62] Add spark.driver.host --- dev/test-cluster/spark-defaults.conf | 1 + 1 file changed, 1 insertion(+) diff --git a/dev/test-cluster/spark-defaults.conf b/dev/test-cluster/spark-defaults.conf index 1c25bb2ec..850f52049 100644 --- a/dev/test-cluster/spark-defaults.conf +++ b/dev/test-cluster/spark-defaults.conf @@ -28,6 +28,7 @@ spark.master yarn spark.serializer org.apache.spark.serializer.KryoSerializer +spark.driver.host 127.0.0.1 spark.driver.memory 3g spark.executor.num 2 spark.executor.cores 1 From f541ce84394380bae5c0c7dacf960920bfa78f3a Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Mon, 12 Apr 2021 17:05:45 +0800 Subject: [PATCH 29/62] Add ci-build --- .github/workflows/oap-mllib-ci.yml | 2 +- mllib-dal/build.sh | 8 ++++++- mllib-dal/ci-build.sh | 38 ++++++++++++++++++++++++++++++ 3 files changed, 46 insertions(+), 2 deletions(-) create mode 100755 mllib-dal/ci-build.sh diff --git a/.github/workflows/oap-mllib-ci.yml b/.github/workflows/oap-mllib-ci.yml index fbb237d01..0738cb05b 100644 --- a/.github/workflows/oap-mllib-ci.yml +++ b/.github/workflows/oap-mllib-ci.yml @@ -36,6 +36,6 @@ jobs: source /opt/intel/oneapi/setvars.sh export SPARK_LOCAL=localhost cd ${{github.workspace}}/mllib-dal - ./build.sh + ./ci-build.sh ./test.sh ./test-cluster.sh diff --git a/mllib-dal/build.sh b/mllib-dal/build.sh index da1d8df75..2a42e3aff 100755 --- a/mllib-dal/build.sh +++ b/mllib-dal/build.sh @@ -21,12 +21,18 @@ if [[ -z $CCL_ROOT ]]; then exit 1 fi +if [[ -z $I_MPI_ROOT ]]; then + echo I_MPI_ROOT not defined! + exit 1 +fi + echo === Building Environments === echo JAVA_HOME=$JAVA_HOME echo DAALROOT=$DAALROOT echo TBBROOT=$TBBROOT echo CCL_ROOT=$CCL_ROOT -echo GCC Version: $(gcc -dumpversion) +echo MPI_ROOT=$I_MPI_ROOT +echo Clang Version: $(clang -dumpversion) echo ============================= mvn -DskipTests clean package diff --git a/mllib-dal/ci-build.sh b/mllib-dal/ci-build.sh new file mode 100755 index 000000000..dd74fb3f6 --- /dev/null +++ b/mllib-dal/ci-build.sh @@ -0,0 +1,38 @@ +#!/usr/bin/env bash + +# Check envs for building +if [[ -z $JAVA_HOME ]]; then + echo $JAVA_HOME not defined! + exit 1 +fi + +if [[ -z $DAALROOT ]]; then + echo DAALROOT not defined! + exit 1 +fi + +if [[ -z $TBBROOT ]]; then + echo TBBROOT not defined! + exit 1 +fi + +if [[ -z $CCL_ROOT ]]; then + echo CCL_ROOT not defined! + exit 1 +fi + +if [[ -z $I_MPI_ROOT ]]; then + echo I_MPI_ROOT not defined! + exit 1 +fi + +echo === Building Environments === +echo JAVA_HOME=$JAVA_HOME +echo DAALROOT=$DAALROOT +echo TBBROOT=$TBBROOT +echo CCL_ROOT=$CCL_ROOT +echo MPI_ROOT=$I_MPI_ROOT +echo Clang Version: $(clang -dumpversion) +echo ============================= + +mvn --no-transfer-progress -DskipTests clean package From 8738cbfea54f1729cd229317b3ac06c28e4373f8 Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Mon, 12 Apr 2021 17:21:44 +0800 Subject: [PATCH 30/62] nit --- mllib-dal/src/main/native/Makefile | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mllib-dal/src/main/native/Makefile b/mllib-dal/src/main/native/Makefile index dccaf71ef..a25e97f83 100644 --- a/mllib-dal/src/main/native/Makefile +++ b/mllib-dal/src/main/native/Makefile @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
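
The pom.xml and assembly.xml changes earlier in the series copy versioned native libraries (libtbb.so.12.x, libccl.so, libmpi.so.12.x) into the packaged lib directory under new names. A quick post-build sanity check that the renamed copies still resolve their own dependencies can catch a bad rename before Spark ever loads the JNI library. This is a sketch; the directory below is the test output location the pom copies into, not a stable interface.

    # After 'mvn package', flag any bundled .so with unresolved dependencies.
    LIB_DIR=mllib-dal/target/test-classes/lib
    for so in "$LIB_DIR"/*.so*; do
        if ldd "$so" | grep -q "not found"; then
            echo "unresolved dependencies in $so" >&2
            ldd "$so" | grep "not found"
        fi
    done
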
-CC := clang +CC := clang++ CXX := clang++ RM := rm -rf @@ -31,7 +31,8 @@ INCS := -I $(JAVA_HOME)/include \ # Use static link if possible, TBB is only available as dynamic libs -LIBS := -L${CCL_ROOT}/lib -lccl \ +LIBS := -lstdc++ \ + -L${CCL_ROOT}/lib -lccl \ -L$(DAALROOT)/lib/intel64 -l:libdaal_core.a -l:libdaal_thread.a \ -L$(TBBROOT)/lib/intel64/gcc4.8 -ltbb -ltbbmalloc # TODO: Add signal chaining support, should fix linking, package so and loading From a1e786a156ab25cb3d2941fd602a39b4daa4b894 Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Tue, 13 Apr 2021 10:15:32 +0800 Subject: [PATCH 31/62] Update --- .github/workflows/oap-mllib-ci.yml | 8 +++++--- dev/install-build-deps-ubuntu.sh | 2 +- mllib-dal/src/main/native/Makefile | 2 +- mllib-dal/test-cluster.sh | 5 ----- 4 files changed, 7 insertions(+), 10 deletions(-) delete mode 100755 mllib-dal/test-cluster.sh diff --git a/.github/workflows/oap-mllib-ci.yml b/.github/workflows/oap-mllib-ci.yml index 0738cb05b..c4a90b702 100644 --- a/.github/workflows/oap-mllib-ci.yml +++ b/.github/workflows/oap-mllib-ci.yml @@ -16,10 +16,10 @@ jobs: uses: actions/cache@v2 with: path: | + /var/cache/apt ~/.m2/repository - ~/downloads - /opt/intel/inteloneapi /opt/intel/oneapi + ~/downloads key: ${{ runner.os }}-${{ hashFiles('**/pom.xml', '{{github.workspace}}/dev/install-build-deps-ubuntu.sh') }} restore-keys: | ${{ runner.os }}- @@ -34,8 +34,10 @@ jobs: - name: Build and Test run: | source /opt/intel/oneapi/setvars.sh + hostname + cat /etc/hosts + cat /etc/os-release export SPARK_LOCAL=localhost cd ${{github.workspace}}/mllib-dal ./ci-build.sh ./test.sh - ./test-cluster.sh diff --git a/dev/install-build-deps-ubuntu.sh b/dev/install-build-deps-ubuntu.sh index eacab40cd..055f696db 100755 --- a/dev/install-build-deps-ubuntu.sh +++ b/dev/install-build-deps-ubuntu.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -if [ ! -f /opt/intel/oneapi ]; then +if [ ! -d /opt/intel/oneapi ]; then echo "Installing oneAPI components ..." cd /tmp wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS-2023.PUB diff --git a/mllib-dal/src/main/native/Makefile b/mllib-dal/src/main/native/Makefile index a25e97f83..a4310cecb 100644 --- a/mllib-dal/src/main/native/Makefile +++ b/mllib-dal/src/main/native/Makefile @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-CC := clang++ +CC := clang CXX := clang++ RM := rm -rf diff --git a/mllib-dal/test-cluster.sh b/mllib-dal/test-cluster.sh deleted file mode 100755 index 4f5a6132a..000000000 --- a/mllib-dal/test-cluster.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/usr/bin/env bash - -cd ../dev/test-cluster/workloads - -./run-kmeans-pyspark.sh From 29de3dcf80ce03648f6e1ddb3b1eb4cc318af8fd Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Tue, 13 Apr 2021 10:38:02 +0800 Subject: [PATCH 32/62] Update --- .github/workflows/oap-mllib-ci.yml | 8 +++++--- mllib-dal/test.sh | 10 ++++++++-- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/.github/workflows/oap-mllib-ci.yml b/.github/workflows/oap-mllib-ci.yml index c4a90b702..cabcd867a 100644 --- a/.github/workflows/oap-mllib-ci.yml +++ b/.github/workflows/oap-mllib-ci.yml @@ -34,10 +34,12 @@ jobs: - name: Build and Test run: | source /opt/intel/oneapi/setvars.sh + echo "---" hostname + echo "---" cat /etc/hosts - cat /etc/os-release - export SPARK_LOCAL=localhost + echo "---" + echo 127.0.0.1 $(hostname) | sudo tee -a /etc/hosts cd ${{github.workspace}}/mllib-dal - ./ci-build.sh + # ./ci-build.sh ./test.sh diff --git a/mllib-dal/test.sh b/mllib-dal/test.sh index 0157c22a4..a285eb95d 100755 --- a/mllib-dal/test.sh +++ b/mllib-dal/test.sh @@ -21,12 +21,18 @@ if [[ -z $CCL_ROOT ]]; then exit 1 fi -echo === Building Environments === +if [[ -z $I_MPI_ROOT ]]; then + echo I_MPI_ROOT not defined! + exit 1 +fi + +echo === Testing Environments === echo JAVA_HOME=$JAVA_HOME echo DAALROOT=$DAALROOT echo TBBROOT=$TBBROOT echo CCL_ROOT=$CCL_ROOT -echo GCC Version: $(gcc -dumpversion) +echo MPI_ROOT=$I_MPI_ROOT +echo Clang Version: $(clang -dumpversion) echo ============================= # Enable signal chaining support for JNI From a12cffd4afadad5fab11b79e1cdf7e9e00f35ec1 Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Tue, 13 Apr 2021 10:54:16 +0800 Subject: [PATCH 33/62] Add --ccl-configuration=cpu_icc --- .github/workflows/oap-mllib-ci.yml | 2 ++ mllib-dal/build.sh | 1 + mllib-dal/src/main/native/Makefile | 2 +- mllib-dal/test.sh | 1 + 4 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/oap-mllib-ci.yml b/.github/workflows/oap-mllib-ci.yml index cabcd867a..c2c184e97 100644 --- a/.github/workflows/oap-mllib-ci.yml +++ b/.github/workflows/oap-mllib-ci.yml @@ -34,6 +34,8 @@ jobs: - name: Build and Test run: | source /opt/intel/oneapi/setvars.sh + # Configure oneCCL to use cpu_icc libs + source /opt/intel/oneapi/ccl/latest/env/vars.sh --ccl-configuration=cpu_icc echo "---" hostname echo "---" diff --git a/mllib-dal/build.sh b/mllib-dal/build.sh index 2a42e3aff..a66c26947 100755 --- a/mllib-dal/build.sh +++ b/mllib-dal/build.sh @@ -31,6 +31,7 @@ echo JAVA_HOME=$JAVA_HOME echo DAALROOT=$DAALROOT echo TBBROOT=$TBBROOT echo CCL_ROOT=$CCL_ROOT +echo CCL_CONFIGURATION=$CCL_CONFIGURATION echo MPI_ROOT=$I_MPI_ROOT echo Clang Version: $(clang -dumpversion) echo ============================= diff --git a/mllib-dal/src/main/native/Makefile b/mllib-dal/src/main/native/Makefile index a4310cecb..6fe438846 100644 --- a/mllib-dal/src/main/native/Makefile +++ b/mllib-dal/src/main/native/Makefile @@ -16,7 +16,7 @@ CC := clang CXX := clang++ RM := rm -rf -CFLAGS := -g -Wall -Wno-deprecated-declarations -fsycl -fPIC -std=c++11 +CFLAGS := -g -Wall -Wno-deprecated-declarations -fPIC -std=c++11 # The following paths setting works for self-built libs from source code # https://github.com/oneapi-src/oneCCL. 
If oneCCL package in oneAPI Toolkit is used, diff --git a/mllib-dal/test.sh b/mllib-dal/test.sh index a285eb95d..4a37529a1 100755 --- a/mllib-dal/test.sh +++ b/mllib-dal/test.sh @@ -31,6 +31,7 @@ echo JAVA_HOME=$JAVA_HOME echo DAALROOT=$DAALROOT echo TBBROOT=$TBBROOT echo CCL_ROOT=$CCL_ROOT +echo CCL_CONFIGURATION=$CCL_CONFIGURATION echo MPI_ROOT=$I_MPI_ROOT echo Clang Version: $(clang -dumpversion) echo ============================= From 572e308630948947cda392007c76114bc7955f2e Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Tue, 13 Apr 2021 11:20:21 +0800 Subject: [PATCH 34/62] Update --- .github/workflows/oap-mllib-ci.yml | 12 +++---- dev/test-cluster/spark-defaults.conf | 1 - mllib-dal/ci-test.sh | 50 ++++++++++++++++++++++++++++ 3 files changed, 55 insertions(+), 8 deletions(-) create mode 100755 mllib-dal/ci-test.sh diff --git a/.github/workflows/oap-mllib-ci.yml b/.github/workflows/oap-mllib-ci.yml index c2c184e97..cdccb59d3 100644 --- a/.github/workflows/oap-mllib-ci.yml +++ b/.github/workflows/oap-mllib-ci.yml @@ -35,13 +35,11 @@ jobs: run: | source /opt/intel/oneapi/setvars.sh # Configure oneCCL to use cpu_icc libs - source /opt/intel/oneapi/ccl/latest/env/vars.sh --ccl-configuration=cpu_icc - echo "---" - hostname - echo "---" + source /opt/intel/oneapi/ccl/latest/env/vars.sh --ccl-configuration=cpu_icc + echo "---" + echo 127.0.0.1 $(hostname) | sudo tee -a /etc/hosts cat /etc/hosts echo "---" - echo 127.0.0.1 $(hostname) | sudo tee -a /etc/hosts cd ${{github.workspace}}/mllib-dal - # ./ci-build.sh - ./test.sh + ./ci-build.sh + # ./ci-test.sh diff --git a/dev/test-cluster/spark-defaults.conf b/dev/test-cluster/spark-defaults.conf index 850f52049..1c25bb2ec 100644 --- a/dev/test-cluster/spark-defaults.conf +++ b/dev/test-cluster/spark-defaults.conf @@ -28,7 +28,6 @@ spark.master yarn spark.serializer org.apache.spark.serializer.KryoSerializer -spark.driver.host 127.0.0.1 spark.driver.memory 3g spark.executor.num 2 spark.executor.cores 1 diff --git a/mllib-dal/ci-test.sh b/mllib-dal/ci-test.sh new file mode 100755 index 000000000..2b4aa8dde --- /dev/null +++ b/mllib-dal/ci-test.sh @@ -0,0 +1,50 @@ +#!/usr/bin/env bash + +# Check envs for building +if [[ -z $JAVA_HOME ]]; then + echo $JAVA_HOME not defined! + exit 1 +fi + +if [[ -z $DAALROOT ]]; then + echo DAALROOT not defined! + exit 1 +fi + +if [[ -z $TBBROOT ]]; then + echo TBBROOT not defined! + exit 1 +fi + +if [[ -z $CCL_ROOT ]]; then + echo CCL_ROOT not defined! + exit 1 +fi + +if [[ -z $I_MPI_ROOT ]]; then + echo I_MPI_ROOT not defined! 
+ exit 1 +fi + +echo === Testing Environments === +echo JAVA_HOME=$JAVA_HOME +echo DAALROOT=$DAALROOT +echo TBBROOT=$TBBROOT +echo CCL_ROOT=$CCL_ROOT +echo CCL_CONFIGURATION=$CCL_CONFIGURATION +echo MPI_ROOT=$I_MPI_ROOT +echo Clang Version: $(clang -dumpversion) +echo ============================= + +# Enable signal chaining support for JNI +export LD_PRELOAD=$JAVA_HOME/jre/lib/amd64/libjsig.so + +# -Dtest=none to turn off the Java tests + +# Test all +# mvn -Dtest=none -Dmaven.test.skip=false test + +# Individual test +mvn --no-transfer-progress -Dtest=none -DwildcardSuites=org.apache.spark.ml.clustering.IntelKMeansSuite test +# mvn --no-transfer-progress -Dtest=none -DwildcardSuites=org.apache.spark.ml.feature.IntelPCASuite test +# mvn -Dtest=none -DwildcardSuites=org.apache.spark.ml.recommendation.IntelALSSuite test From fd59a2dc7f6103109f144b3381eae14945b3bef3 Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Tue, 13 Apr 2021 12:00:44 +0800 Subject: [PATCH 35/62] Update --- .github/workflows/oap-mllib-ci.yml | 6 ++++-- mllib-dal/src/main/native/Makefile | 9 ++++----- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/.github/workflows/oap-mllib-ci.yml b/.github/workflows/oap-mllib-ci.yml index cdccb59d3..f3b9f5f83 100644 --- a/.github/workflows/oap-mllib-ci.yml +++ b/.github/workflows/oap-mllib-ci.yml @@ -40,6 +40,8 @@ jobs: echo 127.0.0.1 $(hostname) | sudo tee -a /etc/hosts cat /etc/hosts echo "---" + dpkg -l | grep libstdc++ + echo "---" cd ${{github.workspace}}/mllib-dal - ./ci-build.sh - # ./ci-test.sh + # ./ci-build.sh + ./ci-test.sh diff --git a/mllib-dal/src/main/native/Makefile b/mllib-dal/src/main/native/Makefile index 6fe438846..34da569ea 100644 --- a/mllib-dal/src/main/native/Makefile +++ b/mllib-dal/src/main/native/Makefile @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
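
One small bug to note in the ci-build.sh and ci-test.sh guards added above: `echo $JAVA_HOME not defined!` expands the variable, which is empty at that point, so the message prints as just " not defined!". The variable name should stay literal, matching the other guards in the same scripts:

    # As written, the empty variable is expanded away:
    #   echo $JAVA_HOME not defined!      prints  " not defined!"
    # Intended form, with the name kept literal:
    if [[ -z "${JAVA_HOME:-}" ]]; then
        echo "JAVA_HOME not defined!" >&2
        exit 1
    fi
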
-CC := clang -CXX := clang++ +CC := gcc +CXX := g++ RM := rm -rf CFLAGS := -g -Wall -Wno-deprecated-declarations -fPIC -std=c++11 @@ -24,15 +24,14 @@ CFLAGS := -g -Wall -Wno-deprecated-declarations -fPIC -std=c++11 INCS := -I $(JAVA_HOME)/include \ -I $(JAVA_HOME)/include/linux \ - -I ${CCL_ROOT}/include \ + -I ${CCL_ROOT}/include/cpu_icc \ -I $(DAALROOT)/include \ -I ./javah \ -I ./ # Use static link if possible, TBB is only available as dynamic libs -LIBS := -lstdc++ \ - -L${CCL_ROOT}/lib -lccl \ +LIBS := -L${CCL_ROOT}/lib/cpu_icc -lccl \ -L$(DAALROOT)/lib/intel64 -l:libdaal_core.a -l:libdaal_thread.a \ -L$(TBBROOT)/lib/intel64/gcc4.8 -ltbb -ltbbmalloc # TODO: Add signal chaining support, should fix linking, package so and loading From c05f52f543bfc74bb8da1f2e172eaee4f60c75b9 Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Tue, 13 Apr 2021 21:22:18 +0800 Subject: [PATCH 36/62] revert to build oneCCL from source and package related so --- .github/workflows/oap-mllib-ci.yml | 10 ++-------- dev/install-build-deps-centos.sh | 15 +++++++++++++-- dev/install-build-deps-ubuntu.sh | 14 ++++++++++++-- mllib-dal/build.sh | 7 ------- mllib-dal/pom.xml | 24 +++++------------------- mllib-dal/src/assembly/assembly.xml | 8 ++++---- mllib-dal/src/main/native/Makefile | 8 ++++---- mllib-dal/test.sh | 9 +-------- 8 files changed, 41 insertions(+), 54 deletions(-) diff --git a/.github/workflows/oap-mllib-ci.yml b/.github/workflows/oap-mllib-ci.yml index f3b9f5f83..fc42d5e95 100644 --- a/.github/workflows/oap-mllib-ci.yml +++ b/.github/workflows/oap-mllib-ci.yml @@ -34,14 +34,8 @@ jobs: - name: Build and Test run: | source /opt/intel/oneapi/setvars.sh - # Configure oneCCL to use cpu_icc libs - source /opt/intel/oneapi/ccl/latest/env/vars.sh --ccl-configuration=cpu_icc - echo "---" - echo 127.0.0.1 $(hostname) | sudo tee -a /etc/hosts - cat /etc/hosts - echo "---" - dpkg -l | grep libstdc++ - echo "---" + source /tmp/oneCCL/build/_install/env/setvars.sh + echo 127.0.0.1 $(hostname) | sudo tee -a /etc/hosts cd ${{github.workspace}}/mllib-dal # ./ci-build.sh ./ci-test.sh diff --git a/dev/install-build-deps-centos.sh b/dev/install-build-deps-centos.sh index b7b57683e..79f4545f2 100755 --- a/dev/install-build-deps-centos.sh +++ b/dev/install-build-deps-centos.sh @@ -12,8 +12,19 @@ gpgcheck=1 repo_gpgcheck=1 gpgkey=https://yum.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS-2023.PUB EOF - sudo mv /tmp/oneAPI.repo /etc/yum.repos.d - sudo yum install -y intel-oneapi-dpcpp-cpp-2021.2.0 intel-oneapi-dal-devel-2021.2.0 intel-oneapi-tbb-devel-2021.2.0 intel-oneapi-ccl-devel-2021.2.0 + sudo mv /tmp/oneAPI.repo /etc/yum.repos.d + # sudo yum groupinstall -y "Development Tools" + # sudo yum install -y cmake + sudo yum install -y intel-oneapi-dpcpp-cpp-2021.2.0 intel-oneapi-dal-devel-2021.2.0 intel-oneapi-tbb-devel-2021.2.0 else echo "oneAPI components already installed!" fi + +echo "Building oneCCL ..." +cd /tmp +git clone https://github.com/oneapi-src/oneCCL +cd oneCCL +git checkout 2021.2 +mkdir build && cd build +cmake .. +make -j 2 install \ No newline at end of file diff --git a/dev/install-build-deps-ubuntu.sh b/dev/install-build-deps-ubuntu.sh index 055f696db..95bbbb722 100755 --- a/dev/install-build-deps-ubuntu.sh +++ b/dev/install-build-deps-ubuntu.sh @@ -8,7 +8,17 @@ if [ ! 
-d /opt/intel/oneapi ]; then rm GPG-PUB-KEY-INTEL-SW-PRODUCTS-2023.PUB echo "deb https://apt.repos.intel.com/oneapi all main" | sudo tee /etc/apt/sources.list.d/oneAPI.list sudo apt-get update - sudo apt-get install -y intel-oneapi-dpcpp-cpp-2021.2.0 intel-oneapi-dal-devel-2021.2.0 intel-oneapi-tbb-devel-2021.2.0 intel-oneapi-ccl-devel-2021.2.0 + # sudo apt-get install -y build-essential cmake + sudo apt-get install -y intel-oneapi-dpcpp-cpp-2021.2.0 intel-oneapi-dal-devel-2021.2.0 intel-oneapi-tbb-devel-2021.2.0 else echo "oneAPI components already installed!" -fi +fi + +echo "Building oneCCL ..." +cd /tmp +git clone https://github.com/oneapi-src/oneCCL +cd oneCCL +git checkout 2021.2 +mkdir build && cd build +cmake .. +make -j 2 install diff --git a/mllib-dal/build.sh b/mllib-dal/build.sh index a66c26947..f55514094 100755 --- a/mllib-dal/build.sh +++ b/mllib-dal/build.sh @@ -21,18 +21,11 @@ if [[ -z $CCL_ROOT ]]; then exit 1 fi -if [[ -z $I_MPI_ROOT ]]; then - echo I_MPI_ROOT not defined! - exit 1 -fi - echo === Building Environments === echo JAVA_HOME=$JAVA_HOME echo DAALROOT=$DAALROOT echo TBBROOT=$TBBROOT echo CCL_ROOT=$CCL_ROOT -echo CCL_CONFIGURATION=$CCL_CONFIGURATION -echo MPI_ROOT=$I_MPI_ROOT echo Clang Version: $(clang -dumpversion) echo ============================= diff --git a/mllib-dal/pom.xml b/mllib-dal/pom.xml index 83b71d9d6..979c53d06 100644 --- a/mllib-dal/pom.xml +++ b/mllib-dal/pom.xml @@ -21,7 +21,7 @@ libtbb.so.12.2 libtbbmalloc.so.2.2 libJavaAPI.so.1.1 - libccl.so.1.0 + libccl.so libfabric.so.1 libmpi.so.12.0.0 @@ -223,25 +223,15 @@ ${project.build.testOutputDirectory}/lib - ${env.CCL_ROOT}/lib/cpu_icc + ${env.CCL_ROOT}/lib ${ccl.lib} - - - - ${env.I_MPI_ROOT}/lib/release_mt - ${ccl.mpi.lib} - - - - ${env.I_MPI_ROOT}/libfabric/lib - ${ccl.fabric.lib} - + - ${env.I_MPI_ROOT}/libfabric/lib/prov + ${env.CCL_ROOT}/lib/prov libsockets-fi.so @@ -277,11 +267,7 @@ - - ${project.build.testOutputDirectory}/lib/${ccl.lib} - ${project.build.testOutputDirectory}/lib/libccl.so - + rename to workaround. See https://github.com/oneapi-src/oneDAL/issues/1254 --> ${project.build.testOutputDirectory}/lib/${tbb.lib} ${project.build.testOutputDirectory}/lib/libtbb.so.2 diff --git a/mllib-dal/src/assembly/assembly.xml b/mllib-dal/src/assembly/assembly.xml index 50749db06..541c4f2bd 100644 --- a/mllib-dal/src/assembly/assembly.xml +++ b/mllib-dal/src/assembly/assembly.xml @@ -58,20 +58,20 @@ - ${env.I_MPI_ROOT}/libfabric/lib/${ccl.fabric.lib} + ${env.CCL_ROOT}/lib/${ccl.fabric.lib} lib - ${env.I_MPI_ROOT}/lib/release_mt/${ccl.mpi.lib} + ${env.CCL_ROOT}/lib/${ccl.mpi.lib} lib libmpi.so.12 - ${env.CCL_ROOT}/lib/cpu_icc/libccl.so + ${env.CCL_ROOT}/lib/libccl.so lib - ${env.I_MPI_ROOT}/libfabric/lib/prov/libsockets-fi.so + ${env.CCL_ROOT}/lib/prov/libsockets-fi.so lib diff --git a/mllib-dal/src/main/native/Makefile b/mllib-dal/src/main/native/Makefile index 34da569ea..e3a7e2161 100644 --- a/mllib-dal/src/main/native/Makefile +++ b/mllib-dal/src/main/native/Makefile @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
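
This patch restores the from-source oneCCL build; as the CI step shows, the artifacts land under build/_install, whose env/setvars.sh is then sourced. A slightly tightened sketch of the same step, pinning the tag with a shallow clone and making the install prefix explicit (the explicit prefix flag is an editorial assumption; the series relies on the default):

    # Build oneCCL 2021.2 from source (sketch of the step restored above).
    cd /tmp
    rm -rf oneCCL
    git clone --depth 1 --branch 2021.2 https://github.com/oneapi-src/oneCCL
    cd oneCCL
    mkdir -p build && cd build
    cmake -DCMAKE_INSTALL_PREFIX="$PWD/_install" ..
    make -j 2 install
    # Afterwards: source /tmp/oneCCL/build/_install/env/setvars.sh
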
-CC := gcc -CXX := g++ +CC := clang +CXX := clang++ RM := rm -rf CFLAGS := -g -Wall -Wno-deprecated-declarations -fPIC -std=c++11 @@ -24,14 +24,14 @@ CFLAGS := -g -Wall -Wno-deprecated-declarations -fPIC -std=c++11 INCS := -I $(JAVA_HOME)/include \ -I $(JAVA_HOME)/include/linux \ - -I ${CCL_ROOT}/include/cpu_icc \ + -I ${CCL_ROOT}/include \ -I $(DAALROOT)/include \ -I ./javah \ -I ./ # Use static link if possible, TBB is only available as dynamic libs -LIBS := -L${CCL_ROOT}/lib/cpu_icc -lccl \ +LIBS := -L${CCL_ROOT}/lib -lccl \ -L$(DAALROOT)/lib/intel64 -l:libdaal_core.a -l:libdaal_thread.a \ -L$(TBBROOT)/lib/intel64/gcc4.8 -ltbb -ltbbmalloc # TODO: Add signal chaining support, should fix linking, package so and loading diff --git a/mllib-dal/test.sh b/mllib-dal/test.sh index 4a37529a1..797849fb7 100755 --- a/mllib-dal/test.sh +++ b/mllib-dal/test.sh @@ -21,23 +21,16 @@ if [[ -z $CCL_ROOT ]]; then exit 1 fi -if [[ -z $I_MPI_ROOT ]]; then - echo I_MPI_ROOT not defined! - exit 1 -fi - echo === Testing Environments === echo JAVA_HOME=$JAVA_HOME echo DAALROOT=$DAALROOT echo TBBROOT=$TBBROOT echo CCL_ROOT=$CCL_ROOT -echo CCL_CONFIGURATION=$CCL_CONFIGURATION -echo MPI_ROOT=$I_MPI_ROOT echo Clang Version: $(clang -dumpversion) echo ============================= # Enable signal chaining support for JNI -export LD_PRELOAD=$JAVA_HOME/jre/lib/amd64/libjsig.so +# export LD_PRELOAD=$JAVA_HOME/jre/lib/amd64/libjsig.so # -Dtest=none to turn off the Java tests From e35c0d21f6d1accc93071ec0f1c160b6ad34f950 Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Tue, 13 Apr 2021 21:47:40 +0800 Subject: [PATCH 37/62] nit --- .github/workflows/oap-mllib-ci.yml | 6 +++--- mllib-dal/ci-test.sh | 6 ++---- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/.github/workflows/oap-mllib-ci.yml b/.github/workflows/oap-mllib-ci.yml index fc42d5e95..183e6b737 100644 --- a/.github/workflows/oap-mllib-ci.yml +++ b/.github/workflows/oap-mllib-ci.yml @@ -16,10 +16,10 @@ jobs: uses: actions/cache@v2 with: path: | - /var/cache/apt + /var/cache/apt/archives/*.deb ~/.m2/repository /opt/intel/oneapi - ~/downloads + ~/downloads key: ${{ runner.os }}-${{ hashFiles('**/pom.xml', '{{github.workspace}}/dev/install-build-deps-ubuntu.sh') }} restore-keys: | ${{ runner.os }}- @@ -32,7 +32,7 @@ jobs: export SPARK_HOME=~/spark-3.0.0-bin-hadoop2.7 ${{github.workspace}}/dev/install-build-deps-ubuntu.sh - name: Build and Test - run: | + run: | source /opt/intel/oneapi/setvars.sh source /tmp/oneCCL/build/_install/env/setvars.sh echo 127.0.0.1 $(hostname) | sudo tee -a /etc/hosts diff --git a/mllib-dal/ci-test.sh b/mllib-dal/ci-test.sh index 2b4aa8dde..30558e727 100755 --- a/mllib-dal/ci-test.sh +++ b/mllib-dal/ci-test.sh @@ -31,13 +31,11 @@ echo JAVA_HOME=$JAVA_HOME echo DAALROOT=$DAALROOT echo TBBROOT=$TBBROOT echo CCL_ROOT=$CCL_ROOT -echo CCL_CONFIGURATION=$CCL_CONFIGURATION -echo MPI_ROOT=$I_MPI_ROOT echo Clang Version: $(clang -dumpversion) echo ============================= # Enable signal chaining support for JNI -export LD_PRELOAD=$JAVA_HOME/jre/lib/amd64/libjsig.so +# export LD_PRELOAD=$JAVA_HOME/jre/lib/amd64/libjsig.so # -Dtest=none to turn off the Java tests @@ -46,5 +44,5 @@ export LD_PRELOAD=$JAVA_HOME/jre/lib/amd64/libjsig.so # Individual test mvn --no-transfer-progress -Dtest=none -DwildcardSuites=org.apache.spark.ml.clustering.IntelKMeansSuite test -# mvn --no-transfer-progress -Dtest=none -DwildcardSuites=org.apache.spark.ml.feature.IntelPCASuite test +mvn --no-transfer-progress -Dtest=none 
-DwildcardSuites=org.apache.spark.ml.feature.IntelPCASuite test # mvn -Dtest=none -DwildcardSuites=org.apache.spark.ml.recommendation.IntelALSSuite test From 365b034b7f867e0a087aafbcd975496ae6a6909c Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Tue, 13 Apr 2021 21:48:56 +0800 Subject: [PATCH 38/62] nit --- mllib-dal/ci-build.sh | 6 ------ mllib-dal/ci-test.sh | 5 ----- 2 files changed, 11 deletions(-) diff --git a/mllib-dal/ci-build.sh b/mllib-dal/ci-build.sh index dd74fb3f6..381867bff 100755 --- a/mllib-dal/ci-build.sh +++ b/mllib-dal/ci-build.sh @@ -21,17 +21,11 @@ if [[ -z $CCL_ROOT ]]; then exit 1 fi -if [[ -z $I_MPI_ROOT ]]; then - echo I_MPI_ROOT not defined! - exit 1 -fi - echo === Building Environments === echo JAVA_HOME=$JAVA_HOME echo DAALROOT=$DAALROOT echo TBBROOT=$TBBROOT echo CCL_ROOT=$CCL_ROOT -echo MPI_ROOT=$I_MPI_ROOT echo Clang Version: $(clang -dumpversion) echo ============================= diff --git a/mllib-dal/ci-test.sh b/mllib-dal/ci-test.sh index 30558e727..95e09a54d 100755 --- a/mllib-dal/ci-test.sh +++ b/mllib-dal/ci-test.sh @@ -21,11 +21,6 @@ if [[ -z $CCL_ROOT ]]; then exit 1 fi -if [[ -z $I_MPI_ROOT ]]; then - echo I_MPI_ROOT not defined! - exit 1 -fi - echo === Testing Environments === echo JAVA_HOME=$JAVA_HOME echo DAALROOT=$DAALROOT From 8fd68a24d614e42eafcaa089cc5ff2ef793a9c0b Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Tue, 13 Apr 2021 22:06:12 +0800 Subject: [PATCH 39/62] Add ci-test-cluster --- .github/workflows/oap-mllib-ci.yml | 8 ++++---- {mllib-dal => dev}/ci-build.sh | 1 + {mllib-dal => dev}/ci-test.sh | 5 +++++ dev/test-cluster/{test-cluster.sh => ci-test-cluster.sh} | 0 dev/test-cluster/env.sh | 2 +- 5 files changed, 11 insertions(+), 5 deletions(-) rename {mllib-dal => dev}/ci-build.sh (94%) rename {mllib-dal => dev}/ci-test.sh (90%) rename dev/test-cluster/{test-cluster.sh => ci-test-cluster.sh} (100%) diff --git a/.github/workflows/oap-mllib-ci.yml b/.github/workflows/oap-mllib-ci.yml index 183e6b737..a7044b1ef 100644 --- a/.github/workflows/oap-mllib-ci.yml +++ b/.github/workflows/oap-mllib-ci.yml @@ -35,7 +35,7 @@ jobs: run: | source /opt/intel/oneapi/setvars.sh source /tmp/oneCCL/build/_install/env/setvars.sh - echo 127.0.0.1 $(hostname) | sudo tee -a /etc/hosts - cd ${{github.workspace}}/mllib-dal - # ./ci-build.sh - ./ci-test.sh + echo 127.0.0.1 $(hostname) | sudo tee -a /etc/hosts + ${{github.workspace}}/dev/ci-build.sh + ${{github.workspace}}/dev/ci-test.sh + diff --git a/mllib-dal/ci-build.sh b/dev/ci-build.sh similarity index 94% rename from mllib-dal/ci-build.sh rename to dev/ci-build.sh index 381867bff..5e67d25b2 100755 --- a/mllib-dal/ci-build.sh +++ b/dev/ci-build.sh @@ -29,4 +29,5 @@ echo CCL_ROOT=$CCL_ROOT echo Clang Version: $(clang -dumpversion) echo ============================= +cd $GITHUB_WORKSPACE/mllib-dal mvn --no-transfer-progress -DskipTests clean package diff --git a/mllib-dal/ci-test.sh b/dev/ci-test.sh similarity index 90% rename from mllib-dal/ci-test.sh rename to dev/ci-test.sh index 95e09a54d..c865dd842 100755 --- a/mllib-dal/ci-test.sh +++ b/dev/ci-test.sh @@ -29,6 +29,8 @@ echo CCL_ROOT=$CCL_ROOT echo Clang Version: $(clang -dumpversion) echo ============================= +cd $GITHUB_WORKSPACE/mllib-dal + # Enable signal chaining support for JNI # export LD_PRELOAD=$JAVA_HOME/jre/lib/amd64/libjsig.so @@ -41,3 +43,6 @@ echo ============================= mvn --no-transfer-progress -Dtest=none -DwildcardSuites=org.apache.spark.ml.clustering.IntelKMeansSuite test mvn --no-transfer-progress 
-Dtest=none -DwildcardSuites=org.apache.spark.ml.feature.IntelPCASuite test # mvn -Dtest=none -DwildcardSuites=org.apache.spark.ml.recommendation.IntelALSSuite test + +# Yarn cluster test +$GITHUB_WORKSPACE/dev/test-cluster/ci-test-cluster.sh \ No newline at end of file diff --git a/dev/test-cluster/test-cluster.sh b/dev/test-cluster/ci-test-cluster.sh similarity index 100% rename from dev/test-cluster/test-cluster.sh rename to dev/test-cluster/ci-test-cluster.sh diff --git a/dev/test-cluster/env.sh b/dev/test-cluster/env.sh index f2c2afa57..225db0b7b 100644 --- a/dev/test-cluster/env.sh +++ b/dev/test-cluster/env.sh @@ -13,7 +13,7 @@ export SPARK_HOME=$SPARK_HOME # Set HDFS Root, should be hdfs://xxx or file://xxx export HDFS_ROOT=hdfs://localhost:8020 # Set OAP MLlib source code root directory -export OAP_MLLIB_ROOT=/home/xiaochang/Works/oap-mllib-xwu99-refactor-examples +export OAP_MLLIB_ROOT=$GITHUB_WORKSPACE # ============================================= # From 6bc312eb3e02be537cd5b8702794ca08efd2d82b Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Tue, 13 Apr 2021 22:30:48 +0800 Subject: [PATCH 40/62] update --- .github/workflows/oap-mllib-ci.yml | 15 +++++++-------- dev/install-build-deps-centos.sh | 1 + dev/install-build-deps-ubuntu.sh | 1 + dev/test-cluster/ci-test-cluster.sh | 10 ++++++---- dev/test-cluster/setup-cluster.sh | 8 ++++---- dev/test-cluster/setup-python3-env.sh | 12 ------------ 6 files changed, 19 insertions(+), 28 deletions(-) delete mode 100755 dev/test-cluster/setup-python3-env.sh diff --git a/.github/workflows/oap-mllib-ci.yml b/.github/workflows/oap-mllib-ci.yml index a7044b1ef..4c1c42965 100644 --- a/.github/workflows/oap-mllib-ci.yml +++ b/.github/workflows/oap-mllib-ci.yml @@ -19,23 +19,22 @@ jobs: /var/cache/apt/archives/*.deb ~/.m2/repository /opt/intel/oneapi - ~/downloads + ~/opt key: ${{ runner.os }}-${{ hashFiles('**/pom.xml', '{{github.workspace}}/dev/install-build-deps-ubuntu.sh') }} restore-keys: | ${{ runner.os }}- - name: Set up dependencies run: | - [ -d ~/downloads ] || mkdir ~/downloads - cd ~/downloads + [ -d ~/opt ] || mkdir ~/opt + cd ~/opt [ -f spark-3.0.0-bin-hadoop2.7.tgz ] || wget --no-verbose http://archive.apache.org/dist/spark/spark-3.0.0/spark-3.0.0-bin-hadoop2.7.tgz - [ -d spark-3.0.0-bin-hadoop2.7 ] || cd ~ && tar -zxf downloads/spark-3.0.0-bin-hadoop2.7.tgz - export SPARK_HOME=~/spark-3.0.0-bin-hadoop2.7 + [ -d spark-3.0.0-bin-hadoop2.7 ] || tar -zxf spark-3.0.0-bin-hadoop2.7.tgz + export SPARK_HOME=~/opt/spark-3.0.0-bin-hadoop2.7 ${{github.workspace}}/dev/install-build-deps-ubuntu.sh - name: Build and Test run: | source /opt/intel/oneapi/setvars.sh source /tmp/oneCCL/build/_install/env/setvars.sh echo 127.0.0.1 $(hostname) | sudo tee -a /etc/hosts - ${{github.workspace}}/dev/ci-build.sh - ${{github.workspace}}/dev/ci-test.sh - + # ${{github.workspace}}/dev/ci-build.sh + ${{github.workspace}}/dev/ci-test.sh diff --git a/dev/install-build-deps-centos.sh b/dev/install-build-deps-centos.sh index 79f4545f2..5f8a6e5c8 100755 --- a/dev/install-build-deps-centos.sh +++ b/dev/install-build-deps-centos.sh @@ -22,6 +22,7 @@ fi echo "Building oneCCL ..." cd /tmp +rm -rf oneCCL git clone https://github.com/oneapi-src/oneCCL cd oneCCL git checkout 2021.2 diff --git a/dev/install-build-deps-ubuntu.sh b/dev/install-build-deps-ubuntu.sh index 95bbbb722..36fafc1f6 100755 --- a/dev/install-build-deps-ubuntu.sh +++ b/dev/install-build-deps-ubuntu.sh @@ -16,6 +16,7 @@ fi echo "Building oneCCL ..." 
cd /tmp +rm -rf oneCCL git clone https://github.com/oneapi-src/oneCCL cd oneCCL git checkout 2021.2 diff --git a/dev/test-cluster/ci-test-cluster.sh b/dev/test-cluster/ci-test-cluster.sh index 0320f38b2..50744d5a1 100755 --- a/dev/test-cluster/ci-test-cluster.sh +++ b/dev/test-cluster/ci-test-cluster.sh @@ -2,14 +2,16 @@ set -x -# Setup Python3 and Spark cluster +# Setup Password-less & Python3 cd $GITHUB_WORKSPACE/dev/test-cluster ./config-ssh.sh -./setup-cluster.sh -./setup-python3-env.sh +./setup-python3.sh + +# Setup Hadoop cluster and envs +source ./setup-cluster.sh # Build and run all examples -source $GITHUB_WORKSPACE/dev/test-cluster/env.sh +cp $GITHUB_WORKSPACE/dev/test-cluster/env.sh $GITHUB_WORKSPACE/conf cd $GITHUB_WORKSPACE/examples diff --git a/dev/test-cluster/setup-cluster.sh b/dev/test-cluster/setup-cluster.sh index 357912e77..e280d8ff7 100755 --- a/dev/test-cluster/setup-cluster.sh +++ b/dev/test-cluster/setup-cluster.sh @@ -8,10 +8,10 @@ echo JAVA_HOME is $JAVA_HOME mkdir ~/opt cd ~/opt -wget --no-verbose https://archive.apache.org/dist/spark/spark-3.0.0/spark-3.0.0-bin-hadoop2.7.tgz -tar -xzf spark-3.0.0-bin-hadoop2.7.tgz -wget --no-verbose https://archive.apache.org/dist/hadoop/core/hadoop-2.7.7/hadoop-2.7.7.tar.gz -tar -xzf hadoop-2.7.7.tar.gz +[ -f spark-3.0.0-bin-hadoop2.7.tgz ] || wget --no-verbose https://archive.apache.org/dist/spark/spark-3.0.0/spark-3.0.0-bin-hadoop2.7.tgz +[ -d spark-3.0.0-bin-hadoop2.7 ] || tar -xzf spark-3.0.0-bin-hadoop2.7.tgz +[ -f hadoop-2.7.7.tar.gz ] || wget --no-verbose https://archive.apache.org/dist/hadoop/core/hadoop-2.7.7/hadoop-2.7.7.tar.gz +[ -d hadoop-2.7.7 ] || tar -xzf hadoop-2.7.7.tar.gz cd $WORK_DIR diff --git a/dev/test-cluster/setup-python3-env.sh b/dev/test-cluster/setup-python3-env.sh deleted file mode 100755 index 29208dc5e..000000000 --- a/dev/test-cluster/setup-python3-env.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/usr/bin/env bash - -sudo apt-get update -sudo apt-get install python3-pip python3-setuptools python3-wheel - -pip3 install --user numpy - -echo python is in $(which python) -python --version - -echo python3 is in $(which python3) -python3 --version From e1ffd2a01abfea3aecf8236ee29196820bb75360 Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Tue, 13 Apr 2021 22:31:06 +0800 Subject: [PATCH 41/62] update --- dev/test-cluster/setup-python3.sh | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100755 dev/test-cluster/setup-python3.sh diff --git a/dev/test-cluster/setup-python3.sh b/dev/test-cluster/setup-python3.sh new file mode 100755 index 000000000..29208dc5e --- /dev/null +++ b/dev/test-cluster/setup-python3.sh @@ -0,0 +1,12 @@ +#!/usr/bin/env bash + +sudo apt-get update +sudo apt-get install python3-pip python3-setuptools python3-wheel + +pip3 install --user numpy + +echo python is in $(which python) +python --version + +echo python3 is in $(which python3) +python3 --version From 848b02bedb0bd7152c65ea4880670b34ff6c5b68 Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Tue, 13 Apr 2021 22:57:16 +0800 Subject: [PATCH 42/62] update --- .github/workflows/oap-mllib-ci.yml | 3 +-- dev/ci-test.sh | 2 ++ dev/test-cluster/ci-test-cluster.sh | 3 ++- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/.github/workflows/oap-mllib-ci.yml b/.github/workflows/oap-mllib-ci.yml index 4c1c42965..34cde10af 100644 --- a/.github/workflows/oap-mllib-ci.yml +++ b/.github/workflows/oap-mllib-ci.yml @@ -35,6 +35,5 @@ jobs: run: | source /opt/intel/oneapi/setvars.sh source 
/tmp/oneCCL/build/_install/env/setvars.sh - echo 127.0.0.1 $(hostname) | sudo tee -a /etc/hosts - # ${{github.workspace}}/dev/ci-build.sh + echo 127.0.0.1 $(hostname) | sudo tee -a /etc/hosts ${{github.workspace}}/dev/ci-test.sh diff --git a/dev/ci-test.sh b/dev/ci-test.sh index c865dd842..46ba2ba0d 100755 --- a/dev/ci-test.sh +++ b/dev/ci-test.sh @@ -31,6 +31,8 @@ echo ============================= cd $GITHUB_WORKSPACE/mllib-dal +$GITHUB_WORKSPACE/dev/ci-build.sh + # Enable signal chaining support for JNI # export LD_PRELOAD=$JAVA_HOME/jre/lib/amd64/libjsig.so diff --git a/dev/test-cluster/ci-test-cluster.sh b/dev/test-cluster/ci-test-cluster.sh index 50744d5a1..f376bec18 100755 --- a/dev/test-cluster/ci-test-cluster.sh +++ b/dev/test-cluster/ci-test-cluster.sh @@ -15,7 +15,8 @@ cp $GITHUB_WORKSPACE/dev/test-cluster/env.sh $GITHUB_WORKSPACE/conf cd $GITHUB_WORKSPACE/examples -hadoop fs -copyFromLocal data +hadoop fs -mkdir data +hadoop fs -copyFromLocal -f data hadoop fs -ls data ./build-all.sh From 37f6462606604e03c467cd74bcf4395886ec5f77 Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Wed, 14 Apr 2021 09:24:20 +0800 Subject: [PATCH 43/62] update --- dev/ci-test.sh | 1 + dev/test-cluster/ci-test-cluster.sh | 4 ++-- dev/test-cluster/setup-cluster.sh | 6 +++++- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/dev/ci-test.sh b/dev/ci-test.sh index 46ba2ba0d..490c358e2 100755 --- a/dev/ci-test.sh +++ b/dev/ci-test.sh @@ -31,6 +31,7 @@ echo ============================= cd $GITHUB_WORKSPACE/mllib-dal +# Build test $GITHUB_WORKSPACE/dev/ci-build.sh # Enable signal chaining support for JNI diff --git a/dev/test-cluster/ci-test-cluster.sh b/dev/test-cluster/ci-test-cluster.sh index f376bec18..6c919e2ac 100755 --- a/dev/test-cluster/ci-test-cluster.sh +++ b/dev/test-cluster/ci-test-cluster.sh @@ -15,8 +15,8 @@ cp $GITHUB_WORKSPACE/dev/test-cluster/env.sh $GITHUB_WORKSPACE/conf cd $GITHUB_WORKSPACE/examples -hadoop fs -mkdir data -hadoop fs -copyFromLocal -f data +hadoop fs -mkdir -p /user/$USER +hadoop fs -copyFromLocal data hadoop fs -ls data ./build-all.sh diff --git a/dev/test-cluster/setup-cluster.sh b/dev/test-cluster/setup-cluster.sh index e280d8ff7..75114d9c5 100755 --- a/dev/test-cluster/setup-cluster.sh +++ b/dev/test-cluster/setup-cluster.sh @@ -6,7 +6,7 @@ cd $WORK_DIR echo JAVA_HOME is $JAVA_HOME -mkdir ~/opt +[ -d ~/opt ] || mkdir ~/opt cd ~/opt [ -f spark-3.0.0-bin-hadoop2.7.tgz ] || wget --no-verbose https://archive.apache.org/dist/spark/spark-3.0.0/spark-3.0.0-bin-hadoop2.7.tgz [ -d spark-3.0.0-bin-hadoop2.7 ] || tar -xzf spark-3.0.0-bin-hadoop2.7.tgz @@ -30,7 +30,11 @@ mkdir -p /tmp/run/hdfs/datanode export HADOOP_HOME=~/opt/hadoop-2.7.7 export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$HADOOP_HOME/lib/native + export SPARK_HOME=~/opt/spark-3.0.0-bin-hadoop2.7 +export PYTHONPATH=$SPARK_HOME/python:$PYTHONPATH +export PYSPARK_PYTHON=python3 export PATH=$HADOOP_HOME/bin:$SPARK_HOME/bin:$PATH From c6dadf4fe8f6ff801113ec1518226608be59fdee Mon Sep 17 00:00:00 2001 From: "Wu, Xiaochang" Date: Wed, 14 Apr 2021 10:32:27 +0800 Subject: [PATCH 44/62] update --- .github/workflows/oap-mllib-ci.yml | 2 +- dev/ci-build.sh | 8 +++++++- dev/ci-test.sh | 8 +++++++- dev/test-cluster/ci-test-cluster.sh | 4 ++-- mllib-dal/build.sh | 8 +++++++- mllib-dal/test.sh | 8 +++++++- 6 files changed, 31 insertions(+), 7 deletions(-) diff --git a/.github/workflows/oap-mllib-ci.yml b/.github/workflows/oap-mllib-ci.yml index 34cde10af..a23487e8b 100644 --- 
a/.github/workflows/oap-mllib-ci.yml +++ b/.github/workflows/oap-mllib-ci.yml @@ -16,7 +16,7 @@ jobs: uses: actions/cache@v2 with: path: | - /var/cache/apt/archives/*.deb + # /var/cache/apt/archives/*.deb ~/.m2/repository /opt/intel/oneapi ~/opt diff --git a/dev/ci-build.sh b/dev/ci-build.sh index 5e67d25b2..f51b9067c 100755 --- a/dev/ci-build.sh +++ b/dev/ci-build.sh @@ -2,7 +2,12 @@ # Check envs for building if [[ -z $JAVA_HOME ]]; then - echo $JAVA_HOME not defined! + echo JAVA_HOME not defined! + exit 1 +fi + +if [[ -z $(which mvn) ]]; then + echo Maven not found! exit 1 fi @@ -26,6 +31,7 @@ echo JAVA_HOME=$JAVA_HOME echo DAALROOT=$DAALROOT echo TBBROOT=$TBBROOT echo CCL_ROOT=$CCL_ROOT +echo Maven Version: $(mvn -v | head -n 1 | cut -f3 -d" ") echo Clang Version: $(clang -dumpversion) echo ============================= diff --git a/dev/ci-test.sh b/dev/ci-test.sh index 490c358e2..5815db38c 100755 --- a/dev/ci-test.sh +++ b/dev/ci-test.sh @@ -2,7 +2,12 @@ # Check envs for building if [[ -z $JAVA_HOME ]]; then - echo $JAVA_HOME not defined! + echo JAVA_HOME not defined! + exit 1 +fi + +if [[ -z $(which mvn) ]]; then + echo Maven not found! exit 1 fi @@ -26,6 +31,7 @@ echo JAVA_HOME=$JAVA_HOME echo DAALROOT=$DAALROOT echo TBBROOT=$TBBROOT echo CCL_ROOT=$CCL_ROOT +echo Maven Version: $(mvn -v | head -n 1 | cut -f3 -d" ") echo Clang Version: $(clang -dumpversion) echo ============================= diff --git a/dev/test-cluster/ci-test-cluster.sh b/dev/test-cluster/ci-test-cluster.sh index 6c919e2ac..32b0519ab 100755 --- a/dev/test-cluster/ci-test-cluster.sh +++ b/dev/test-cluster/ci-test-cluster.sh @@ -20,5 +20,5 @@ hadoop fs -copyFromLocal data hadoop fs -ls data ./build-all.sh -./run-all-scala.sh -./run-all-pyspark.sh +# ./run-all-scala.sh +# ./run-all-pyspark.sh diff --git a/mllib-dal/build.sh b/mllib-dal/build.sh index f55514094..bd88c998c 100755 --- a/mllib-dal/build.sh +++ b/mllib-dal/build.sh @@ -2,7 +2,12 @@ # Check envs for building if [[ -z $JAVA_HOME ]]; then - echo $JAVA_HOME not defined! + echo JAVA_HOME not defined! + exit 1 +fi + +if [[ -z $(which mvn) ]]; then + echo Maven not found! exit 1 fi @@ -26,6 +31,7 @@ echo JAVA_HOME=$JAVA_HOME echo DAALROOT=$DAALROOT echo TBBROOT=$TBBROOT echo CCL_ROOT=$CCL_ROOT +echo Maven Version: $(mvn -v | head -n 1 | cut -f3 -d" ") echo Clang Version: $(clang -dumpversion) echo ============================= diff --git a/mllib-dal/test.sh b/mllib-dal/test.sh index 797849fb7..befd66495 100755 --- a/mllib-dal/test.sh +++ b/mllib-dal/test.sh @@ -2,7 +2,12 @@ # Check envs for building if [[ -z $JAVA_HOME ]]; then - echo $JAVA_HOME not defined! + echo JAVA_HOME not defined! + exit 1 +fi + +if [[ -z $(which mvn) ]]; then + echo Maven not found! 
exit 1 fi @@ -26,6 +31,7 @@ echo JAVA_HOME=$JAVA_HOME echo DAALROOT=$DAALROOT echo TBBROOT=$TBBROOT echo CCL_ROOT=$CCL_ROOT +echo Maven Version: $(mvn -v | head -n 1 | cut -f3 -d" ") echo Clang Version: $(clang -dumpversion) echo ============================= From 7838283635a0946901bdd3e835b1ff977ea1dc84 Mon Sep 17 00:00:00 2001 From: Xiaochang Wu Date: Wed, 14 Apr 2021 15:58:09 +0800 Subject: [PATCH 45/62] Add check: OneCCL doesn't support loopback IP --- .../src/main/scala/org/apache/spark/ml/util/Utils.scala | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/mllib-dal/src/main/scala/org/apache/spark/ml/util/Utils.scala b/mllib-dal/src/main/scala/org/apache/spark/ml/util/Utils.scala index aa8eb8979..0dd43d24f 100644 --- a/mllib-dal/src/main/scala/org/apache/spark/ml/util/Utils.scala +++ b/mllib-dal/src/main/scala/org/apache/spark/ml/util/Utils.scala @@ -72,6 +72,12 @@ object Utils { } def checkExecutorAvailPort(data: RDD[_], localIP: String) : Int = { + + if (localIP == "127.0.0.1" || localIP == "127.0.1.1") { + println(s"\nOneCCL: Error: doesn't support loopback IP ${localIP}, please assign IP address to your host.\n") + System.exit(-1) + } + val sc = data.sparkContext val result = data.mapPartitions { p => LibLoader.loadLibraries() From ec64f9e8f9a4192abbd3b6563d54d3dbcd6456c0 Mon Sep 17 00:00:00 2001 From: Xiaochang Wu Date: Wed, 14 Apr 2021 17:50:10 +0800 Subject: [PATCH 46/62] update --- .github/workflows/oap-mllib-ci.yml | 4 ++-- dev/test-cluster/ci-test-cluster.sh | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/oap-mllib-ci.yml b/.github/workflows/oap-mllib-ci.yml index a23487e8b..d4147c930 100644 --- a/.github/workflows/oap-mllib-ci.yml +++ b/.github/workflows/oap-mllib-ci.yml @@ -32,8 +32,8 @@ jobs: export SPARK_HOME=~/opt/spark-3.0.0-bin-hadoop2.7 ${{github.workspace}}/dev/install-build-deps-ubuntu.sh - name: Build and Test - run: | + run: | source /opt/intel/oneapi/setvars.sh source /tmp/oneCCL/build/_install/env/setvars.sh - echo 127.0.0.1 $(hostname) | sudo tee -a /etc/hosts + echo "$(hostname -i) $(hostname)" | sudo tee -a /etc/hosts ${{github.workspace}}/dev/ci-test.sh diff --git a/dev/test-cluster/ci-test-cluster.sh b/dev/test-cluster/ci-test-cluster.sh index 32b0519ab..6c919e2ac 100755 --- a/dev/test-cluster/ci-test-cluster.sh +++ b/dev/test-cluster/ci-test-cluster.sh @@ -20,5 +20,5 @@ hadoop fs -copyFromLocal data hadoop fs -ls data ./build-all.sh -# ./run-all-scala.sh -# ./run-all-pyspark.sh +./run-all-scala.sh +./run-all-pyspark.sh From 63855342f69d9590e1cb62ed8309f18b346df9fe Mon Sep 17 00:00:00 2001 From: Xiaochang Wu Date: Wed, 14 Apr 2021 18:24:49 +0800 Subject: [PATCH 47/62] update --- .github/workflows/oap-mllib-ci.yml | 8 ++------ dev/test-cluster/ci-test-cluster.sh | 2 +- dev/test-cluster/setup-cluster.sh | 3 +++ 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/.github/workflows/oap-mllib-ci.yml b/.github/workflows/oap-mllib-ci.yml index d4147c930..e9e8e512a 100644 --- a/.github/workflows/oap-mllib-ci.yml +++ b/.github/workflows/oap-mllib-ci.yml @@ -24,13 +24,9 @@ jobs: restore-keys: | ${{ runner.os }}- - name: Set up dependencies - run: | - [ -d ~/opt ] || mkdir ~/opt - cd ~/opt - [ -f spark-3.0.0-bin-hadoop2.7.tgz ] || wget --no-verbose http://archive.apache.org/dist/spark/spark-3.0.0/spark-3.0.0-bin-hadoop2.7.tgz - [ -d spark-3.0.0-bin-hadoop2.7 ] || tar -zxf spark-3.0.0-bin-hadoop2.7.tgz - export SPARK_HOME=~/opt/spark-3.0.0-bin-hadoop2.7 + run: | 
${{github.workspace}}/dev/install-build-deps-ubuntu.sh + source ${{github.workspace}}/dev/test-cluster/setup-cluster.sh - name: Build and Test run: | source /opt/intel/oneapi/setvars.sh diff --git a/dev/test-cluster/ci-test-cluster.sh b/dev/test-cluster/ci-test-cluster.sh index 6c919e2ac..0ddac8d3c 100755 --- a/dev/test-cluster/ci-test-cluster.sh +++ b/dev/test-cluster/ci-test-cluster.sh @@ -8,7 +8,7 @@ cd $GITHUB_WORKSPACE/dev/test-cluster ./setup-python3.sh # Setup Hadoop cluster and envs -source ./setup-cluster.sh +# source ./setup-cluster.sh # Build and run all examples cp $GITHUB_WORKSPACE/dev/test-cluster/env.sh $GITHUB_WORKSPACE/conf diff --git a/dev/test-cluster/setup-cluster.sh b/dev/test-cluster/setup-cluster.sh index 75114d9c5..96a721ef5 100755 --- a/dev/test-cluster/setup-cluster.sh +++ b/dev/test-cluster/setup-cluster.sh @@ -38,6 +38,9 @@ export PYSPARK_PYTHON=python3 export PATH=$HADOOP_HOME/bin:$SPARK_HOME/bin:$PATH +echo $(hostname) > $HADOOP_HOME/etc/hadoop/slaves +echo $(hostname) > $SPARK_HOME/conf/slaves + # start hdfs and yarn $HADOOP_HOME/sbin/start-dfs.sh $HADOOP_HOME/sbin/start-yarn.sh From 30ab18bb47b09cb1a9a571c9df3fc016b34f253c Mon Sep 17 00:00:00 2001 From: Xiaochang Wu Date: Wed, 14 Apr 2021 18:34:29 +0800 Subject: [PATCH 48/62] update --- .github/workflows/oap-mllib-ci.yml | 4 ++-- dev/test-cluster/ci-test-cluster.sh | 10 ---------- 2 files changed, 2 insertions(+), 12 deletions(-) diff --git a/.github/workflows/oap-mllib-ci.yml b/.github/workflows/oap-mllib-ci.yml index e9e8e512a..ab64735da 100644 --- a/.github/workflows/oap-mllib-ci.yml +++ b/.github/workflows/oap-mllib-ci.yml @@ -26,9 +26,9 @@ jobs: - name: Set up dependencies run: | ${{github.workspace}}/dev/install-build-deps-ubuntu.sh - source ${{github.workspace}}/dev/test-cluster/setup-cluster.sh + source ${{github.workspace}}/dev/setup-all.sh - name: Build and Test - run: | + run: | source /opt/intel/oneapi/setvars.sh source /tmp/oneCCL/build/_install/env/setvars.sh echo "$(hostname -i) $(hostname)" | sudo tee -a /etc/hosts diff --git a/dev/test-cluster/ci-test-cluster.sh b/dev/test-cluster/ci-test-cluster.sh index 0ddac8d3c..a34c56472 100755 --- a/dev/test-cluster/ci-test-cluster.sh +++ b/dev/test-cluster/ci-test-cluster.sh @@ -1,15 +1,5 @@ #!/usr/bin/env bash -set -x - -# Setup Password-less & Python3 -cd $GITHUB_WORKSPACE/dev/test-cluster -./config-ssh.sh -./setup-python3.sh - -# Setup Hadoop cluster and envs -# source ./setup-cluster.sh - # Build and run all examples cp $GITHUB_WORKSPACE/dev/test-cluster/env.sh $GITHUB_WORKSPACE/conf From 736dd21790004576e70c64ca683e4c6ebc8ad359 Mon Sep 17 00:00:00 2001 From: Xiaochang Wu Date: Wed, 14 Apr 2021 18:36:37 +0800 Subject: [PATCH 49/62] update --- .github/workflows/oap-mllib-ci.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/oap-mllib-ci.yml b/.github/workflows/oap-mllib-ci.yml index ab64735da..a59caee99 100644 --- a/.github/workflows/oap-mllib-ci.yml +++ b/.github/workflows/oap-mllib-ci.yml @@ -30,6 +30,5 @@ jobs: - name: Build and Test run: | source /opt/intel/oneapi/setvars.sh - source /tmp/oneCCL/build/_install/env/setvars.sh - echo "$(hostname -i) $(hostname)" | sudo tee -a /etc/hosts - ${{github.workspace}}/dev/ci-test.sh + source /tmp/oneCCL/build/_install/env/setvars.sh + ${{github.workspace}}/dev/ci-test.sh From 5138cdf686fe6ef273933590c7482377eafefbb5 Mon Sep 17 00:00:00 2001 From: Xiaochang Wu Date: Wed, 14 Apr 2021 18:43:27 +0800 Subject: [PATCH 50/62] update --- 
.github/workflows/oap-mllib-ci.yml | 5 +---- dev/ci-build.sh | 3 +++ dev/ci-test.sh | 4 ++++ dev/setup-all.sh | 15 +++++++++++++++ 4 files changed, 23 insertions(+), 4 deletions(-) create mode 100755 dev/setup-all.sh diff --git a/.github/workflows/oap-mllib-ci.yml b/.github/workflows/oap-mllib-ci.yml index a59caee99..6528a4b98 100644 --- a/.github/workflows/oap-mllib-ci.yml +++ b/.github/workflows/oap-mllib-ci.yml @@ -23,12 +23,9 @@ jobs: key: ${{ runner.os }}-${{ hashFiles('**/pom.xml', '{{github.workspace}}/dev/install-build-deps-ubuntu.sh') }} restore-keys: | ${{ runner.os }}- - - name: Set up dependencies + - name: Set up environments run: | - ${{github.workspace}}/dev/install-build-deps-ubuntu.sh source ${{github.workspace}}/dev/setup-all.sh - name: Build and Test run: | - source /opt/intel/oneapi/setvars.sh - source /tmp/oneCCL/build/_install/env/setvars.sh ${{github.workspace}}/dev/ci-test.sh diff --git a/dev/ci-build.sh b/dev/ci-build.sh index f51b9067c..5e735f925 100755 --- a/dev/ci-build.sh +++ b/dev/ci-build.sh @@ -1,5 +1,8 @@ #!/usr/bin/env bash +source /opt/intel/oneapi/setvars.sh +source /tmp/oneCCL/build/_install/env/setvars.sh + # Check envs for building if [[ -z $JAVA_HOME ]]; then echo JAVA_HOME not defined! diff --git a/dev/ci-test.sh b/dev/ci-test.sh index 5815db38c..7ce665089 100755 --- a/dev/ci-test.sh +++ b/dev/ci-test.sh @@ -1,5 +1,9 @@ #!/usr/bin/env bash +# Setup building envs +source /opt/intel/oneapi/setvars.sh +source /tmp/oneCCL/build/_install/env/setvars.sh + # Check envs for building if [[ -z $JAVA_HOME ]]; then echo JAVA_HOME not defined! diff --git a/dev/setup-all.sh b/dev/setup-all.sh new file mode 100755 index 000000000..6fabc4b91 --- /dev/null +++ b/dev/setup-all.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash + +# Setup hosts +echo "$(hostname -i) $(hostname)" | sudo tee -a /etc/hosts + +# Install all building dependencies +${{github.workspace}}/dev/install-build-deps-ubuntu.sh + +# Setup password-less & Python3 +cd $GITHUB_WORKSPACE/dev/test-cluster +./config-ssh.sh +./setup-python3.sh + +# Setup Hadoop cluster and envs +source ./setup-cluster.sh \ No newline at end of file From 4e1b138e5397b36e4f75dabf861f5c954956ffda Mon Sep 17 00:00:00 2001 From: Xiaochang Wu Date: Wed, 14 Apr 2021 21:30:26 +0800 Subject: [PATCH 51/62] update --- dev/setup-all.sh | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/dev/setup-all.sh b/dev/setup-all.sh index 6fabc4b91..0188079d2 100755 --- a/dev/setup-all.sh +++ b/dev/setup-all.sh @@ -3,13 +3,12 @@ # Setup hosts echo "$(hostname -i) $(hostname)" | sudo tee -a /etc/hosts -# Install all building dependencies -${{github.workspace}}/dev/install-build-deps-ubuntu.sh +# Install dependencies for building +$GITHUB_WORKSPACE/dev/install-build-deps-ubuntu.sh -# Setup password-less & Python3 -cd $GITHUB_WORKSPACE/dev/test-cluster -./config-ssh.sh -./setup-python3.sh +# Setup password-less & python3 +$GITHUB_WORKSPACE/dev/test-cluster/config-ssh.sh +$GITHUB_WORKSPACE/dev/test-cluster/setup-python3.sh -# Setup Hadoop cluster and envs -source ./setup-cluster.sh \ No newline at end of file +# Setup cluster and envs +source $GITHUB_WORKSPACE/dev/test-cluster/setup-cluster.sh From 34823fdefdcd425037f868742430b8fbba460a84 Mon Sep 17 00:00:00 2001 From: Xiaochang Wu Date: Wed, 14 Apr 2021 22:17:38 +0800 Subject: [PATCH 52/62] update --- dev/ci-build.sh | 1 + dev/test-cluster/setup-cluster.sh | 17 ++++++++++------- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/dev/ci-build.sh b/dev/ci-build.sh 
index 5e735f925..6f70ea372 100755 --- a/dev/ci-build.sh +++ b/dev/ci-build.sh @@ -1,5 +1,6 @@ #!/usr/bin/env bash +# Setup building envs source /opt/intel/oneapi/setvars.sh source /tmp/oneCCL/build/_install/env/setvars.sh diff --git a/dev/test-cluster/setup-cluster.sh b/dev/test-cluster/setup-cluster.sh index 96a721ef5..a236e4251 100755 --- a/dev/test-cluster/setup-cluster.sh +++ b/dev/test-cluster/setup-cluster.sh @@ -15,19 +15,15 @@ cd ~/opt cd $WORK_DIR +sed -i "s/localhost/$(hostname)/g" core-site.xml +sed -i "s/localhost/$(hostname)/g" yarn-site.xml + cp ./core-site.xml ~/opt/hadoop-2.7.7/etc/hadoop/ cp ./hdfs-site.xml ~/opt/hadoop-2.7.7/etc/hadoop/ cp ./yarn-site.xml ~/opt/hadoop-2.7.7/etc/hadoop/ cp ./hadoop-env.sh ~/opt/hadoop-2.7.7/etc/hadoop/ cp ./spark-defaults.conf ~/opt/spark-3.0.0-bin-hadoop2.7/conf -# create directories -mkdir -p /tmp/run/hdfs/namenode -mkdir -p /tmp/run/hdfs/datanode - -# hdfs format -~/opt/hadoop-2.7.7/bin/hdfs namenode -format - export HADOOP_HOME=~/opt/hadoop-2.7.7 export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$HADOOP_HOME/lib/native @@ -41,6 +37,13 @@ export PATH=$HADOOP_HOME/bin:$SPARK_HOME/bin:$PATH echo $(hostname) > $HADOOP_HOME/etc/hadoop/slaves echo $(hostname) > $SPARK_HOME/conf/slaves +# create directories +mkdir -p /tmp/run/hdfs/namenode +mkdir -p /tmp/run/hdfs/datanode + +# hdfs format +$HADOOP_HOME/bin/hdfs namenode -format + # start hdfs and yarn $HADOOP_HOME/sbin/start-dfs.sh $HADOOP_HOME/sbin/start-yarn.sh From e8fb1a484df99aaad85dab3b3686c3f7864c3b3b Mon Sep 17 00:00:00 2001 From: Xiaochang Wu Date: Wed, 14 Apr 2021 22:37:59 +0800 Subject: [PATCH 53/62] update --- .github/workflows/oap-mllib-ci.yml | 1 + dev/test-cluster/ci-test-cluster.sh | 2 ++ dev/test-cluster/setup-cluster.sh | 10 +--------- 3 files changed, 4 insertions(+), 9 deletions(-) diff --git a/.github/workflows/oap-mllib-ci.yml b/.github/workflows/oap-mllib-ci.yml index 6528a4b98..b1bccbc8a 100644 --- a/.github/workflows/oap-mllib-ci.yml +++ b/.github/workflows/oap-mllib-ci.yml @@ -28,4 +28,5 @@ jobs: source ${{github.workspace}}/dev/setup-all.sh - name: Build and Test run: | + source ${{github.workspace}}/dev/setup-spark-envs.sh ${{github.workspace}}/dev/ci-test.sh diff --git a/dev/test-cluster/ci-test-cluster.sh b/dev/test-cluster/ci-test-cluster.sh index a34c56472..7795d1ca3 100755 --- a/dev/test-cluster/ci-test-cluster.sh +++ b/dev/test-cluster/ci-test-cluster.sh @@ -1,5 +1,7 @@ #!/usr/bin/env bash +source $GITHUB_WORKSPACE/dev/test-cluster/setup-spark-envs.sh + # Build and run all examples cp $GITHUB_WORKSPACE/dev/test-cluster/env.sh $GITHUB_WORKSPACE/conf diff --git a/dev/test-cluster/setup-cluster.sh b/dev/test-cluster/setup-cluster.sh index a236e4251..14dc80767 100755 --- a/dev/test-cluster/setup-cluster.sh +++ b/dev/test-cluster/setup-cluster.sh @@ -24,15 +24,7 @@ cp ./yarn-site.xml ~/opt/hadoop-2.7.7/etc/hadoop/ cp ./hadoop-env.sh ~/opt/hadoop-2.7.7/etc/hadoop/ cp ./spark-defaults.conf ~/opt/spark-3.0.0-bin-hadoop2.7/conf -export HADOOP_HOME=~/opt/hadoop-2.7.7 -export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop -export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$HADOOP_HOME/lib/native - -export SPARK_HOME=~/opt/spark-3.0.0-bin-hadoop2.7 -export PYTHONPATH=$SPARK_HOME/python:$PYTHONPATH -export PYSPARK_PYTHON=python3 - -export PATH=$HADOOP_HOME/bin:$SPARK_HOME/bin:$PATH +source ./setup-spark-envs.sh echo $(hostname) > $HADOOP_HOME/etc/hadoop/slaves echo $(hostname) > $SPARK_HOME/conf/slaves From f83cdac294f1648fb7cca201bb0141f92c7ac5a9 Mon 
Sep 17 00:00:00 2001 From: Xiaochang Wu Date: Wed, 14 Apr 2021 22:38:26 +0800 Subject: [PATCH 54/62] update --- dev/test-cluster/setup-spark-envs.sh | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100755 dev/test-cluster/setup-spark-envs.sh diff --git a/dev/test-cluster/setup-spark-envs.sh b/dev/test-cluster/setup-spark-envs.sh new file mode 100755 index 000000000..3819f6ee8 --- /dev/null +++ b/dev/test-cluster/setup-spark-envs.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env bash + +export HADOOP_HOME=~/opt/hadoop-2.7.7 +export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$HADOOP_HOME/lib/native + +export SPARK_HOME=~/opt/spark-3.0.0-bin-hadoop2.7 +export PYTHONPATH=$SPARK_HOME/python:$PYTHONPATH +export PYSPARK_PYTHON=python3 + +export PATH=$HADOOP_HOME/bin:$SPARK_HOME/bin:$PATH \ No newline at end of file From 4aec7e05ba99511346431547a81ee2e36aa3c2c7 Mon Sep 17 00:00:00 2001 From: Xiaochang Wu Date: Wed, 14 Apr 2021 23:00:18 +0800 Subject: [PATCH 55/62] update --- .github/workflows/oap-mllib-ci.yml | 3 +-- dev/install-build-deps-ubuntu.sh | 18 +++++++++--------- dev/setup-all.sh | 2 +- dev/test-cluster/ci-test-cluster.sh | 2 +- 4 files changed, 12 insertions(+), 13 deletions(-) diff --git a/.github/workflows/oap-mllib-ci.yml b/.github/workflows/oap-mllib-ci.yml index b1bccbc8a..4f567086f 100644 --- a/.github/workflows/oap-mllib-ci.yml +++ b/.github/workflows/oap-mllib-ci.yml @@ -27,6 +27,5 @@ jobs: run: | source ${{github.workspace}}/dev/setup-all.sh - name: Build and Test - run: | - source ${{github.workspace}}/dev/setup-spark-envs.sh + run: | ${{github.workspace}}/dev/ci-test.sh diff --git a/dev/install-build-deps-ubuntu.sh b/dev/install-build-deps-ubuntu.sh index 36fafc1f6..2ed3f87ce 100755 --- a/dev/install-build-deps-ubuntu.sh +++ b/dev/install-build-deps-ubuntu.sh @@ -14,12 +14,12 @@ else echo "oneAPI components already installed!" fi -echo "Building oneCCL ..." -cd /tmp -rm -rf oneCCL -git clone https://github.com/oneapi-src/oneCCL -cd oneCCL -git checkout 2021.2 -mkdir build && cd build -cmake .. -make -j 2 install +# echo "Building oneCCL ..." +# cd /tmp +# rm -rf oneCCL +# git clone https://github.com/oneapi-src/oneCCL +# cd oneCCL +# git checkout 2021.2 +# mkdir build && cd build +# cmake .. 
+# make -j 2 install diff --git a/dev/setup-all.sh b/dev/setup-all.sh index 0188079d2..c09e33dcf 100755 --- a/dev/setup-all.sh +++ b/dev/setup-all.sh @@ -8,7 +8,7 @@ $GITHUB_WORKSPACE/dev/install-build-deps-ubuntu.sh # Setup password-less & python3 $GITHUB_WORKSPACE/dev/test-cluster/config-ssh.sh -$GITHUB_WORKSPACE/dev/test-cluster/setup-python3.sh +# $GITHUB_WORKSPACE/dev/test-cluster/setup-python3.sh # Setup cluster and envs source $GITHUB_WORKSPACE/dev/test-cluster/setup-cluster.sh diff --git a/dev/test-cluster/ci-test-cluster.sh b/dev/test-cluster/ci-test-cluster.sh index 7795d1ca3..dfa0475fc 100755 --- a/dev/test-cluster/ci-test-cluster.sh +++ b/dev/test-cluster/ci-test-cluster.sh @@ -13,4 +13,4 @@ hadoop fs -ls data ./build-all.sh ./run-all-scala.sh -./run-all-pyspark.sh +# ./run-all-pyspark.sh From 49f914043c237ab2aeeb28da9a86412678aaa14e Mon Sep 17 00:00:00 2001 From: Xiaochang Wu Date: Wed, 14 Apr 2021 23:05:18 +0800 Subject: [PATCH 56/62] update --- .github/workflows/oap-mllib-ci.yml | 3 ++- dev/install-build-deps-ubuntu.sh | 18 +++++++++--------- dev/setup-all.sh | 2 +- dev/test-cluster/ci-test-cluster.sh | 2 +- 4 files changed, 13 insertions(+), 12 deletions(-) diff --git a/.github/workflows/oap-mllib-ci.yml b/.github/workflows/oap-mllib-ci.yml index 4f567086f..7c76c74d6 100644 --- a/.github/workflows/oap-mllib-ci.yml +++ b/.github/workflows/oap-mllib-ci.yml @@ -28,4 +28,5 @@ jobs: source ${{github.workspace}}/dev/setup-all.sh - name: Build and Test run: | - ${{github.workspace}}/dev/ci-test.sh + # ${{github.workspace}}/dev/ci-test.sh + ${{github.workspace}}/dev/test-cluster/ci-test-cluster.sh diff --git a/dev/install-build-deps-ubuntu.sh b/dev/install-build-deps-ubuntu.sh index 2ed3f87ce..36fafc1f6 100755 --- a/dev/install-build-deps-ubuntu.sh +++ b/dev/install-build-deps-ubuntu.sh @@ -14,12 +14,12 @@ else echo "oneAPI components already installed!" fi -# echo "Building oneCCL ..." -# cd /tmp -# rm -rf oneCCL -# git clone https://github.com/oneapi-src/oneCCL -# cd oneCCL -# git checkout 2021.2 -# mkdir build && cd build -# cmake .. -# make -j 2 install +echo "Building oneCCL ..." +cd /tmp +rm -rf oneCCL +git clone https://github.com/oneapi-src/oneCCL +cd oneCCL +git checkout 2021.2 +mkdir build && cd build +cmake .. 
+make -j 2 install diff --git a/dev/setup-all.sh b/dev/setup-all.sh index c09e33dcf..0188079d2 100755 --- a/dev/setup-all.sh +++ b/dev/setup-all.sh @@ -8,7 +8,7 @@ $GITHUB_WORKSPACE/dev/install-build-deps-ubuntu.sh # Setup password-less & python3 $GITHUB_WORKSPACE/dev/test-cluster/config-ssh.sh -# $GITHUB_WORKSPACE/dev/test-cluster/setup-python3.sh +$GITHUB_WORKSPACE/dev/test-cluster/setup-python3.sh # Setup cluster and envs source $GITHUB_WORKSPACE/dev/test-cluster/setup-cluster.sh diff --git a/dev/test-cluster/ci-test-cluster.sh b/dev/test-cluster/ci-test-cluster.sh index dfa0475fc..7795d1ca3 100755 --- a/dev/test-cluster/ci-test-cluster.sh +++ b/dev/test-cluster/ci-test-cluster.sh @@ -13,4 +13,4 @@ hadoop fs -ls data ./build-all.sh ./run-all-scala.sh -# ./run-all-pyspark.sh +./run-all-pyspark.sh From 2a84d94f8832376ea7bf69d3bea0f79054daf8f1 Mon Sep 17 00:00:00 2001 From: Xiaochang Wu Date: Wed, 14 Apr 2021 23:21:48 +0800 Subject: [PATCH 57/62] update README --- README.md | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 477c74b1f..6ae7082ac 100644 --- a/README.md +++ b/README.md @@ -65,14 +65,6 @@ To use K-means example for sanity check, you need to upload a data file to your $ ./run.sh ``` -### Benchmark with HiBench -Use [Hibench](https://github.com/Intel-bigdata/HiBench) to generate dataset with various profiles, and change related variables in `run-XXX.sh` script when applicable. Then run the following commands: -``` - $ cd oap-mllib/examples/kmeans-hibench - $ ./build.sh - $ ./run-hibench-oap-mllib.sh -``` - ### PySpark Support As PySpark-based applications call their Scala couterparts, they shall be supported out-of-box. An example can be found in the [Examples](#examples) section. @@ -95,7 +87,7 @@ Intel® oneAPI Toolkits and its components can be downloaded and install from [h More details about oneAPI can be found [here](https://software.intel.com/content/www/us/en/develop/tools/oneapi.html). -You can also refer to [this script and comments in it](https://github.com/Intel-bigdata/OAP/blob/branch-1.0-spark-3.x/oap-mllib/dev/install-build-deps-centos.sh) to install correct oneAPI version and manually setup the environments. +You can also refer to [this script and comments in it](dev/install-build-deps-centos.sh) to install correct oneAPI version and manually setup the environments. Scala and Java dependency descriptions are already included in Maven POM file. @@ -107,7 +99,7 @@ To clone and build from open source oneCCL, run the following commands: ``` $ git clone https://github.com/oneapi-src/oneCCL $ cd oneCCL - $ git checkout beta08 + $ git checkout 2021.2 $ mkdir build && cd build $ cmake .. 
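    # Note (sketch of what happens next): with oneCCL's default install prefix,
    # `make install` below places the files under build/_install, so the matching
    # environment script to source afterwards is build/_install/env/setvars.sh --
    # the same path the CI scripts in this repo source, e.g.
    # /tmp/oneCCL/build/_install/env/setvars.sh in dev/ci-build.sh.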
$ make -j install @@ -138,7 +130,7 @@ CCL_ROOT | Path to oneCCL home directory We suggest you to source `setvars.sh` script into current shell to setup building environments as following: ``` - $ source /opt/intel/inteloneapi/setvars.sh + $ source /opt/intel/oneapi/setvars.sh $ source /your/oneCCL_source_code/build/_install/env/setvars.sh ``` @@ -156,12 +148,17 @@ The built JAR package will be placed in `target` directory with the name `oap-ml ## Examples -Example | Description +Example | Description ----------------|--------------------------- kmeans | K-means example for Scala kmeans-pyspark | K-means example for PySpark -kmeans-hibench | Use HiBench-generated input dataset to benchmark K-means performance +pca | PCA example for Scala +pca-pyspark | PCA example for PySpark +als | ALS example for Scala +als-pyspark | ALS example for PySpark ## List of Accelerated Algorithms * K-Means (CPU, Experimental) +* PCA (CPU, Experimental) +* ALS (CPU, Experimental) From 416c547db823700bbacddcd90352a5f7152b69da Mon Sep 17 00:00:00 2001 From: Xiaochang Wu Date: Wed, 14 Apr 2021 23:28:57 +0800 Subject: [PATCH 58/62] update --- .github/workflows/oap-mllib-ci.yml | 3 +-- dev/test-cluster/setup-cluster.sh | 8 ++++---- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/.github/workflows/oap-mllib-ci.yml b/.github/workflows/oap-mllib-ci.yml index 7c76c74d6..4f567086f 100644 --- a/.github/workflows/oap-mllib-ci.yml +++ b/.github/workflows/oap-mllib-ci.yml @@ -28,5 +28,4 @@ jobs: source ${{github.workspace}}/dev/setup-all.sh - name: Build and Test run: | - # ${{github.workspace}}/dev/ci-test.sh - ${{github.workspace}}/dev/test-cluster/ci-test-cluster.sh + ${{github.workspace}}/dev/ci-test.sh diff --git a/dev/test-cluster/setup-cluster.sh b/dev/test-cluster/setup-cluster.sh index 14dc80767..92ed170b0 100755 --- a/dev/test-cluster/setup-cluster.sh +++ b/dev/test-cluster/setup-cluster.sh @@ -15,8 +15,8 @@ cd ~/opt cd $WORK_DIR -sed -i "s/localhost/$(hostname)/g" core-site.xml -sed -i "s/localhost/$(hostname)/g" yarn-site.xml +sed -i "s/localhost/$(hostname -i)/g" core-site.xml +sed -i "s/localhost/$(hostname -i)/g" yarn-site.xml cp ./core-site.xml ~/opt/hadoop-2.7.7/etc/hadoop/ cp ./hdfs-site.xml ~/opt/hadoop-2.7.7/etc/hadoop/ @@ -26,8 +26,8 @@ cp ./spark-defaults.conf ~/opt/spark-3.0.0-bin-hadoop2.7/conf source ./setup-spark-envs.sh -echo $(hostname) > $HADOOP_HOME/etc/hadoop/slaves -echo $(hostname) > $SPARK_HOME/conf/slaves +echo $(hostname -i) > $HADOOP_HOME/etc/hadoop/slaves +echo $(hostname -i) > $SPARK_HOME/conf/slaves # create directories mkdir -p /tmp/run/hdfs/namenode From 041e87c6be0b1867a5498d1a1f4f6360f2134807 Mon Sep 17 00:00:00 2001 From: Xiaochang Wu Date: Wed, 14 Apr 2021 23:35:41 +0800 Subject: [PATCH 59/62] update --- dev/test-cluster/setup-cluster.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dev/test-cluster/setup-cluster.sh b/dev/test-cluster/setup-cluster.sh index 92ed170b0..fa3dad189 100755 --- a/dev/test-cluster/setup-cluster.sh +++ b/dev/test-cluster/setup-cluster.sh @@ -15,8 +15,8 @@ cd ~/opt cd $WORK_DIR -sed -i "s/localhost/$(hostname -i)/g" core-site.xml -sed -i "s/localhost/$(hostname -i)/g" yarn-site.xml +# sed -i "s/localhost/$(hostname -i)/g" core-site.xml +# sed -i "s/localhost/$(hostname -i)/g" yarn-site.xml cp ./core-site.xml ~/opt/hadoop-2.7.7/etc/hadoop/ cp ./hdfs-site.xml ~/opt/hadoop-2.7.7/etc/hadoop/ @@ -26,8 +26,8 @@ cp ./spark-defaults.conf ~/opt/spark-3.0.0-bin-hadoop2.7/conf source ./setup-spark-envs.sh -echo $(hostname -i) 
> $HADOOP_HOME/etc/hadoop/slaves
-echo $(hostname -i) > $SPARK_HOME/conf/slaves
+# echo $(hostname -i) > $HADOOP_HOME/etc/hadoop/slaves
+# echo $(hostname -i) > $SPARK_HOME/conf/slaves

# create directories
mkdir -p /tmp/run/hdfs/namenode

From 108227acd24fbdabb6cd7d3f2f94f3552475b3c5 Mon Sep 17 00:00:00 2001
From: Xiaochang Wu
Date: Wed, 14 Apr 2021 23:38:34 +0800
Subject: [PATCH 60/62] update

---
 dev/setup-all.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/dev/setup-all.sh b/dev/setup-all.sh
index 0188079d2..b5757dcaa 100755
--- a/dev/setup-all.sh
+++ b/dev/setup-all.sh
@@ -1,7 +1,8 @@
#!/usr/bin/env bash

# Setup hosts
-echo "$(hostname -i) $(hostname)" | sudo tee -a /etc/hosts
+# echo "$(hostname -i) $(hostname)" | sudo tee -a /etc/hosts
+echo 127.0.0.1 $(hostname) | sudo tee -a /etc/hosts

# Install dependencies for building
$GITHUB_WORKSPACE/dev/install-build-deps-ubuntu.sh

From 6422f8f2954f30c7ad251f8415929529c7280136 Mon Sep 17 00:00:00 2001
From: Xiaochang Wu
Date: Thu, 15 Apr 2021 08:22:37 +0800
Subject: [PATCH 61/62] update

---
 dev/setup-all.sh | 4 +++-
 dev/test-cluster/setup-cluster.sh | 10 ++++++----
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/dev/setup-all.sh b/dev/setup-all.sh
index b5757dcaa..82276c59c 100755
--- a/dev/setup-all.sh
+++ b/dev/setup-all.sh
@@ -2,7 +2,9 @@

# Setup hosts
# echo "$(hostname -i) $(hostname)" | sudo tee -a /etc/hosts
-echo 127.0.0.1 $(hostname) | sudo tee -a /etc/hosts
+
+HOST_IP=$(hostname -I | cut -f2 -d" ")
+echo $HOST_IP $(hostname) | sudo tee -a /etc/hosts

# Install dependencies for building
$GITHUB_WORKSPACE/dev/install-build-deps-ubuntu.sh
diff --git a/dev/test-cluster/setup-cluster.sh b/dev/test-cluster/setup-cluster.sh
index fa3dad189..52191e05c 100755
--- a/dev/test-cluster/setup-cluster.sh
+++ b/dev/test-cluster/setup-cluster.sh
@@ -15,8 +15,10 @@ cd ~/opt

cd $WORK_DIR

-# sed -i "s/localhost/$(hostname -i)/g" core-site.xml
-# sed -i "s/localhost/$(hostname -i)/g" yarn-site.xml
+HOST_IP=$(hostname -I | cut -f2 -d" ")
+
+sed -i "s/localhost/$HOST_IP/g" core-site.xml
+sed -i "s/localhost/$HOST_IP/g" yarn-site.xml

cp ./core-site.xml ~/opt/hadoop-2.7.7/etc/hadoop/
cp ./hdfs-site.xml ~/opt/hadoop-2.7.7/etc/hadoop/
@@ -26,8 +28,8 @@ cp ./spark-defaults.conf ~/opt/spark-3.0.0-bin-hadoop2.7/conf

source ./setup-spark-envs.sh

-# echo $(hostname -i) > $HADOOP_HOME/etc/hadoop/slaves
-# echo $(hostname -i) > $SPARK_HOME/conf/slaves
+echo $HOST_IP > $HADOOP_HOME/etc/hadoop/slaves
+echo $HOST_IP > $SPARK_HOME/conf/slaves

# create directories
mkdir -p /tmp/run/hdfs/namenode

From 57b5f55e28b6a57a304555f71b43bd9373110408 Mon Sep 17 00:00:00 2001
From: Xiaochang Wu
Date: Thu, 15 Apr 2021 11:19:17 +0800
Subject: [PATCH 62/62] Update README and nit changes

---
 README.md | 42 ++++++++++++++++++++--------
 dev/setup-all.sh | 3 +-
 dev/test-cluster/ci-test-cluster.sh | 5 +++-
 dev/test-cluster/setup-cluster.sh | 1 +
 dev/test-cluster/spark-defaults.conf | 4 +--
 5 files changed, 39 insertions(+), 16 deletions(-)

diff --git a/README.md b/README.md
index 6ae7082ac..c9e003ba5 100644
--- a/README.md
+++ b/README.md
@@ -11,13 +11,13 @@ For those algorithms that are not accelerated by OAP MLlib, the original Spark M

## Online Documentation

-You can find the all the OAP MLlib documents on the [project web page](https://oap-project.github.io/oap-mllib/).
+You can find all the OAP MLlib documents on the [project web page](https://oap-project.github.io/oap-mllib).
## Getting Started

### Java/Scala Users Preferred

-Use a pre-built OAP MLlib JAR to get started. You can firstly download OAP package from [OAP-JARs-Tarball](https://github.com/Intel-bigdata/OAP/releases/download/v1.0.0-spark-3.0.0/oap-1.0.0-bin-spark-3.0.0.tar.gz) and extract this Tarball to get `oap-mllib-x.x.x-with-spark-x.x.x.jar` under `oap-1.0.0-bin-spark-3.0.0/jars`.
+Use a pre-built OAP MLlib JAR to get started. You can first download the OAP package from [OAP-JARs-Tarball](https://github.com/Intel-bigdata/OAP/releases/download/v1.1.0-spark-3.0.0/oap-1.1.0-bin-spark-3.0.0.tar.gz) and extract the tarball to get `oap-mllib-x.x.x-with-spark-x.x.x.jar` under `oap-1.1.0-bin-spark-3.0.0/jars`.

Then you can refer to the following [Running](#running) section to try out.

@@ -58,16 +58,31 @@ spark.executor.extraClassPath ./oap-mllib-x.x.x-with-spark-x.x.x.jar

### Sanity Check

-To use K-means example for sanity check, you need to upload a data file to your HDFS and change related variables in `run.sh` of kmeans example. Then run the following commands:
+#### Setup `env.sh`
```
+    $ cd conf
+    $ cp env.sh.template env.sh
+```
+Edit the related variables in the "`Minimum Settings`" section of `env.sh`.
+
+#### Upload example data files to HDFS
+```
+    $ cd examples
+    $ hadoop fs -mkdir -p /user/$USER
+    $ hadoop fs -copyFromLocal data
+    $ hadoop fs -ls data
+```
+#### Run K-means
+
+```
+    $ cd examples/kmeans
    $ ./build.sh
    $ ./run.sh
```

### PySpark Support

-As PySpark-based applications call their Scala couterparts, they shall be supported out-of-box. An example can be found in the [Examples](#examples) section.
+As PySpark-based applications call their Scala counterparts, they are supported out of the box. Examples can be found in the [Examples](#examples) section.

## Building

@@ -78,7 +93,8 @@ We use [Apache Maven](https://maven.apache.org/) to manage and build source code

* JDK 8.0+
* Apache Maven 3.6.2+
* GNU GCC 4.8.5+
-* Intel® oneAPI Toolkits 2021.1.1 Components:
+* Intel® oneAPI Toolkits 2021.2+ Components:
+  - DPC++/C++ Compiler (dpcpp/clang++)
  - Data Analytics Library (oneDAL)
  - Threading Building Blocks (oneTBB)
* [Open Source Intel® oneAPI Collective Communications Library (oneCCL)](https://github.com/oneapi-src/oneCCL)

Intel® oneAPI Toolkits and its components can be downloaded and install from [h

More details about oneAPI can be found [here](https://software.intel.com/content/www/us/en/develop/tools/oneapi.html).

-You can also refer to [this script and comments in it](dev/install-build-deps-centos.sh) to install correct oneAPI version and manually setup the environments.
+You can refer to [this script](dev/install-build-deps-centos.sh) to install the correct dependencies.

Scala and Java dependency descriptions are already included in Maven POM file.

@@ -136,7 +152,13 @@ CCL_ROOT | Path to oneCCL home directory

We suggest you to source `setvars.sh` script into current shell to setup building environments as following:
```
    $ source /opt/intel/oneapi/setvars.sh
    $ source /your/oneCCL_source_code/build/_install/env/setvars.sh
```

__Be noticed we are using our own built oneCCL instead, we should source oneCCL's `setvars.sh` to overwrite oneAPI one.__

+You can also refer to [this CI script](dev/ci-build.sh) to set up the build environment.
+
If you prefer to build your own open source [oneDAL](https://github.com/oneapi-src/oneDAL), [oneTBB](https://github.com/oneapi-src/oneTBB) versions rather than use the ones included in oneAPI Toolkits, you can refer to the related build instructions and manually source `setvars.sh` accordingly.
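Before kicking off the build, it can help to confirm that the sourced scripts actually exported what the build expects. The snippet below is only a minimal sketch in the spirit of the checks already performed by `build.sh` and `dev/ci-build.sh` (those scripts verify variables such as `JAVA_HOME` and `CCL_ROOT` and echo `DAALROOT` and `TBBROOT` before building):

```
# Sketch: verify the environment variables the build scripts rely on
for v in JAVA_HOME DAALROOT TBBROOT CCL_ROOT; do
    # ${!v} is bash indirect expansion: the value of the variable named by $v
    if [[ -z ${!v} ]]; then
        echo "$v not defined!"
        exit 1
    fi
    echo "$v=${!v}"
done
```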
To build, run the following commands:
```
-    $ cd oap-mllib/mllib-dal
+    $ cd mllib-dal
    $ ./build.sh
```

@@ -159,6 +177,8 @@ als-pyspark | ALS example for PySpark

## List of Accelerated Algorithms

-* K-Means (CPU, Experimental)
-* PCA (CPU, Experimental)
-* ALS (CPU, Experimental)
+Algorithm | Category | Maturity
+----------|----------|-------------
+K-Means | CPU | Experimental
+PCA | CPU | Experimental
+ALS | CPU | Experimental
diff --git a/dev/setup-all.sh b/dev/setup-all.sh
index 82276c59c..75defbe5a 100755
--- a/dev/setup-all.sh
+++ b/dev/setup-all.sh
@@ -1,8 +1,7 @@
#!/usr/bin/env bash

# Setup hosts
-# echo "$(hostname -i) $(hostname)" | sudo tee -a /etc/hosts
-
+# Use the second internal IP; using the first IP leads to SSH timeouts
HOST_IP=$(hostname -I | cut -f2 -d" ")
echo $HOST_IP $(hostname) | sudo tee -a /etc/hosts

diff --git a/dev/test-cluster/ci-test-cluster.sh b/dev/test-cluster/ci-test-cluster.sh
index 7795d1ca3..7a4600267 100755
--- a/dev/test-cluster/ci-test-cluster.sh
+++ b/dev/test-cluster/ci-test-cluster.sh
@@ -1,16 +1,19 @@
#!/usr/bin/env bash

+# Setup Spark envs
source $GITHUB_WORKSPACE/dev/test-cluster/setup-spark-envs.sh

-# Build and run all examples
+# Setup OAP MLlib envs
cp $GITHUB_WORKSPACE/dev/test-cluster/env.sh $GITHUB_WORKSPACE/conf

cd $GITHUB_WORKSPACE/examples

+# Copy example data to HDFS
hadoop fs -mkdir -p /user/$USER
hadoop fs -copyFromLocal data
hadoop fs -ls data

+# Build and run all examples
./build-all.sh
./run-all-scala.sh
./run-all-pyspark.sh
diff --git a/dev/test-cluster/setup-cluster.sh b/dev/test-cluster/setup-cluster.sh
index 52191e05c..b58c676ab 100755
--- a/dev/test-cluster/setup-cluster.sh
+++ b/dev/test-cluster/setup-cluster.sh
@@ -15,6 +15,7 @@ cd ~/opt

cd $WORK_DIR

+# Use the second internal IP; using the first IP leads to SSH timeouts
HOST_IP=$(hostname -I | cut -f2 -d" ")

sed -i "s/localhost/$HOST_IP/g" core-site.xml
diff --git a/dev/test-cluster/spark-defaults.conf b/dev/test-cluster/spark-defaults.conf
index 1c25bb2ec..05f1c31e3 100644
--- a/dev/test-cluster/spark-defaults.conf
+++ b/dev/test-cluster/spark-defaults.conf
@@ -28,7 +28,7 @@
spark.master yarn
spark.serializer org.apache.spark.serializer.KryoSerializer
-spark.driver.memory 3g
+spark.driver.memory 1g
spark.executor.num 2
spark.executor.cores 1
-spark.executor.memory 4g
+spark.executor.memory 2g
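Taken together, the CI flow in this series is: `dev/setup-all.sh` installs the build dependencies and brings up the single-node Hadoop/Spark cluster, `dev/ci-test.sh` builds `mllib-dal` and runs the test suites, and `dev/test-cluster/ci-test-cluster.sh` builds the examples and submits them to YARN. Before the examples run, a quick health check along the following lines can confirm that HDFS and YARN actually came up. This is only a sketch, assuming the `hadoop`, `hdfs` and `yarn` binaries are on `PATH` as arranged by `setup-spark-envs.sh`:

```
# Sketch: sanity-check the single-node cluster started by setup-cluster.sh
hdfs dfsadmin -report | grep -i "live datanodes"   # expect one live datanode
yarn node -list                                    # expect one RUNNING node manager
hadoop fs -mkdir -p /user/$USER                    # HDFS accepts writes
hadoop fs -ls /user
```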