oap-project · xwu99 · Apr 15, 2021 · Apr 15, 2021
diff --git a/.github/workflows/oap-mllib-ci.yml b/.github/workflows/oap-mllib-ci.yml
@@ -38,4 +38,5 @@ jobs:
           source /opt/intel/oneapi/dal/latest/env/vars.sh
           source /opt/intel/oneapi/tbb/latest/env/vars.sh
           source /tmp/oneCCL/build/_install/env/setvars.sh
-          ./test.sh
+          # Temporarily disabled; re-enable once a new oneCCL release is available
+          #./build.sh
diff --git a/.gitignore b/.gitignore
@@ -1,6 +1,7 @@
 *.o
 *.log
 .vscode
+*.iml
 target/
 .idea/
 .idea_modules/
diff --git a/README.md b/README.md
@@ -17,7 +17,7 @@ You can find the all the OAP MLlib documents on the [project web page](https://o
 
 ### Java/Scala Users Preferred
 
-Use a pre-built OAP MLlib JAR to get started. You can firstly download OAP package from [OAP-JARs-Tarball](https://github.com/oap-mllib/releases/download/v1.1.0-spark-3.0.0/oap-1.1.0-bin-spark-3.0.0.tar.gz) and extract this Tarball to get `oap-mllib-x.x.x-with-spark-x.x.x.jar` under `oap-1.1.0-bin-spark-3.0.0/jars`.
+Use a pre-built OAP MLlib JAR to get started. First, download the OAP package from [OAP-JARs-Tarball](https://github.com/Intel-bigdata/OAP/releases/download/v1.0.0-spark-3.0.0/oap-1.0.0-bin-spark-3.0.0.tar.gz) and extract the tarball to get `oap-mllib-x.x.x-with-spark-x.x.x.jar` under `oap-1.0.0-bin-spark-3.0.0/jars`.
 
 Then you can refer to the following [Running](#running) section to try out.
 
@@ -65,6 +65,14 @@ To use K-means example for sanity check, you need to upload a data file to your
     $ ./run.sh
 ```
 
+### Benchmark with HiBench
+Use [HiBench](https://github.com/Intel-bigdata/HiBench) to generate datasets with various profiles, and adjust the related variables in the `run-XXX.sh` script where applicable. Then run the following commands:
+```
+    $ cd oap-mllib/examples/kmeans-hibench
+    $ ./build.sh
+    $ ./run-hibench-oap-mllib.sh
+```
+
 ### PySpark Support
 
 As PySpark-based applications call their Scala couterparts, they shall be supported out-of-box. An example can be found in the [Examples](#examples) section.
@@ -87,7 +95,7 @@ Intel® oneAPI Toolkits and its components can be downloaded and install from [h
 
 More details about oneAPI can be found [here](https://software.intel.com/content/www/us/en/develop/tools/oneapi.html).
 
-You can also refer to [this script and comments in it](https://github.com/oap-project/oap-mllib/blob/branch-1.1-spark-3.x/dev/install-build-deps-centos.sh) to install correct oneAPI version and manually setup the environments.
+You can also refer to [this script and comments in it](https://github.com/Intel-bigdata/OAP/blob/branch-1.0-spark-3.x/oap-mllib/dev/install-build-deps-centos.sh) to install correct oneAPI version and manually setup the environments.
 
 Scala and Java dependency descriptions are already included in Maven POM file. 
 
@@ -130,7 +138,7 @@ CCL_ROOT    | Path to oneCCL home directory
 We suggest you to source `setvars.sh` script into current shell to setup building environments as following:
 
 ```
-	$ source /opt/intel/oneapi/setvars.sh
+	$ source /opt/intel/inteloneapi/setvars.sh
 	$ source /your/oneCCL_source_code/build/_install/env/setvars.sh
 ```
 
@@ -152,11 +160,8 @@ Example         |  Description
 ----------------|---------------------------
 kmeans          |  K-means example for Scala
 kmeans-pyspark  |  K-means example for PySpark
-pca             |  PCA example for Scala
-pca-pyspark     |  PCA example for PySpark
+kmeans-hibench  |  Use HiBench-generated input dataset to benchmark K-means performance
 
 ## List of Accelerated Algorithms
 
 * K-Means (CPU, Experimental)
-* PCA (CPU, Experimental)
-
diff --git a/dev/install-build-deps-centos.sh b/dev/install-build-deps-centos.sh
@@ -23,7 +23,7 @@ cd /tmp
 rm -rf oneCCL
 git clone https://github.com/oneapi-src/oneCCL
 cd oneCCL
-git checkout beta08
+git checkout 2021.1
 mkdir -p build && cd build
 cmake ..
 make -j 2 install

diff --git a/dev/install-build-deps-ubuntu.sh b/dev/install-build-deps-ubuntu.sh
@@ -17,7 +17,7 @@ echo "Building oneCCL ..."
 cd /tmp
 git clone https://github.com/oneapi-src/oneCCL
 cd oneCCL
-git checkout beta08
+git checkout 2021.1
 mkdir build && cd build
 cmake ..
 make -j 2 install

diff --git a/dev/test-cluster/config-ssh.sh b/dev/test-cluster/config-ssh.sh
@@ -0,0 +1,6 @@
+#!/usr/bin/env bash
+# Set up key-based SSH login for the current user and turn off host-key prompts system-wide.
+[[ -f ~/.ssh/id_rsa ]] || ssh-keygen -q -N "" -t rsa -f ~/.ssh/id_rsa  # keep an existing key so reruns never hit the overwrite prompt
+cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys && chmod 600 ~/.ssh/authorized_keys  # sshd StrictModes rejects group/world-writable key files
+echo "    StrictHostKeyChecking no                     " | sudo tee -a /etc/ssh/ssh_config
+sudo service ssh restart
diff --git a/dev/test-cluster/core-site.xml b/dev/test-cluster/core-site.xml
@@ -0,0 +1,24 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+<!--
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License. See accompanying LICENSE file.
+-->
+
+<!-- Put site-specific property overrides in this file. -->
+
+<configuration>
+	<property> <!-- default filesystem URI; fs.defaultFS supersedes the deprecated fs.default.name key -->
+		<name>fs.defaultFS</name>
+		<value>hdfs://localhost:8020</value>
+	</property>
+</configuration>
diff --git a/dev/test-cluster/envs.sh b/dev/test-cluster/envs.sh
@@ -0,0 +1,22 @@
+# Spark and Hadoop installation directories used by the test cluster
+export HADOOP_HOME=~/opt/hadoop-2.7.7
+export HADOOP_CONF_DIR="${HADOOP_HOME}/etc/hadoop"
+export SPARK_HOME=~/opt/spark-3.0.0-bin-hadoop2.7
+# Put PySpark first and append the previous PYTHONPATH only when it is non-empty, which avoids a trailing colon
+export PYTHONPATH="${SPARK_HOME}/python${PYTHONPATH:+:${PYTHONPATH}}"
+export PYSPARK_PYTHON=python3
+
+# HDFS root URI and OAP MLlib data root
+export HDFS_ROOT=hdfs://localhost:8020
+export OAP_MLLIB_DATA_ROOT=OAPMLlib/Data
+# OAP MLlib root directory, taken from GITHUB_WORKSPACE
+export OAP_MLLIB_ROOT="${GITHUB_WORKSPACE}"
+
+# Name and full path of the built target jar
+OAP_MLLIB_JAR_NAME=oap-mllib-1.1.0.jar
+OAP_MLLIB_JAR="${OAP_MLLIB_ROOT}/mllib-dal/target/${OAP_MLLIB_JAR_NAME}"
+
+# Driver classpath uses the absolute jar path
+SPARK_DRIVER_CLASSPATH="${OAP_MLLIB_JAR}"
+# Executor classpath uses a relative jar path
+SPARK_EXECUTOR_CLASSPATH="./${OAP_MLLIB_JAR_NAME}"
diff --git a/dev/test-cluster/hadoop-env.sh b/dev/test-cluster/hadoop-env.sh
@@ -0,0 +1,99 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Set Hadoop-specific environment variables here.
+
+# The only required environment variable is JAVA_HOME.  All others are
+# optional.  When running a distributed configuration it is best to
+# set JAVA_HOME in this file, so that it is correctly defined on
+# remote nodes.
+
+# The java implementation to use. Pinned to a fixed path; the inherited-value form is left commented out below.
+# export JAVA_HOME=${JAVA_HOME}
+export JAVA_HOME=/usr/local/lib/jvm/openjdk8
+
+# The jsvc implementation to use. Jsvc is required to run secure datanodes
+# that bind to privileged ports to provide authentication of data transfer
+# protocol.  Jsvc is not required if SASL is configured for authentication of
+# data transfer protocol using non-privileged ports.
+#export JSVC_HOME=${JSVC_HOME}
+# Keep an already-set HADOOP_CONF_DIR; fall back to /etc/hadoop when it is unset or empty.
+export HADOOP_CONF_DIR=${HADOOP_CONF_DIR:-"/etc/hadoop"}
+
+# Extra Java CLASSPATH elements.  Appends each contrib/capacity-scheduler jar to HADOOP_CLASSPATH, or starts the variable when it is empty.
+for f in $HADOOP_HOME/contrib/capacity-scheduler/*.jar; do
+  if [ "$HADOOP_CLASSPATH" ]; then
+    export HADOOP_CLASSPATH=$HADOOP_CLASSPATH:$f
+  else
+    export HADOOP_CLASSPATH=$f
+  fi
+done
+
+# The maximum amount of heap to use, in MB. Default is 1000.
+#export HADOOP_HEAPSIZE=
+#export HADOOP_NAMENODE_INIT_HEAPSIZE=""
+
+# Extra Java runtime options.  Empty by default.
+export HADOOP_OPTS="$HADOOP_OPTS -Djava.net.preferIPv4Stack=true"
+
+# Command specific options appended to HADOOP_OPTS when specified
+export HADOOP_NAMENODE_OPTS="-Dhadoop.security.logger=${HADOOP_SECURITY_LOGGER:-INFO,RFAS} -Dhdfs.audit.logger=${HDFS_AUDIT_LOGGER:-INFO,NullAppender} $HADOOP_NAMENODE_OPTS"
+export HADOOP_DATANODE_OPTS="-Dhadoop.security.logger=ERROR,RFAS $HADOOP_DATANODE_OPTS"
+
+export HADOOP_SECONDARYNAMENODE_OPTS="-Dhadoop.security.logger=${HADOOP_SECURITY_LOGGER:-INFO,RFAS} -Dhdfs.audit.logger=${HDFS_AUDIT_LOGGER:-INFO,NullAppender} $HADOOP_SECONDARYNAMENODE_OPTS"
+
+export HADOOP_NFS3_OPTS="$HADOOP_NFS3_OPTS"
+export HADOOP_PORTMAP_OPTS="-Xmx512m $HADOOP_PORTMAP_OPTS"
+
+# The following applies to multiple commands (fs, dfs, fsck, distcp etc)
+export HADOOP_CLIENT_OPTS="-Xmx512m $HADOOP_CLIENT_OPTS"
+#HADOOP_JAVA_PLATFORM_OPTS="-XX:-UsePerfData $HADOOP_JAVA_PLATFORM_OPTS"
+
+# On secure datanodes, user to run the datanode as after dropping privileges.
+# This **MUST** be uncommented to enable secure HDFS if using privileged ports
+# to provide authentication of data transfer protocol.  This **MUST NOT** be
+# defined if SASL is configured for authentication of data transfer protocol
+# using non-privileged ports.
+export HADOOP_SECURE_DN_USER=${HADOOP_SECURE_DN_USER}
+
+# Where log files are stored.  $HADOOP_HOME/logs by default.
+#export HADOOP_LOG_DIR=${HADOOP_LOG_DIR}/$USER
+
+# Where log files are stored in the secure data environment.
+export HADOOP_SECURE_DN_LOG_DIR=${HADOOP_LOG_DIR}/${HADOOP_HDFS_USER}
+
+###
+# HDFS Mover specific parameters
+###
+# Specify the JVM options to be used when starting the HDFS Mover.
+# These options will be appended to the options specified as HADOOP_OPTS
+# and therefore may override any similar flags set in HADOOP_OPTS
+#
+# export HADOOP_MOVER_OPTS=""
+
+###
+# Advanced Users Only!
+###
+
+# The directory where pid files are stored. /tmp by default.
+# NOTE: this should be set to a directory that can only be written to by
+#       the user that will run the hadoop daemons.  Otherwise there is the
+#       potential for a symlink attack.
+export HADOOP_PID_DIR=${HADOOP_PID_DIR}
+export HADOOP_SECURE_DN_PID_DIR=${HADOOP_PID_DIR}
+
+# A string representing this instance of hadoop. $USER by default.
+export HADOOP_IDENT_STRING=$USER
diff --git a/dev/test-cluster/hdfs-site.xml b/dev/test-cluster/hdfs-site.xml
@@ -0,0 +1,32 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+<!--
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License. See accompanying LICENSE file.
+-->
+
+<!-- Put site-specific property overrides in this file. -->
+
+<configuration>
+	<property> <!-- keep a single copy of each block -->
+		<name>dfs.replication</name>
+		<value>1</value>
+	</property>
+	<property> <!-- NameNode storage directory; setup-cluster.sh creates it -->
+		<name>dfs.namenode.name.dir</name>
+		<value>/tmp/run/hdfs/namenode</value>
+	</property>
+	<property> <!-- DataNode storage directory; setup-cluster.sh creates it -->
+		<name>dfs.datanode.data.dir</name>
+		<value>/tmp/run/hdfs/datanode</value>
+	</property>
+</configuration>
diff --git a/dev/test-cluster/setup-cluster.sh b/dev/test-cluster/setup-cluster.sh
@@ -0,0 +1,42 @@
+#!/usr/bin/env bash
+# Downloads Spark 3.0.0 and Hadoop 2.7.7 into ~/opt, copies the configs that sit next to this script, then starts HDFS and YARN.
+WORK_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+
+cd "$WORK_DIR" || exit 1
+
+echo "JAVA_HOME is $JAVA_HOME"
+
+mkdir -p ~/opt  # -p: do not fail when the directory already exists from an earlier run
+cd ~/opt || exit 1
+wget https://archive.apache.org/dist/spark/spark-3.0.0/spark-3.0.0-bin-hadoop2.7.tgz || exit 1
+tar -xzf spark-3.0.0-bin-hadoop2.7.tgz || exit 1
+wget https://archive.apache.org/dist/hadoop/core/hadoop-2.7.7/hadoop-2.7.7.tar.gz || exit 1
+tar -xzf hadoop-2.7.7.tar.gz || exit 1
+
+cd "$WORK_DIR" || exit 1
+
+cp ./core-site.xml ~/opt/hadoop-2.7.7/etc/hadoop/
+cp ./hdfs-site.xml ~/opt/hadoop-2.7.7/etc/hadoop/
+cp ./yarn-site.xml ~/opt/hadoop-2.7.7/etc/hadoop/
+cp ./hadoop-env.sh ~/opt/hadoop-2.7.7/etc/hadoop/
+cp ./spark-defaults.conf ~/opt/spark-3.0.0-bin-hadoop2.7/conf
+
+# Create the NameNode and DataNode directories
+mkdir -p /tmp/run/hdfs/namenode
+mkdir -p /tmp/run/hdfs/datanode
+
+# Format HDFS; -nonInteractive aborts the format instead of prompting when the name directory is already formatted
+~/opt/hadoop-2.7.7/bin/hdfs namenode -format -nonInteractive
+
+export HADOOP_HOME=~/opt/hadoop-2.7.7
+export HADOOP_CONF_DIR="$HADOOP_HOME/etc/hadoop"
+export SPARK_HOME=~/opt/spark-3.0.0-bin-hadoop2.7
+
+export PATH="$HADOOP_HOME/bin:$SPARK_HOME/bin:$PATH"
+
+# Start HDFS and YARN
+"$HADOOP_HOME/sbin/start-dfs.sh"
+"$HADOOP_HOME/sbin/start-yarn.sh"
+# Smoke checks: list the HDFS root and the YARN nodes
+hadoop fs -ls /
+yarn node -list
diff --git a/dev/test-cluster/setup-python3-env.sh b/dev/test-cluster/setup-python3-env.sh
@@ -0,0 +1,12 @@
+#!/usr/bin/env bash
+# Installs pip tooling and numpy for python3, then prints the location and version of python and python3.
+sudo apt-get update
+sudo apt-get install -y python3-pip python3-setuptools python3-wheel  # -y: skip the confirmation prompt in unattended runs
+
+pip3 install --user numpy
+
+echo "python is in $(command -v python)"
+python --version
+
+echo "python3 is in $(command -v python3)"
+python3 --version
diff --git a/dev/test-cluster/spark-defaults.conf b/dev/test-cluster/spark-defaults.conf
@@ -0,0 +1,34 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Default system properties included when running spark-submit.
+# This is useful for setting default environmental settings.
+
+# Example:
+# spark.master                     spark://master:7077
+# spark.eventLog.enabled           true
+# spark.eventLog.dir               hdfs://namenode:8021/directory
+# spark.serializer                 org.apache.spark.serializer.KryoSerializer
+# spark.driver.memory              5g
+# spark.executor.extraJavaOptions  -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three"
+# --- Test-cluster settings: YARN master, two single-core executors ---
+spark.master                     yarn
+spark.serializer                 org.apache.spark.serializer.KryoSerializer
+spark.driver.memory              3g
+spark.executor.instances         2
+spark.executor.cores             1
+spark.executor.memory            4g