diff --git a/.github/workflows/oap-mllib-ci.yml b/.github/workflows/oap-mllib-ci.yml
index 2c6973321..4f567086f 100644
--- a/.github/workflows/oap-mllib-ci.yml
+++ b/.github/workflows/oap-mllib-ci.yml
@@ -11,32 +11,21 @@ jobs:
- name: Set up JDK 1.8
uses: actions/setup-java@v1
with:
- java-version: 1.8
+ java-version: 1.8
- name: Restore cached dependencies
uses: actions/cache@v2
with:
path: |
+ # /var/cache/apt/archives/*.deb
~/.m2/repository
- ~/downloads
- /opt/intel/inteloneapi
/opt/intel/oneapi
+ ~/opt
key: ${{ runner.os }}-${{ hashFiles('**/pom.xml', '{{github.workspace}}/dev/install-build-deps-ubuntu.sh') }}
restore-keys: |
${{ runner.os }}-
- - name: Set up dependencies
- run: |
- [ -d ~/downloads ] || mkdir ~/downloads
- cd ~/downloads
- [ -f spark-3.0.0-bin-hadoop2.7.tgz ] || wget http://archive.apache.org/dist/spark/spark-3.0.0/spark-3.0.0-bin-hadoop2.7.tgz
- [ -d spark-3.0.0-bin-hadoop2.7 ] || cd ~ && tar -zxf downloads/spark-3.0.0-bin-hadoop2.7.tgz
- export SPARK_HOME=~/spark-3.0.0-bin-hadoop2.7
- ${{github.workspace}}/dev/install-build-deps-ubuntu.sh
+ - name: Set up environments
+ run: |
+ source ${{github.workspace}}/dev/setup-all.sh
- name: Build and Test
- run: |
- cd ${{github.workspace}}/mllib-dal
- export ONEAPI_ROOT=/opt/intel/oneapi
- source /opt/intel/oneapi/dal/latest/env/vars.sh
- source /opt/intel/oneapi/tbb/latest/env/vars.sh
- source /tmp/oneCCL/build/_install/env/setvars.sh
- # temp disable and will enable for new release of oneCCL
- #./build.sh
+ run: |
+ ${{github.workspace}}/dev/ci-test.sh
diff --git a/README.md b/README.md
index 477c74b1f..c9e003ba5 100644
--- a/README.md
+++ b/README.md
@@ -11,13 +11,13 @@ For those algorithms that are not accelerated by OAP MLlib, the original Spark M
## Online Documentation
-You can find the all the OAP MLlib documents on the [project web page](https://oap-project.github.io/oap-mllib/).
+You can find all the OAP MLlib documentation on the [project web page](https://oap-project.github.io/oap-mllib).
## Getting Started
### Java/Scala Users Preferred
-Use a pre-built OAP MLlib JAR to get started. You can firstly download OAP package from [OAP-JARs-Tarball](https://github.com/Intel-bigdata/OAP/releases/download/v1.0.0-spark-3.0.0/oap-1.0.0-bin-spark-3.0.0.tar.gz) and extract this Tarball to get `oap-mllib-x.x.x-with-spark-x.x.x.jar` under `oap-1.0.0-bin-spark-3.0.0/jars`.
+Use a pre-built OAP MLlib JAR to get started. You can first download the OAP package from [OAP-JARs-Tarball](https://github.com/Intel-bigdata/OAP/releases/download/v1.1.0-spark-3.0.0/oap-1.1.0-bin-spark-3.0.0.tar.gz) and extract the tarball to get `oap-mllib-x.x.x-with-spark-x.x.x.jar` under `oap-1.1.0-bin-spark-3.0.0/jars`.
Then you can refer to the following [Running](#running) section to try out.
@@ -58,24 +58,31 @@ spark.executor.extraClassPath ./oap-mllib-x.x.x-with-spark-x.x.x.jar
### Sanity Check
-To use K-means example for sanity check, you need to upload a data file to your HDFS and change related variables in `run.sh` of kmeans example. Then run the following commands:
+#### Set up `env.sh`
```
- $ cd oap-mllib/examples/kmeans
- $ ./build.sh
- $ ./run.sh
+ $ cd conf
+ $ cp env.sh.template env.sh
```
+Edit the related variables in the "`Minimum Settings`" section of `env.sh`.
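+
+For example, a minimal `env.sh` might look like the following (the values are illustrative and should match your own installation):
+```
+OAP_MLLIB_VERSION=1.1.0
+SPARK_MASTER=yarn
+export HADOOP_HOME=~/opt/hadoop-2.7.7
+export SPARK_HOME=~/opt/spark-3.0.0-bin-hadoop2.7
+export HDFS_ROOT=hdfs://localhost:8020
+export OAP_MLLIB_ROOT=~/oap-mllib
+```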
-### Benchmark with HiBench
-Use [Hibench](https://github.com/Intel-bigdata/HiBench) to generate dataset with various profiles, and change related variables in `run-XXX.sh` script when applicable. Then run the following commands:
+#### Upload example data files to HDFS
```
- $ cd oap-mllib/examples/kmeans-hibench
+ $ cd examples
+ $ hadoop fs -mkdir -p /user/$USER
+ $ hadoop fs -copyFromLocal data
+ $ hadoop fs -ls data
+```
+#### Run K-means
+
+```
+ $ cd examples/kmeans
$ ./build.sh
- $ ./run-hibench-oap-mllib.sh
+ $ ./run.sh
```
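+
+The other Scala examples (e.g. ALS) follow the same pattern:
+```
+ $ cd examples/als
+ $ ./build.sh
+ $ ./run.sh
+```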
### PySpark Support
-As PySpark-based applications call their Scala couterparts, they shall be supported out-of-box. An example can be found in the [Examples](#examples) section.
+As PySpark-based applications call their Scala counterparts, they are supported out of the box. Examples can be found in the [Examples](#examples) section.
## Building
@@ -86,7 +93,8 @@ We use [Apache Maven](https://maven.apache.org/) to manage and build source code
* JDK 8.0+
* Apache Maven 3.6.2+
* GNU GCC 4.8.5+
-* Intel® oneAPI Toolkits 2021.1.1 Components:
+* Intel® oneAPI Toolkits 2021.2+ Components:
+ - DPC++/C++ Compiler (dpcpp/clang++)
- Data Analytics Library (oneDAL)
- Threading Building Blocks (oneTBB)
* [Open Source Intel® oneAPI Collective Communications Library (oneCCL)](https://github.com/oneapi-src/oneCCL)
@@ -95,7 +103,7 @@ Intel® oneAPI Toolkits and its components can be downloaded and install from [h
More details about oneAPI can be found [here](https://software.intel.com/content/www/us/en/develop/tools/oneapi.html).
-You can also refer to [this script and comments in it](https://github.com/Intel-bigdata/OAP/blob/branch-1.0-spark-3.x/oap-mllib/dev/install-build-deps-centos.sh) to install correct oneAPI version and manually setup the environments.
+You can refer to [this script](dev/install-build-deps-centos.sh) to install the correct dependencies.
Scala and Java dependency descriptions are already included in Maven POM file.
@@ -107,7 +115,7 @@ To clone and build from open source oneCCL, run the following commands:
```
$ git clone https://github.com/oneapi-src/oneCCL
$ cd oneCCL
- $ git checkout beta08
+ $ git checkout 2021.2
$ mkdir build && cd build
$ cmake ..
$ make -j install
@@ -138,17 +146,19 @@ CCL_ROOT | Path to oneCCL home directory
We suggest you to source `setvars.sh` script into current shell to setup building environments as following:
```
- $ source /opt/intel/inteloneapi/setvars.sh
+ $ source /opt/intel/oneapi/setvars.sh
$ source /your/oneCCL_source_code/build/_install/env/setvars.sh
```
__Be noticed we are using our own built oneCCL instead, we should source oneCCL's `setvars.sh` to overwrite oneAPI one.__
+You can also refer to [this CI script](dev/ci-build.sh) to set up the build environment.
+
If you prefer to buid your own open source [oneDAL](https://github.com/oneapi-src/oneDAL), [oneTBB](https://github.com/oneapi-src/oneTBB) versions rather than use the ones included in oneAPI TookKits, you can refer to the related build instructions and manually source `setvars.sh` accordingly.
To build, run the following commands:
```
- $ cd oap-mllib/mllib-dal
+ $ cd mllib-dal
$ ./build.sh
```
@@ -156,12 +166,19 @@ The built JAR package will be placed in `target` directory with the name `oap-ml
## Examples
-Example | Description
+Example | Description
----------------|---------------------------
kmeans | K-means example for Scala
kmeans-pyspark | K-means example for PySpark
-kmeans-hibench | Use HiBench-generated input dataset to benchmark K-means performance
+pca | PCA example for Scala
+pca-pyspark | PCA example for PySpark
+als | ALS example for Scala
+als-pyspark | ALS example for PySpark
## List of Accelerated Algorithms
-* K-Means (CPU, Experimental)
+Algorithm | Category | Maturity
+----------|----------|-------------
+K-Means | CPU | Experimental
+PCA | CPU | Experimental
+ALS | CPU | Experimental
diff --git a/conf/env.sh.template b/conf/env.sh.template
new file mode 100644
index 000000000..ddcc84665
--- /dev/null
+++ b/conf/env.sh.template
@@ -0,0 +1,46 @@
+# == OAP MLlib users: customize the following environments for running the examples == #
+
+# ============== Minimum Settings ============= #
+
+# Set OAP MLlib version (e.g. 1.1.0)
+OAP_MLLIB_VERSION=x.x.x
+# Set Spark master
+SPARK_MASTER=yarn
+# Set Hadoop home path
+export HADOOP_HOME=/path/to/your/hadoop/home
+# Set Spark home path
+export SPARK_HOME=/path/to/your/spark/home
+# Set HDFS Root, should be hdfs://xxx or file://xxx
+export HDFS_ROOT=hdfs://localhost:8020
+# Set OAP MLlib source code root directory
+export OAP_MLLIB_ROOT=/path/to/oap-mllib/home
+
+# ============================================= #
+
+# Set HADOOP_CONF_DIR for Spark
+export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
+
+# Set JAR name & path
+OAP_MLLIB_JAR_NAME=oap-mllib-$OAP_MLLIB_VERSION.jar
+OAP_MLLIB_JAR=$OAP_MLLIB_ROOT/mllib-dal/target/$OAP_MLLIB_JAR_NAME
+# Set Spark driver & executor classpaths,
+# absolute path for driver, relative path for executor
+SPARK_DRIVER_CLASSPATH=$OAP_MLLIB_JAR
+SPARK_EXECUTOR_CLASSPATH=./$OAP_MLLIB_JAR_NAME
+
+# Set Spark resources, can be overwritten in example
+SPARK_DRIVER_MEMORY=1G
+SPARK_NUM_EXECUTORS=2
+SPARK_EXECUTOR_CORES=1
+SPARK_EXECUTOR_MEMORY=1G
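+# Default parallelism: 2 tasks per executor core (executors x cores x 2)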
+SPARK_DEFAULT_PARALLELISM=$(expr $SPARK_NUM_EXECUTORS '*' $SPARK_EXECUTOR_CORES '*' 2)
+
+# Checks
+
+for dir in $SPARK_HOME $HADOOP_HOME $OAP_MLLIB_JAR
+do
+ if [[ ! -e $dir ]]; then
+ echo $dir does not exist!
+ exit 1
+ fi
+done
diff --git a/dev/ci-build.sh b/dev/ci-build.sh
new file mode 100755
index 000000000..6f70ea372
--- /dev/null
+++ b/dev/ci-build.sh
@@ -0,0 +1,43 @@
+#!/usr/bin/env bash
+
+# Setup building envs
+source /opt/intel/oneapi/setvars.sh
+source /tmp/oneCCL/build/_install/env/setvars.sh
+
+# Check envs for building
+if [[ -z $JAVA_HOME ]]; then
+ echo JAVA_HOME not defined!
+ exit 1
+fi
+
+if [[ -z $(which mvn) ]]; then
+ echo Maven not found!
+ exit 1
+fi
+
+if [[ -z $DAALROOT ]]; then
+ echo DAALROOT not defined!
+ exit 1
+fi
+
+if [[ -z $TBBROOT ]]; then
+ echo TBBROOT not defined!
+ exit 1
+fi
+
+if [[ -z $CCL_ROOT ]]; then
+ echo CCL_ROOT not defined!
+ exit 1
+fi
+
+echo === Building Environments ===
+echo JAVA_HOME=$JAVA_HOME
+echo DAALROOT=$DAALROOT
+echo TBBROOT=$TBBROOT
+echo CCL_ROOT=$CCL_ROOT
+echo Maven Version: $(mvn -v | head -n 1 | cut -f3 -d" ")
+echo Clang Version: $(clang -dumpversion)
+echo =============================
+
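+# Build the OAP MLlib JAR; unit tests are skipped here and run separately by ci-test.sh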
+cd $GITHUB_WORKSPACE/mllib-dal
+mvn --no-transfer-progress -DskipTests clean package
diff --git a/dev/ci-test.sh b/dev/ci-test.sh
new file mode 100755
index 000000000..7ce665089
--- /dev/null
+++ b/dev/ci-test.sh
@@ -0,0 +1,61 @@
+#!/usr/bin/env bash
+
+# Setup building envs
+source /opt/intel/oneapi/setvars.sh
+source /tmp/oneCCL/build/_install/env/setvars.sh
+
+# Check envs for building
+if [[ -z $JAVA_HOME ]]; then
+ echo JAVA_HOME not defined!
+ exit 1
+fi
+
+if [[ -z $(which mvn) ]]; then
+ echo Maven not found!
+ exit 1
+fi
+
+if [[ -z $DAALROOT ]]; then
+ echo DAALROOT not defined!
+ exit 1
+fi
+
+if [[ -z $TBBROOT ]]; then
+ echo TBBROOT not defined!
+ exit 1
+fi
+
+if [[ -z $CCL_ROOT ]]; then
+ echo CCL_ROOT not defined!
+ exit 1
+fi
+
+echo === Testing Environments ===
+echo JAVA_HOME=$JAVA_HOME
+echo DAALROOT=$DAALROOT
+echo TBBROOT=$TBBROOT
+echo CCL_ROOT=$CCL_ROOT
+echo Maven Version: $(mvn -v | head -n 1 | cut -f3 -d" ")
+echo Clang Version: $(clang -dumpversion)
+echo =============================
+
+cd $GITHUB_WORKSPACE/mllib-dal
+
+# Build test
+$GITHUB_WORKSPACE/dev/ci-build.sh
+
+# Enable signal chaining support for JNI
+# export LD_PRELOAD=$JAVA_HOME/jre/lib/amd64/libjsig.so
+
+# -Dtest=none to turn off the Java tests
+
+# Test all
+# mvn -Dtest=none -Dmaven.test.skip=false test
+
+# Individual test
+mvn --no-transfer-progress -Dtest=none -DwildcardSuites=org.apache.spark.ml.clustering.IntelKMeansSuite test
+mvn --no-transfer-progress -Dtest=none -DwildcardSuites=org.apache.spark.ml.feature.IntelPCASuite test
+# mvn -Dtest=none -DwildcardSuites=org.apache.spark.ml.recommendation.IntelALSSuite test
+
+# Yarn cluster test
+$GITHUB_WORKSPACE/dev/test-cluster/ci-test-cluster.sh
\ No newline at end of file
diff --git a/dev/install-build-deps-centos.sh b/dev/install-build-deps-centos.sh
index 8a347fdef..5f8a6e5c8 100755
--- a/dev/install-build-deps-centos.sh
+++ b/dev/install-build-deps-centos.sh
@@ -12,8 +12,10 @@ gpgcheck=1
repo_gpgcheck=1
gpgkey=https://yum.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS-2023.PUB
EOF
- sudo mv /tmp/oneAPI.repo /etc/yum.repos.d
- sudo yum install -y intel-oneapi-dal-devel-2021.1.1 intel-oneapi-tbb-devel-2021.1.1
+ sudo mv /tmp/oneAPI.repo /etc/yum.repos.d
+ # sudo yum groupinstall -y "Development Tools"
+ # sudo yum install -y cmake
+ sudo yum install -y intel-oneapi-dpcpp-cpp-2021.2.0 intel-oneapi-dal-devel-2021.2.0 intel-oneapi-tbb-devel-2021.2.0
else
echo "oneAPI components already installed!"
fi
@@ -23,16 +25,7 @@ cd /tmp
rm -rf oneCCL
git clone https://github.com/oneapi-src/oneCCL
cd oneCCL
-git checkout 2021.1
-mkdir -p build && cd build
+git checkout 2021.2
+mkdir build && cd build
cmake ..
-make -j 2 install
-
-#
-# Setup building environments manually:
-#
-# export ONEAPI_ROOT=/opt/intel/oneapi
-# source /opt/intel/oneapi/dal/latest/env/vars.sh
-# source /opt/intel/oneapi/tbb/latest/env/vars.sh
-# source /tmp/oneCCL/build/_install/env/setvars.sh
-#
+make -j 2 install
\ No newline at end of file
diff --git a/dev/install-build-deps-ubuntu.sh b/dev/install-build-deps-ubuntu.sh
index d43e35b89..36fafc1f6 100755
--- a/dev/install-build-deps-ubuntu.sh
+++ b/dev/install-build-deps-ubuntu.sh
@@ -1,6 +1,6 @@
#!/usr/bin/env bash
-if [ ! -f /opt/intel/oneapi ]; then
+if [ ! -d /opt/intel/oneapi ]; then
echo "Installing oneAPI components ..."
cd /tmp
wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS-2023.PUB
@@ -8,25 +8,18 @@ if [ ! -f /opt/intel/oneapi ]; then
rm GPG-PUB-KEY-INTEL-SW-PRODUCTS-2023.PUB
echo "deb https://apt.repos.intel.com/oneapi all main" | sudo tee /etc/apt/sources.list.d/oneAPI.list
sudo apt-get update
- sudo apt-get install intel-oneapi-dal-devel-2021.1.1 intel-oneapi-tbb-devel-2021.1.1
+ # sudo apt-get install -y build-essential cmake
+ sudo apt-get install -y intel-oneapi-dpcpp-cpp-2021.2.0 intel-oneapi-dal-devel-2021.2.0 intel-oneapi-tbb-devel-2021.2.0
else
echo "oneAPI components already installed!"
-fi
+fi
echo "Building oneCCL ..."
cd /tmp
+rm -rf oneCCL
git clone https://github.com/oneapi-src/oneCCL
cd oneCCL
-git checkout 2021.1
+git checkout 2021.2
mkdir build && cd build
cmake ..
make -j 2 install
-
-#
-# Setup building environments manually:
-#
-# export ONEAPI_ROOT=/opt/intel/oneapi
-# source /opt/intel/oneapi/dal/latest/env/vars.sh
-# source /opt/intel/oneapi/tbb/latest/env/vars.sh
-# source /tmp/oneCCL/build/_install/env/setvars.sh
-#
diff --git a/dev/setup-all.sh b/dev/setup-all.sh
new file mode 100755
index 000000000..75defbe5a
--- /dev/null
+++ b/dev/setup-all.sh
@@ -0,0 +1,16 @@
+#!/usr/bin/env bash
+
+# Setup hosts
+# Use the second internal IP; using the first IP causes SSH timeouts
+HOST_IP=$(hostname -I | cut -f2 -d" ")
+echo $HOST_IP $(hostname) | sudo tee -a /etc/hosts
+
+# Install dependencies for building
+$GITHUB_WORKSPACE/dev/install-build-deps-ubuntu.sh
+
+# Set up password-less SSH and Python 3
+$GITHUB_WORKSPACE/dev/test-cluster/config-ssh.sh
+$GITHUB_WORKSPACE/dev/test-cluster/setup-python3.sh
+
+# Setup cluster and envs
+source $GITHUB_WORKSPACE/dev/test-cluster/setup-cluster.sh
diff --git a/dev/test-cluster/ci-test-cluster.sh b/dev/test-cluster/ci-test-cluster.sh
new file mode 100755
index 000000000..7a4600267
--- /dev/null
+++ b/dev/test-cluster/ci-test-cluster.sh
@@ -0,0 +1,19 @@
+#!/usr/bin/env bash
+
+# Setup Spark envs
+source $GITHUB_WORKSPACE/dev/test-cluster/setup-spark-envs.sh
+
+# Setup OAP MLlib envs
+cp $GITHUB_WORKSPACE/dev/test-cluster/env.sh $GITHUB_WORKSPACE/conf
+
+cd $GITHUB_WORKSPACE/examples
+
+# Copy examples data to HDFS
+hadoop fs -mkdir -p /user/$USER
+hadoop fs -copyFromLocal data
+hadoop fs -ls data
+
+# Build and run all examples
+./build-all.sh
+./run-all-scala.sh
+./run-all-pyspark.sh
diff --git a/dev/test-cluster/config-ssh.sh b/dev/test-cluster/config-ssh.sh
index d093fa17a..a6fc2699e 100755
--- a/dev/test-cluster/config-ssh.sh
+++ b/dev/test-cluster/config-ssh.sh
@@ -2,5 +2,14 @@
ssh-keygen -q -N "" -t rsa -f ~/.ssh/id_rsa
cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
+chmod 600 ~/.ssh/authorized_keys
+
+# Disable strict host key checking
echo " StrictHostKeyChecking no " | sudo tee -a /etc/ssh/ssh_config
-sudo service ssh restart
+# Disable StrictModes to relax permission checks on SSH files
+echo "StrictModes no" | sudo tee -a /etc/ssh/sshd_config
+
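+# Show .ssh ownership and permissions for debugging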
+ls -ld ~/.ssh
+ls -l ~/.ssh
+
+sudo systemctl restart ssh
diff --git a/dev/test-cluster/env.sh b/dev/test-cluster/env.sh
new file mode 100644
index 000000000..225db0b7b
--- /dev/null
+++ b/dev/test-cluster/env.sh
@@ -0,0 +1,46 @@
+# == OAP MLlib users: customize the following environments for running the examples == #
+
+# ============== Minimum Settings ============= #
+
+# Set OAP MLlib version (e.g. 1.1.0)
+OAP_MLLIB_VERSION=1.1.0
+# Set Spark master
+SPARK_MASTER=yarn
+# Set Hadoop home path
+export HADOOP_HOME=$HADOOP_HOME
+# Set Spark home path
+export SPARK_HOME=$SPARK_HOME
+# Set HDFS Root, should be hdfs://xxx or file://xxx
+export HDFS_ROOT=hdfs://localhost:8020
+# Set OAP MLlib source code root directory
+export OAP_MLLIB_ROOT=$GITHUB_WORKSPACE
+
+# ============================================= #
+
+# Set HADOOP_CONF_DIR for Spark
+export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
+
+# Set JAR name & path
+OAP_MLLIB_JAR_NAME=oap-mllib-$OAP_MLLIB_VERSION.jar
+OAP_MLLIB_JAR=$OAP_MLLIB_ROOT/mllib-dal/target/$OAP_MLLIB_JAR_NAME
+# Set Spark driver & executor classpaths,
+# absolute path for driver, relative path for executor
+SPARK_DRIVER_CLASSPATH=$OAP_MLLIB_JAR
+SPARK_EXECUTOR_CLASSPATH=./$OAP_MLLIB_JAR_NAME
+
+# Set Spark resources, can be overwritten in example
+SPARK_DRIVER_MEMORY=1G
+SPARK_NUM_EXECUTORS=2
+SPARK_EXECUTOR_CORES=1
+SPARK_EXECUTOR_MEMORY=1G
+SPARK_DEFAULT_PARALLELISM=$(expr $SPARK_NUM_EXECUTORS '*' $SPARK_EXECUTOR_CORES '*' 2)
+
+# Checks
+
+for dir in $SPARK_HOME $HADOOP_HOME $OAP_MLLIB_JAR
+do
+ if [[ ! -e $dir ]]; then
+ echo $dir does not exist!
+ exit 1
+ fi
+done
diff --git a/dev/test-cluster/envs.sh b/dev/test-cluster/envs.sh
deleted file mode 100644
index 71e8506e6..000000000
--- a/dev/test-cluster/envs.sh
+++ /dev/null
@@ -1,22 +0,0 @@
-# Set user Spark and Hadoop home directory
-export HADOOP_HOME=~/opt/hadoop-2.7.7
-export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
-export SPARK_HOME=~/opt/spark-3.0.0-bin-hadoop2.7
-
-export PYTHONPATH=$SPARK_HOME/python:$PYTHONPATH
-export PYSPARK_PYTHON=python3
-
-# Set user HDFS Root
-export HDFS_ROOT=hdfs://localhost:8020
-export OAP_MLLIB_DATA_ROOT=OAPMLlib/Data
-# Set user Intel MLlib Root directory
-export OAP_MLLIB_ROOT=${GITHUB_WORKSPACE}
-
-# Target jar built
-OAP_MLLIB_JAR_NAME=oap-mllib-1.1.0.jar
-OAP_MLLIB_JAR=$OAP_MLLIB_ROOT/mllib-dal/target/$OAP_MLLIB_JAR_NAME
-
-# Use absolute path
-SPARK_DRIVER_CLASSPATH=$OAP_MLLIB_JAR
-# Use relative path
-SPARK_EXECUTOR_CLASSPATH=./$OAP_MLLIB_JAR_NAME
diff --git a/dev/test-cluster/hadoop-env.sh b/dev/test-cluster/hadoop-env.sh
index bee6c1f69..f60b65a0b 100755
--- a/dev/test-cluster/hadoop-env.sh
+++ b/dev/test-cluster/hadoop-env.sh
@@ -22,8 +22,7 @@
# remote nodes.
# The java implementation to use.
-# export JAVA_HOME=${JAVA_HOME}
-export JAVA_HOME=/usr/local/lib/jvm/openjdk8
+export JAVA_HOME=${JAVA_HOME}
# The jsvc implementation to use. Jsvc is required to run secure datanodes
# that bind to privileged ports to provide authentication of data transfer
diff --git a/dev/test-cluster/setup-cluster.sh b/dev/test-cluster/setup-cluster.sh
index eea058f80..b58c676ab 100755
--- a/dev/test-cluster/setup-cluster.sh
+++ b/dev/test-cluster/setup-cluster.sh
@@ -6,33 +6,38 @@ cd $WORK_DIR
echo JAVA_HOME is $JAVA_HOME
-mkdir ~/opt
+[ -d ~/opt ] || mkdir ~/opt
cd ~/opt
-wget https://archive.apache.org/dist/spark/spark-3.0.0/spark-3.0.0-bin-hadoop2.7.tgz
-tar -xzf spark-3.0.0-bin-hadoop2.7.tgz
-wget https://archive.apache.org/dist/hadoop/core/hadoop-2.7.7/hadoop-2.7.7.tar.gz
-tar -xzf hadoop-2.7.7.tar.gz
+[ -f spark-3.0.0-bin-hadoop2.7.tgz ] || wget --no-verbose https://archive.apache.org/dist/spark/spark-3.0.0/spark-3.0.0-bin-hadoop2.7.tgz
+[ -d spark-3.0.0-bin-hadoop2.7 ] || tar -xzf spark-3.0.0-bin-hadoop2.7.tgz
+[ -f hadoop-2.7.7.tar.gz ] || wget --no-verbose https://archive.apache.org/dist/hadoop/core/hadoop-2.7.7/hadoop-2.7.7.tar.gz
+[ -d hadoop-2.7.7 ] || tar -xzf hadoop-2.7.7.tar.gz
cd $WORK_DIR
+# Use the second internal IP; using the first IP causes SSH timeouts
+HOST_IP=$(hostname -I | cut -f2 -d" ")
+
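+# Point the Hadoop core-site.xml and yarn-site.xml at this host instead of localhost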
+sed -i "s/localhost/$HOST_IP/g" core-site.xml
+sed -i "s/localhost/$HOST_IP/g" yarn-site.xml
+
cp ./core-site.xml ~/opt/hadoop-2.7.7/etc/hadoop/
cp ./hdfs-site.xml ~/opt/hadoop-2.7.7/etc/hadoop/
cp ./yarn-site.xml ~/opt/hadoop-2.7.7/etc/hadoop/
cp ./hadoop-env.sh ~/opt/hadoop-2.7.7/etc/hadoop/
cp ./spark-defaults.conf ~/opt/spark-3.0.0-bin-hadoop2.7/conf
+source ./setup-spark-envs.sh
+
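+# List this host as the only worker in the Hadoop and Spark slaves files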
+echo $HOST_IP > $HADOOP_HOME/etc/hadoop/slaves
+echo $HOST_IP > $SPARK_HOME/conf/slaves
+
# create directories
mkdir -p /tmp/run/hdfs/namenode
mkdir -p /tmp/run/hdfs/datanode
# hdfs format
-~/opt/hadoop-2.7.7/bin/hdfs namenode -format
-
-export HADOOP_HOME=~/opt/hadoop-2.7.7
-export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
-export SPARK_HOME=~/opt/spark-3.0.0-bin-hadoop2.7
-
-export PATH=$HADOOP_HOME/bin:$SPARK_HOME/bin:$PATH
+$HADOOP_HOME/bin/hdfs namenode -format
# start hdfs and yarn
$HADOOP_HOME/sbin/start-dfs.sh
diff --git a/dev/test-cluster/setup-python3-env.sh b/dev/test-cluster/setup-python3.sh
similarity index 100%
rename from dev/test-cluster/setup-python3-env.sh
rename to dev/test-cluster/setup-python3.sh
diff --git a/dev/test-cluster/setup-spark-envs.sh b/dev/test-cluster/setup-spark-envs.sh
new file mode 100755
index 000000000..3819f6ee8
--- /dev/null
+++ b/dev/test-cluster/setup-spark-envs.sh
@@ -0,0 +1,11 @@
+#!/usr/bin/env bash
+
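+# Hadoop and Spark are installed under ~/opt by setup-cluster.sh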
+export HADOOP_HOME=~/opt/hadoop-2.7.7
+export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$HADOOP_HOME/lib/native
+
+export SPARK_HOME=~/opt/spark-3.0.0-bin-hadoop2.7
+export PYTHONPATH=$SPARK_HOME/python:$PYTHONPATH
+export PYSPARK_PYTHON=python3
+
+export PATH=$HADOOP_HOME/bin:$SPARK_HOME/bin:$PATH
\ No newline at end of file
diff --git a/dev/test-cluster/spark-defaults.conf b/dev/test-cluster/spark-defaults.conf
index 1c25bb2ec..05f1c31e3 100644
--- a/dev/test-cluster/spark-defaults.conf
+++ b/dev/test-cluster/spark-defaults.conf
@@ -28,7 +28,7 @@
spark.master yarn
spark.serializer org.apache.spark.serializer.KryoSerializer
-spark.driver.memory 3g
+spark.driver.memory 1g
spark.executor.num 2
spark.executor.cores 1
-spark.executor.memory 4g
+spark.executor.memory 2g
diff --git a/dev/test-cluster/workloads/kmeans-pyspark.py b/dev/test-cluster/workloads/kmeans-pyspark.py
deleted file mode 100644
index cf93e6034..000000000
--- a/dev/test-cluster/workloads/kmeans-pyspark.py
+++ /dev/null
@@ -1,70 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-"""
-An example demonstrating k-means clustering.
-Run with:
- bin/spark-submit examples/src/main/python/ml/kmeans_example.py
-
-This example requires NumPy (http://www.numpy.org/).
-"""
-from __future__ import print_function
-import sys
-
-# $example on$
-from pyspark.ml.clustering import KMeans
-from pyspark.ml.evaluation import ClusteringEvaluator
-# $example off$
-
-from pyspark.sql import SparkSession
-
-if __name__ == "__main__":
- spark = SparkSession\
- .builder\
- .appName("KMeansExample")\
- .getOrCreate()
-
- if (len(sys.argv) != 2) :
- println("Require data file path as input parameter")
- sys.exit(1)
-
- # $example on$
- # Loads data.
- dataset = spark.read.format("libsvm").load(sys.argv[1])
-
- # Trains a k-means model.
- kmeans = KMeans().setK(2).setSeed(1)
- model = kmeans.fit(dataset)
-
- # Make predictions
- predictions = model.transform(dataset)
-
- # Evaluate clustering by computing Silhouette score
- evaluator = ClusteringEvaluator()
-
- silhouette = evaluator.evaluate(predictions)
- print("Silhouette with squared euclidean distance = " + str(silhouette))
-
- # Shows the result.
- centers = model.clusterCenters()
- print("Cluster Centers: ")
- for center in centers:
- print(center)
- # $example off$
-
- spark.stop()
-
diff --git a/dev/test-cluster/workloads/run-kmeans-pyspark.sh b/dev/test-cluster/workloads/run-kmeans-pyspark.sh
deleted file mode 100755
index e07f3f7b6..000000000
--- a/dev/test-cluster/workloads/run-kmeans-pyspark.sh
+++ /dev/null
@@ -1,48 +0,0 @@
-#!/usr/bin/env bash
-
-source ../envs.sh
-
-# Data file is from Spark Examples (data/mllib/sample_kmeans_data.txt), the data file should be copied to HDFS
-$HADOOP_HOME/bin/hadoop fs -mkdir -p $OAP_MLLIB_DATA_ROOT
-$HADOOP_HOME/bin/hadoop fs -copyFromLocal $SPARK_HOME/data/mllib/sample_kmeans_data.txt $OAP_MLLIB_DATA_ROOT
-
-# User should check the requested resources are acturally allocated by cluster manager or Intel MLlib will behave incorrectly
-SPARK_MASTER=yarn
-SPARK_DRIVER_MEMORY=1G
-SPARK_NUM_EXECUTORS=2
-SPARK_EXECUTOR_CORES=1
-SPARK_EXECUTOR_MEMORY=1G
-
-SPARK_DEFAULT_PARALLELISM=$(expr $SPARK_NUM_EXECUTORS '*' $SPARK_EXECUTOR_CORES '*' 2)
-
-# ======================================================= #
-
-# Check env
-if [[ -z $SPARK_HOME ]]; then
- echo SPARK_HOME not defined!
- exit 1
-fi
-
-if [[ -z $HADOOP_HOME ]]; then
- echo HADOOP_HOME not defined!
- exit 1
-fi
-
-APP_PY="$OAP_MLLIB_ROOT/dev/test-cluster/workloads/kmeans-pyspark.py"
-DATA_FILE=$OAP_MLLIB_DATA_ROOT/sample_kmeans_data.txt
-
-$SPARK_HOME/bin/spark-submit --master $SPARK_MASTER -v \
- --num-executors $SPARK_NUM_EXECUTORS \
- --driver-memory $SPARK_DRIVER_MEMORY \
- --executor-cores $SPARK_EXECUTOR_CORES \
- --executor-memory $SPARK_EXECUTOR_MEMORY \
- --conf "spark.serializer=org.apache.spark.serializer.KryoSerializer" \
- --conf "spark.default.parallelism=$SPARK_DEFAULT_PARALLELISM" \
- --conf "spark.sql.shuffle.partitions=$SPARK_DEFAULT_PARALLELISM" \
- --conf "spark.driver.extraClassPath=$SPARK_DRIVER_CLASSPATH" \
- --conf "spark.executor.extraClassPath=$SPARK_EXECUTOR_CLASSPATH" \
- --conf "spark.shuffle.reduceLocality.enabled=false" \
- --conf "spark.network.timeout=1200s" \
- --conf "spark.task.maxFailures=1" \
- --jars $OAP_MLLIB_JAR \
- $APP_PY $DATA_FILE
diff --git a/examples/als-hibench/pom.xml b/examples/als-hibench/pom.xml
deleted file mode 100644
index 68e02c256..000000000
--- a/examples/als-hibench/pom.xml
+++ /dev/null
@@ -1,100 +0,0 @@
-
- 4.0.0
-
- com.intel.oap
- oap-mllib-examples
- 0.9.0-with-spark-3.0.0
- jar
-
- ALSHiBenchExample
- https://github.com/Intel-bigdata/OAP
-
-
- UTF-8
- 2.12.10
- 2.12
- 3.0.0
-
-
-
-
-
- org.scala-lang
- scala-library
- 2.12.10
-
-
-
- com.github.scopt
- scopt_2.12
- 3.7.0
-
-
-
-
-
-
-
-
-
-
- org.apache.spark
- spark-sql_2.12
- ${spark.version}
- provided
-
-
-
- org.apache.spark
- spark-mllib_2.12
- ${spark.version}
- provided
-
-
-
-
-
-
-
- org.scala-tools
- maven-scala-plugin
- 2.15.2
-
-
-
- compile
- testCompile
-
-
-
-
- ${scala.version}
-
- -target:jvm-1.8
-
-
-
-
- maven-assembly-plugin
- 3.0.0
-
- false
-
- jar-with-dependencies
-
-
-
-
- assembly
- package
-
- single
-
-
-
-
-
-
-
-
diff --git a/examples/als-hibench/run-hibench-oap-mllib.sh b/examples/als-hibench/run-hibench-oap-mllib.sh
deleted file mode 100755
index 050b80558..000000000
--- a/examples/als-hibench/run-hibench-oap-mllib.sh
+++ /dev/null
@@ -1,73 +0,0 @@
-#!/usr/bin/env bash
-
-export HDFS_ROOT=hdfs://sr591:8020
-export OAP_MLLIB_ROOT=/home/xiaochang/Works/OAP-xwu99-als/oap-mllib
-
-SPARK_MASTER=yarn
-SPARK_DRIVER_MEMORY=16G
-SPARK_NUM_EXECUTORS=6
-SPARK_EXECUTOR_CORES=28
-SPARK_EXECUTOR_MEMORY_OVERHEAD=25G
-SPARK_EXECUTOR_MEMORY=100G
-
-SPARK_DEFAULT_PARALLELISM=$(expr $SPARK_NUM_EXECUTORS '*' $SPARK_EXECUTOR_CORES '*' 2)
-#SPARK_DEFAULT_PARALLELISM=$(expr $SPARK_NUM_EXECUTORS '*' $SPARK_EXECUTOR_CORES)
-
-# ======================================================= #
-
-# for log suffix
-SUFFIX=$( basename -s .sh "${BASH_SOURCE[0]}" )
-
-# Check envs
-if [[ -z $SPARK_HOME ]]; then
- echo SPARK_HOME not defined!
- exit 1
-fi
-
-if [[ -z $HADOOP_HOME ]]; then
- echo HADOOP_HOME not defined!
- exit 1
-fi
-
-export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
-
-# Target jar built
-OAP_MLLIB_JAR_NAME=oap-mllib-0.9.0-with-spark-3.0.0.jar
-OAP_MLLIB_JAR=$OAP_MLLIB_ROOT/mllib-dal/target/$OAP_MLLIB_JAR_NAME
-
-# Use absolute path
-SPARK_DRIVER_CLASSPATH=$OAP_MLLIB_JAR
-# Use relative path
-SPARK_EXECUTOR_CLASSPATH=./$OAP_MLLIB_JAR_NAME
-
-APP_JAR=target/oap-mllib-examples-0.9.0-with-spark-3.0.0.jar
-APP_CLASS=com.intel.hibench.sparkbench.ml.ALSExample
-
-HDFS_INPUT=hdfs://sr591:8020/HiBench/ALS/Input
-RANK=10
-NUM_ITERATIONS=1
-LAMBDA=0.1
-IMPLICIT=true
-
-/usr/bin/time -p $SPARK_HOME/bin/spark-submit --master $SPARK_MASTER -v \
- --num-executors $SPARK_NUM_EXECUTORS \
- --driver-memory $SPARK_DRIVER_MEMORY \
- --executor-cores $SPARK_EXECUTOR_CORES \
- --executor-memory $SPARK_EXECUTOR_MEMORY \
- --conf "spark.serializer=org.apache.spark.serializer.KryoSerializer" \
- --conf "spark.default.parallelism=$SPARK_DEFAULT_PARALLELISM" \
- --conf "spark.sql.shuffle.partitions=$SPARK_DEFAULT_PARALLELISM" \
- --conf "spark.driver.extraClassPath=$SPARK_DRIVER_CLASSPATH" \
- --conf "spark.executor.extraClassPath=$SPARK_EXECUTOR_CLASSPATH" \
- --conf "spark.shuffle.reduceLocality.enabled=false" \
- --conf "spark.executor.memoryOverhead=$SPARK_EXECUTOR_MEMORY_OVERHEAD" \
- --conf "spark.network.timeout=1200s" \
- --conf "spark.task.maxFailures=1" \
- --jars $OAP_MLLIB_JAR \
- --class $APP_CLASS \
- $APP_JAR \
- --rank $RANK --numIterations $NUM_ITERATIONS --implicitPrefs $IMPLICIT --lambda $LAMBDA \
- --numProductBlocks $SPARK_DEFAULT_PARALLELISM --numUserBlocks $SPARK_DEFAULT_PARALLELISM \
- $HDFS_INPUT \
- 2>&1 | tee ALS-$SUFFIX-$(date +%m%d_%H_%M_%S).log
-
diff --git a/examples/als-hibench/run-hibench-vanilla.sh b/examples/als-hibench/run-hibench-vanilla.sh
deleted file mode 100755
index 6cb6b3ae7..000000000
--- a/examples/als-hibench/run-hibench-vanilla.sh
+++ /dev/null
@@ -1,61 +0,0 @@
-#!/usr/bin/env bash
-
-export HDFS_ROOT=hdfs://sr591:8020
-
-SPARK_MASTER=yarn
-SPARK_DRIVER_MEMORY=16G
-SPARK_NUM_EXECUTORS=6
-SPARK_EXECUTOR_CORES=28
-SPARK_EXECUTOR_MEMORY_OVERHEAD=25G
-SPARK_EXECUTOR_MEMORY=100G
-
-SPARK_DEFAULT_PARALLELISM=$(expr $SPARK_NUM_EXECUTORS '*' $SPARK_EXECUTOR_CORES '*' 2)
-
-# ======================================================= #
-
-# for log suffix
-SUFFIX=$( basename -s .sh "${BASH_SOURCE[0]}" )
-
-# Check envs
-if [[ -z $SPARK_HOME ]]; then
- echo SPARK_HOME not defined!
- exit 1
-fi
-
-if [[ -z $HADOOP_HOME ]]; then
- echo HADOOP_HOME not defined!
- exit 1
-fi
-
-export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
-
-APP_JAR=target/oap-mllib-examples-0.9.0-with-spark-3.0.0.jar
-APP_CLASS=com.intel.hibench.sparkbench.ml.ALSExample
-
-HDFS_INPUT=hdfs://sr591:8020/HiBench/ALS/Input
-RANK=10
-NUM_ITERATIONS=1
-LAMBDA=0.1
-IMPLICIT=true
-
-/usr/bin/time -p $SPARK_HOME/bin/spark-submit --master $SPARK_MASTER -v \
- --num-executors $SPARK_NUM_EXECUTORS \
- --driver-memory $SPARK_DRIVER_MEMORY \
- --executor-cores $SPARK_EXECUTOR_CORES \
- --executor-memory $SPARK_EXECUTOR_MEMORY \
- --conf "spark.serializer=org.apache.spark.serializer.KryoSerializer" \
- --conf "spark.default.parallelism=$SPARK_DEFAULT_PARALLELISM" \
- --conf "spark.sql.shuffle.partitions=$SPARK_DEFAULT_PARALLELISM" \
- --conf "spark.driver.extraClassPath=$SPARK_DRIVER_CLASSPATH" \
- --conf "spark.executor.extraClassPath=$SPARK_EXECUTOR_CLASSPATH" \
- --conf "spark.shuffle.reduceLocality.enabled=false" \
- --conf "spark.executor.memoryOverhead=$SPARK_EXECUTOR_MEMORY_OVERHEAD" \
- --conf "spark.network.timeout=1200s" \
- --conf "spark.task.maxFailures=1" \
- --class $APP_CLASS \
- $APP_JAR \
- --rank $RANK --numIterations $NUM_ITERATIONS --implicitPrefs $IMPLICIT --lambda $LAMBDA \
- --numProductBlocks $SPARK_DEFAULT_PARALLELISM --numUserBlocks $SPARK_DEFAULT_PARALLELISM \
- $HDFS_INPUT \
- 2>&1 | tee ALS-$SUFFIX-$(date +%m%d_%H_%M_%S).log
-
diff --git a/examples/als-hibench/src/main/scala/com/intel/hibench/sparkbench/ml/ALSExample.scala b/examples/als-hibench/src/main/scala/com/intel/hibench/sparkbench/ml/ALSExample.scala
deleted file mode 100644
index 5a29bcc80..000000000
--- a/examples/als-hibench/src/main/scala/com/intel/hibench/sparkbench/ml/ALSExample.scala
+++ /dev/null
@@ -1,111 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.intel.hibench.sparkbench.ml
-
-import org.apache.spark.ml.evaluation.RegressionEvaluator
-import org.apache.spark.ml.recommendation.ALS
-import org.apache.spark.ml.recommendation.ALS.Rating
-import org.apache.spark.sql.SparkSession
-import scopt.OptionParser
-
-object ALSExample {
-
- case class Params(
- dataPath: String = null,
- numIterations: Int = 10,
- lambda: Double = 0.1,
- rank: Int = 10,
- numUserBlocks: Int = 10,
- numItemBlocks: Int = 10,
- implicitPrefs: Boolean = false)
-
- def main(args: Array[String]) {
- val defaultParams = Params()
-
- val parser = new OptionParser[Params]("ALS") {
- head("ALS: an example app for ALS on User-Item data.")
- opt[Int]("rank")
- .text(s"rank, default: ${defaultParams.rank}")
- .action((x, c) => c.copy(rank = x))
- opt[Int]("numIterations")
- .text(s"number of iterations, default: ${defaultParams.numIterations}")
- .action((x, c) => c.copy(numIterations = x))
- opt[Double]("lambda")
- .text(s"regularization parameter, default: ${defaultParams.lambda}")
- .action((x, c) => c.copy(lambda = x))
- opt[Int]("numUserBlocks")
- .text(s"number of user blocks, default: ${defaultParams.numUserBlocks}")
- .action((x, c) => c.copy(numUserBlocks = x))
- opt[Int]("numProductBlocks")
- .text(s"number of product blocks, default: ${defaultParams.numItemBlocks}")
- .action((x, c) => c.copy(numItemBlocks = x))
- opt[Boolean]("implicitPrefs")
- .text("implicit preference, default: ${defaultParams.implicitPrefs}")
- .action((x, c) => c.copy(implicitPrefs = x))
- arg[String]("")
- .required()
- .text("Input paths to a User-Product dataset of ratings")
- .action((x, c) => c.copy(dataPath = x))
- }
- parser.parse(args, defaultParams) match {
- case Some(params) => run(params)
- case _ => sys.exit(1)
- }
- }
-
- def run(params: Params): Unit = {
- val spark = SparkSession
- .builder
- .appName(s"ALS with $params")
- .getOrCreate()
- val sc = spark.sparkContext
-
- import spark.implicits._
-
- val ratings = sc.objectFile[Rating[Int]](params.dataPath).toDF()
-
- val Array(training, test) = ratings.randomSplit(Array(0.8, 0.2), 1L)
-
- // Build the recommendation model using ALS on the training data
- val als = new ALS()
- .setRank(params.rank)
- .setMaxIter(params.numIterations)
- .setRegParam(params.lambda)
- .setImplicitPrefs(params.implicitPrefs)
- .setNumUserBlocks(params.numUserBlocks)
- .setNumItemBlocks(params.numItemBlocks)
- .setUserCol("user")
- .setItemCol("item")
- .setRatingCol("rating")
- val model = als.fit(training)
-
- // Evaluate the model by computing the RMSE on the test data
- // Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
- model.setColdStartStrategy("drop")
- val predictions = model.transform(test)
-
- val evaluator = new RegressionEvaluator()
- .setMetricName("rmse")
- .setLabelCol("rating")
- .setPredictionCol("prediction")
- val rmse = evaluator.evaluate(predictions)
- println(s"Root-mean-square error = $rmse")
-
- spark.stop()
- }
-}
diff --git a/examples/als-pyspark/als-pyspark.py b/examples/als-pyspark/als-pyspark.py
index 8847ca2b9..12622a4fd 100644
--- a/examples/als-pyspark/als-pyspark.py
+++ b/examples/als-pyspark/als-pyspark.py
@@ -44,8 +44,8 @@
parts = lines.map(lambda row: row.value.split("::"))
ratingsRDD = parts.map(lambda p: Row(userId=int(p[0]), movieId=int(p[1]),
rating=float(p[2])))
- ratings = spark.createDataFrame(ratingsRDD)
- # (training, test) = ratings.randomSplit([0.8, 0.2])
+ ratings = spark.createDataFrame(ratingsRDD)
+ (training, test) = ratings.randomSplit([0.8, 0.2])
# Build the recommendation model using ALS on the training data
# Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
@@ -55,13 +55,13 @@
print("\nALS training with implicitPrefs={}, rank={}, maxIter={}, regParam={}, alpha={}, seed={}\n".format(
als.getImplicitPrefs(), als.getRank(), als.getMaxIter(), als.getRegParam(), als.getAlpha(), als.getSeed()
))
- model = als.fit(ratings)
+ model = als.fit(training)
- # Evaluate the model by computing the RMSE on the test data
- # predictions = model.transform(test)
- # evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
- # predictionCol="prediction")
- # rmse = evaluator.evaluate(predictions)
- # print("Root-mean-square error = " + str(rmse))
+ # Evaluate the model by computing the RMSE on the test data
+ predictions = model.transform(test)
+ evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
+ predictionCol="prediction")
+ rmse = evaluator.evaluate(predictions)
+ print("Root-mean-square error = " + str(rmse))
spark.stop()
diff --git a/examples/als-pyspark/run.sh b/examples/als-pyspark/run.sh
index b3ba1b6d2..044c857f6 100755
--- a/examples/als-pyspark/run.sh
+++ b/examples/als-pyspark/run.sh
@@ -1,62 +1,14 @@
#!/usr/bin/env bash
-# == User to customize the following environments ======= #
+source ../../conf/env.sh
-# Set user Spark and Hadoop home directory
-#export SPARK_HOME=/path/to/your/spark/home
-#export HADOOP_HOME=/path/to/your/hadoop/home
-# Set user HDFS Root
-export HDFS_ROOT=hdfs://sr549:8020
-# Set user Intel MLlib Root directory
-export OAP_MLLIB_ROOT=/home/xiaochang/Works/OAP-xwu99-als/oap-mllib
-# Set IP and Port for oneCCL KVS, you can select any one of the worker nodes and set CCL_KVS_IP_PORT to its IP and Port
-# IP can be got with `hostname -I`, if multiple IPs are returned, the first IP should be used. Port can be any available port.
-# For example, if one of the worker IP is 192.168.0.1 and an available port is 51234.
-# CCL_KVS_IP_PORT can be set in the format of 192.168.0.1_51234
-# Incorrectly setting this value will result in hanging when oneCCL initialize
-export CCL_KVS_IP_PORT=10.0.2.149_51234
-
-# Data file is from Spark Examples (data/mllib/sample_kmeans_data.txt), the data file should be copied to HDFS
+# Data file is converted from oneDAL examples ($DAALROOT/examples/daal/data/batch/implicit_als_csr.csv)
+# The data file should be copied to $HDFS_ROOT before running examples
DATA_FILE=data/onedal_als_csr_ratings.txt
-# == User to customize Spark executor cores and memory == #
-
-# User should check the requested resources are acturally allocated by cluster manager or Intel MLlib will behave incorrectly
-SPARK_MASTER=yarn
-SPARK_DRIVER_MEMORY=1G
-SPARK_NUM_EXECUTORS=2
-SPARK_EXECUTOR_CORES=1
-SPARK_EXECUTOR_MEMORY=1G
-
-SPARK_DEFAULT_PARALLELISM=$(expr $SPARK_NUM_EXECUTORS '*' $SPARK_EXECUTOR_CORES '*' 2)
-
-# ======================================================= #
-
-# Check env
-if [[ -z $SPARK_HOME ]]; then
- echo SPARK_HOME not defined!
- exit 1
-fi
-
-if [[ -z $HADOOP_HOME ]]; then
- echo HADOOP_HOME not defined!
- exit 1
-fi
-
-export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
-
-# Target jar built
-OAP_MLLIB_JAR_NAME=oap-mllib-0.9.0-with-spark-3.0.0.jar
-OAP_MLLIB_JAR=$OAP_MLLIB_ROOT/mllib-dal/target/$OAP_MLLIB_JAR_NAME
-
-# Use absolute path
-SPARK_DRIVER_CLASSPATH=$OAP_MLLIB_JAR
-# Use relative path
-SPARK_EXECUTOR_CLASSPATH=./$OAP_MLLIB_JAR_NAME
-
APP_PY=als-pyspark.py
-/usr/bin/time -p $SPARK_HOME/bin/spark-submit --master $SPARK_MASTER -v \
+time $SPARK_HOME/bin/spark-submit --master $SPARK_MASTER -v \
--num-executors $SPARK_NUM_EXECUTORS \
--driver-memory $SPARK_DRIVER_MEMORY \
--executor-cores $SPARK_EXECUTOR_CORES \
@@ -66,7 +18,6 @@ APP_PY=als-pyspark.py
--conf "spark.sql.shuffle.partitions=$SPARK_DEFAULT_PARALLELISM" \
--conf "spark.driver.extraClassPath=$SPARK_DRIVER_CLASSPATH" \
--conf "spark.executor.extraClassPath=$SPARK_EXECUTOR_CLASSPATH" \
- --conf "spark.executorEnv.CCL_KVS_IP_PORT=$CCL_KVS_IP_PORT" \
--conf "spark.shuffle.reduceLocality.enabled=false" \
--conf "spark.network.timeout=1200s" \
--conf "spark.task.maxFailures=1" \
diff --git a/examples/als-hibench/build.sh b/examples/als/build.sh
similarity index 52%
rename from examples/als-hibench/build.sh
rename to examples/als/build.sh
index 8cbc692be..3c01d1689 100755
--- a/examples/als-hibench/build.sh
+++ b/examples/als/build.sh
@@ -1,3 +1,4 @@
#!/usr/bin/env bash
-mvn clean package
\ No newline at end of file
+mvn clean package
+
diff --git a/examples/kmeans-hibench/pom.xml b/examples/als/pom.xml
similarity index 91%
rename from examples/kmeans-hibench/pom.xml
rename to examples/als/pom.xml
index 3f5b56e29..94a72189a 100644
--- a/examples/kmeans-hibench/pom.xml
+++ b/examples/als/pom.xml
@@ -4,14 +4,15 @@
com.intel.oap
oap-mllib-examples
- 1.1.0-with-spark-3.0.0
+ ${oap.version}-with-spark-${spark.version}
jar
- KMeansHiBenchExample
+ ALSExample
https://github.com/oap-project/oap-mllib.git
UTF-8
+ 1.1.0
2.12.10
2.12
3.0.0
@@ -31,12 +32,6 @@
3.7.0
-
- org.apache.mahout
- mahout-hdfs
- 14.1
-
-
org.apache.spark
spark-sql_2.12
diff --git a/examples/als/run.sh b/examples/als/run.sh
new file mode 100755
index 000000000..8a317dbc7
--- /dev/null
+++ b/examples/als/run.sh
@@ -0,0 +1,28 @@
+#!/usr/bin/env bash
+
+source ../../conf/env.sh
+
+# Data file is converted from oneDAL examples ($DAALROOT/examples/daal/data/batch/implicit_als_csr.csv)
+# The data file should be copied to $HDFS_ROOT before running examples
+DATA_FILE=data/onedal_als_csr_ratings.txt
+
+APP_JAR=target/oap-mllib-examples-$OAP_MLLIB_VERSION-with-spark-3.0.0.jar
+APP_CLASS=org.apache.spark.examples.ml.ALSExample
+
+time $SPARK_HOME/bin/spark-submit --master $SPARK_MASTER -v \
+ --num-executors $SPARK_NUM_EXECUTORS \
+ --driver-memory $SPARK_DRIVER_MEMORY \
+ --executor-cores $SPARK_EXECUTOR_CORES \
+ --executor-memory $SPARK_EXECUTOR_MEMORY \
+ --conf "spark.serializer=org.apache.spark.serializer.KryoSerializer" \
+ --conf "spark.default.parallelism=$SPARK_DEFAULT_PARALLELISM" \
+ --conf "spark.sql.shuffle.partitions=$SPARK_DEFAULT_PARALLELISM" \
+ --conf "spark.driver.extraClassPath=$SPARK_DRIVER_CLASSPATH" \
+ --conf "spark.executor.extraClassPath=$SPARK_EXECUTOR_CLASSPATH" \
+ --conf "spark.shuffle.reduceLocality.enabled=false" \
+ --conf "spark.network.timeout=1200s" \
+ --conf "spark.task.maxFailures=1" \
+ --jars $OAP_MLLIB_JAR \
+ --class $APP_CLASS \
+ $APP_JAR $DATA_FILE \
+ 2>&1 | tee ALS-$(date +%m%d_%H_%M_%S).log
diff --git a/examples/als/src/main/scala/org/apache/spark/examples/ml/ALSExample.scala b/examples/als/src/main/scala/org/apache/spark/examples/ml/ALSExample.scala
new file mode 100644
index 000000000..1071e906b
--- /dev/null
+++ b/examples/als/src/main/scala/org/apache/spark/examples/ml/ALSExample.scala
@@ -0,0 +1,91 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.ml.evaluation.RegressionEvaluator
+import org.apache.spark.ml.recommendation.ALS
+// $example off$
+import org.apache.spark.sql.SparkSession
+
+/**
+ * An example demonstrating ALS.
+ * Run with
+ * {{{
+ * bin/run-example ml.ALSExample
+ * }}}
+ */
+object ALSExample {
+
+ // $example on$
+ case class Rating(userId: Int, movieId: Int, rating: Float)
+ def parseRating(str: String): Rating = {
+ val fields = str.split("::")
+ assert(fields.size == 3)
+ Rating(fields(0).toInt, fields(1).toInt, fields(2).toFloat)
+ }
+ // $example off$
+
+ def main(args: Array[String]): Unit = {
+ val spark = SparkSession
+ .builder
+ .appName("ALSExample")
+ .getOrCreate()
+ import spark.implicits._
+
+ if (args.length != 1) {
+ println("Require data file path as input parameter")
+ sys.exit(1)
+ }
+
+ // $example on$
+ val ratings = spark.read.textFile(args(0))
+ .map(parseRating)
+ .toDF()
+ val Array(training, test) = ratings.randomSplit(Array(0.8, 0.2))
+
+ // Build the recommendation model using ALS on the training data
+ val als = new ALS()
+ .setImplicitPrefs(true)
+ .setRank(10)
+ .setMaxIter(5)
+ .setRegParam(0.01)
+ .setAlpha(40.0)
+ .setUserCol("userId")
+ .setItemCol("movieId")
+ .setRatingCol("rating")
+ val model = als.fit(training)
+
+ // Evaluate the model by computing the RMSE on the test data
+ // Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
+ model.setColdStartStrategy("drop")
+ val predictions = model.transform(test)
+
+ val evaluator = new RegressionEvaluator()
+ .setMetricName("rmse")
+ .setLabelCol("rating")
+ .setPredictionCol("prediction")
+ val rmse = evaluator.evaluate(predictions)
+ println(s"Root-mean-square error = $rmse")
+
+ spark.stop()
+ }
+}
+// scalastyle:on println
+
diff --git a/examples/build-all.sh b/examples/build-all.sh
new file mode 100755
index 000000000..62a95d255
--- /dev/null
+++ b/examples/build-all.sh
@@ -0,0 +1,8 @@
+#!/usr/bin/env bash
+
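+# Build each Scala example (kmeans, pca, als) with Maven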
+for dir in kmeans pca als
+do
+ cd $dir
+ ./build.sh
+ cd ..
+done
diff --git a/examples/data/onedal_als_csr_ratings.txt b/examples/data/onedal_als_csr_ratings.txt
new file mode 100644
index 000000000..e3f5d3fae
--- /dev/null
+++ b/examples/data/onedal_als_csr_ratings.txt
@@ -0,0 +1,167 @@
+0::17::0.938283
+1::1::0.124207
+1::8::0.411504
+1::13::0.746992
+1::16::0.169437
+2::6::0.42021
+2::12::0.322446
+2::17::0.561519
+3::2::0.760795
+4::6::0.252532
+4::8::0.155777
+5::1::0.593943
+5::2::0.5289
+5::9::0.827276
+5::16::0.291741
+6::1::0.642664
+6::9::0.843839
+6::18::0.801948
+7::17::0.162832
+8::15::0.764005
+8::19::0.545399
+9::4::0.871051
+9::8::0.2094
+9::9::0.900738
+9::17::0.998866
+9::19::0.154139
+10::0::0.59703
+10::6::0.727774
+10::15::0.877197
+11::3::0.598636
+11::5::0.999655
+11::7::0.23638
+11::14::0.463678
+11::17::0.802767
+11::18::0.828629
+12::3::0.449008
+12::4::0.108126
+12::5::0.17944
+12::11::0.14992
+12::15::0.645085
+12::17::0.356908
+13::7::0.54838
+13::13::0.719667
+14::1::0.144589
+14::2::0.956232
+14::4::0.410129
+14::5::0.237406
+14::9::0.701227
+15::5::0.598455
+15::13::0.534545
+15::18::0.85741
+16::0::0.08512
+16::1::0.306062
+17::2::0.87395
+17::6::0.680554
+17::15::0.383043
+17::20::0.16813
+18::2::0.641488
+18::4::0.542261
+18::10::0.69714
+18::11::0.776203
+18::17::0.498716
+18::18::0.788093
+18::20::0.52406
+19::4::0.10402
+19::7::0.276732
+20::1::0.666263
+20::6::0.280048
+21::5::0.898574
+21::6::0.892768
+21::9::0.061185
+21::18::0.691028
+22::7::0.813807
+22::16::0.293614
+23::10::0.217541
+23::14::0.98958
+23::15::0.20269
+24::7::0.67432
+24::8::0.520428
+24::10::0.138665
+24::13::0.364809
+24::14::0.970167
+24::19::0.68381
+25::0::0.166145
+25::1::0.194913
+25::2::0.265607
+25::18::0.740052
+25::19::0.209377
+26::2::0.122306
+26::8::0.742562
+26::11::0.405206
+26::16::0.442783
+27::12::0.010994
+27::16::0.632512
+27::17::0.421555
+28::1::0.854519
+28::3::0.843519
+28::7::0.388753
+28::12::0.020689
+28::13::0.071531
+28::14::0.537579
+28::16::0.079456
+29::17::0.548573
+30::1::0.959732
+30::3::0.913432
+30::4::0.88553
+31::0::0.653987
+31::13::0.736684
+31::20::0.629751
+32::2::0.420538
+32::6::0.110444
+32::12::0.55993
+32::13::0.730668
+32::17::0.588223
+32::18::0.188579
+33::8::0.717314
+33::9::0.249797
+33::10::0.404286
+33::18::0.83197
+34::4::0.364628
+34::6::0.023655
+34::10::0.94169
+35::3::0.015393
+35::11::0.356229
+35::18::0.328241
+36::0::0.03866
+36::1::0.21685
+36::5::0.725101
+36::8::0.191972
+36::9::0.658415
+36::12::0.592436
+37::2::0.812225
+37::4::0.411506
+37::6::0.613151
+37::9::0.345352
+37::10::0.89008
+37::12::0.139664
+37::17::0.7633
+37::20::0.488679
+38::0::0.594923
+38::1::0.441561
+38::13::0.467085
+39::0::0.949957
+39::7::0.360488
+39::12::0.354949
+39::15::0.976556
+39::17::0.024024
+40::1::0.121904
+40::4::0.871203
+40::8::0.102956
+41::9::0.593112
+42::3::0.542693
+42::4::0.340404
+42::6::0.997438
+42::7::0.335679
+42::10::0.657767
+42::11::0.382666
+42::14::0.621782
+42::17::0.150028
+43::16::0.318803
+43::17::0.83869
+44::10::0.460685
+45::0::0.926797
+45::4::0.257822
+45::8::0.714351
+45::17::0.333358
+45::18::0.134587
diff --git a/examples/pca-pyspark/data/pca_data.csv b/examples/data/pca_data.csv
similarity index 100%
rename from examples/pca-pyspark/data/pca_data.csv
rename to examples/data/pca_data.csv
diff --git a/examples/data/sample_kmeans_data.txt b/examples/data/sample_kmeans_data.txt
new file mode 100644
index 000000000..50013776b
--- /dev/null
+++ b/examples/data/sample_kmeans_data.txt
@@ -0,0 +1,6 @@
+0 1:0.0 2:0.0 3:0.0
+1 1:0.1 2:0.1 3:0.1
+2 1:0.2 2:0.2 3:0.2
+3 1:9.0 2:9.0 3:9.0
+4 1:9.1 2:9.1 3:9.1
+5 1:9.2 2:9.2 3:9.2
diff --git a/examples/kmeans-hibench/run-hibench-oap-mllib.sh b/examples/kmeans-hibench/run-hibench-oap-mllib.sh
deleted file mode 100755
index caa42584f..000000000
--- a/examples/kmeans-hibench/run-hibench-oap-mllib.sh
+++ /dev/null
@@ -1,86 +0,0 @@
-#!/usr/bin/env bash
-
-# == User to customize the following environments ======= #
-
-# Set user Spark and Hadoop home directory
-export SPARK_HOME=/path/to/your/spark/home
-export HADOOP_HOME=/path/to/your/hadoop/home
-# Set user HDFS Root
-export HDFS_ROOT=hdfs://your_hostname:8020
-# Set user Intel MLlib Root directory
-export OAP_MLLIB_ROOT=/path/to/your/OAP/oap-mllib
-# Set IP and Port for oneCCL KVS, you can select any one of the worker nodes and set CCL_KVS_IP_PORT to its IP and Port
-# IP can be got with `hostname -I`, if multiple IPs are returned, the first IP should be used. Port can be any available port.
-# For example, if one of the worker IP is 192.168.0.1 and an available port is 51234.
-# CCL_KVS_IP_PORT can be set in the format of 192.168.0.1_51234
-# Incorrectly setting this value will result in hanging when oneCCL initialize
-export CCL_KVS_IP_PORT=192.168.0.1_51234
-
-# == User to customize Spark executor cores and memory == #
-
-# User should check the requested resources are acturally allocated by cluster manager or Intel MLlib will behave incorrectly
-SPARK_MASTER=yarn
-SPARK_DRIVER_MEMORY=8G
-SPARK_NUM_EXECUTORS=6
-SPARK_EXECUTOR_CORES=15
-SPARK_EXECUTOR_MEMORY_OVERHEAD=25G
-SPARK_EXECUTOR_MEMORY=50G
-
-SPARK_DEFAULT_PARALLELISM=$(expr $SPARK_NUM_EXECUTORS '*' $SPARK_EXECUTOR_CORES '*' 2)
-
-# ======================================================= #
-
-# for log suffix
-SUFFIX=$( basename -s .sh "${BASH_SOURCE[0]}" )
-
-# Check envs
-if [[ -z $SPARK_HOME ]]; then
- echo SPARK_HOME not defined!
- exit 1
-fi
-
-if [[ -z $HADOOP_HOME ]]; then
- echo HADOOP_HOME not defined!
- exit 1
-fi
-
-export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
-
-# Target jar built
-OAP_MLLIB_JAR_NAME=oap-mllib-0.9.0-with-spark-3.0.0.jar
-OAP_MLLIB_JAR=$OAP_MLLIB_ROOT/mllib-dal/target/$OAP_MLLIB_JAR_NAME
-
-# Use absolute path
-SPARK_DRIVER_CLASSPATH=$OAP_MLLIB_JAR
-# Use relative path
-SPARK_EXECUTOR_CLASSPATH=./$OAP_MLLIB_JAR_NAME
-
-APP_JAR=target/oap-mllib-examples-0.9.0-with-spark-3.0.0.jar
-APP_CLASS=com.intel.hibench.sparkbench.ml.DenseKMeansDS
-
-K=200
-INIT_MODE=Random
-MAX_ITERATION=20
-INPUT_HDFS=$HDFS_ROOT/HiBench/Kmeans/Input/samples
-
-/usr/bin/time -p $SPARK_HOME/bin/spark-submit --master $SPARK_MASTER -v \
- --num-executors $SPARK_NUM_EXECUTORS \
- --driver-memory $SPARK_DRIVER_MEMORY \
- --executor-cores $SPARK_EXECUTOR_CORES \
- --executor-memory $SPARK_EXECUTOR_MEMORY \
- --conf "spark.serializer=org.apache.spark.serializer.KryoSerializer" \
- --conf "spark.default.parallelism=$SPARK_DEFAULT_PARALLELISM" \
- --conf "spark.sql.shuffle.partitions=$SPARK_DEFAULT_PARALLELISM" \
- --conf "spark.driver.extraClassPath=$SPARK_DRIVER_CLASSPATH" \
- --conf "spark.executor.extraClassPath=$SPARK_EXECUTOR_CLASSPATH" \
- --conf "spark.executorEnv.CCL_KVS_IP_PORT=$CCL_KVS_IP_PORT" \
- --conf "spark.shuffle.reduceLocality.enabled=false" \
- --conf "spark.executor.memoryOverhead=$SPARK_EXECUTOR_MEMORY_OVERHEAD" \
- --conf "spark.memory.fraction=0.8" \
- --conf "spark.network.timeout=1200s" \
- --conf "spark.task.maxFailures=1" \
- --jars $OAP_MLLIB_JAR \
- --class $APP_CLASS \
- $APP_JAR \
- -k $K --initMode $INIT_MODE --numIterations $MAX_ITERATION $INPUT_HDFS \
- 2>&1 | tee KMeansHiBench-$SUFFIX-$(date +%m%d_%H_%M_%S).log
diff --git a/examples/kmeans-hibench/run-hibench-vanilla.sh b/examples/kmeans-hibench/run-hibench-vanilla.sh
deleted file mode 100755
index 475c25aff..000000000
--- a/examples/kmeans-hibench/run-hibench-vanilla.sh
+++ /dev/null
@@ -1,58 +0,0 @@
-#!/usr/bin/env bash
-
-# == User to customize the following environments ======= #
-
-# Set user Spark and Hadoop home directory
-export SPARK_HOME=/path/to/your/spark/home
-export HADOOP_HOME=/path/to/your/hadoop/home
-# Set user HDFS Root
-export HDFS_ROOT=hdfs://your_hostname:8020
-
-# == User to customize Spark executor cores and memory == #
-
-SPARK_MASTER=yarn
-SPARK_DRIVER_MEMORY=8G
-SPARK_NUM_EXECUTORS=6
-SPARK_EXECUTOR_CORES=15
-SPARK_EXECUTOR_MEMORY=75G
-
-SPARK_DEFAULT_PARALLELISM=$(expr $SPARK_NUM_EXECUTORS '*' $SPARK_EXECUTOR_CORES '*' 2)
-
-# ======================================================= #
-
-# for log suffix
-SUFFIX=$( basename -s .sh "${BASH_SOURCE[0]}" )
-
-# Check envs
-if [[ -z $SPARK_HOME ]]; then
- echo SPARK_HOME not defined!
- exit 1
-fi
-
-if [[ -z $HADOOP_HOME ]]; then
- echo HADOOP_HOME not defined!
- exit 1
-fi
-
-export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
-
-APP_JAR=target/oap-mllib-examples-0.9.0-with-spark-3.0.0.jar
-APP_CLASS=com.intel.hibench.sparkbench.ml.DenseKMeansDS
-
-K=200
-INIT_MODE=Random
-MAX_ITERATION=20
-INPUT_HDFS=$HDFS_ROOT/HiBench/Kmeans/Input/samples
-
-/usr/bin/time -p $SPARK_HOME/bin/spark-submit --master $SPARK_MASTER -v \
- --num-executors $SPARK_NUM_EXECUTORS \
- --driver-memory $SPARK_DRIVER_MEMORY \
- --executor-cores $SPARK_EXECUTOR_CORES \
- --executor-memory $SPARK_EXECUTOR_MEMORY \
- --conf "spark.serializer=org.apache.spark.serializer.KryoSerializer" \
- --conf "spark.default.parallelism=$SPARK_DEFAULT_PARALLELISM" \
- --conf "spark.sql.shuffle.partitions=$SPARK_DEFAULT_PARALLELISM" \
- --class $APP_CLASS \
- $APP_JAR \
- -k $K --initMode $INIT_MODE --numIterations $MAX_ITERATION $INPUT_HDFS \
- 2>&1 | tee KMeansHiBench-$SUFFIX-$(date +%m%d_%H_%M_%S).log
diff --git a/examples/kmeans-hibench/src/main/scala/com/intel/hibench/sparkbench/ml/DenseKMeansDS.scala b/examples/kmeans-hibench/src/main/scala/com/intel/hibench/sparkbench/ml/DenseKMeansDS.scala
deleted file mode 100644
index 3a949bb1c..000000000
--- a/examples/kmeans-hibench/src/main/scala/com/intel/hibench/sparkbench/ml/DenseKMeansDS.scala
+++ /dev/null
@@ -1,107 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.intel.hibench.sparkbench.ml
-
-import breeze.linalg.DenseVector
-import org.apache.hadoop.io.LongWritable
-import org.apache.mahout.math.VectorWritable
-import org.apache.spark.ml.clustering.KMeans
-import org.apache.spark.ml.evaluation.ClusteringEvaluator
-import org.apache.spark.ml.linalg.{Vector, Vectors}
-import org.apache.spark.{SparkConf, SparkContext}
-import org.apache.spark.sql._
-import scopt.OptionParser
-import org.apache.spark.sql.SparkSession
-
-object DenseKMeansDS {
-
- object InitializationMode extends Enumeration {
- type InitializationMode = Value
- val Random, Parallel = Value
- }
-
- import com.intel.hibench.sparkbench.ml.DenseKMeansDS.InitializationMode._
-
- case class Params(input: String = null,
- k: Int = -1,
- numIterations: Int = 10,
- initializationMode: InitializationMode = Random)
-
- def main(args: Array[String]) {
- val defaultParams = Params()
-
- val parser = new OptionParser[Params]("DenseKMeans") {
- head("DenseKMeans: an example k-means app for dense data.")
- opt[Int]('k', "k")
- .required()
- .text(s"number of clusters, required")
- .action((x, c) => c.copy(k = x))
- opt[Int]("numIterations")
- .text(s"number of iterations, default; ${defaultParams.numIterations}")
- .action((x, c) => c.copy(numIterations = x))
- opt[String]("initMode")
- .text(s"initialization mode (${InitializationMode.values.mkString(",")}), " +
- s"default: ${defaultParams.initializationMode}")
- .action((x, c) => c.copy(initializationMode = InitializationMode.withName(x)))
- arg[String]("")
- .text("input paths to examples")
- .required()
- .action((x, c) => c.copy(input = x))
- }
-
- parser.parse(args, defaultParams).map { params =>
- run(params)
- }.getOrElse {
- sys.exit(1)
- }
- }
-
- def run(params: Params) {
- val spark = SparkSession
- .builder
- .appName(s"DenseKMeansDS with $params")
- .getOrCreate()
- import spark.implicits._
-
- val sc = spark.sparkContext
-
- val data = sc.sequenceFile[LongWritable, VectorWritable](params.input)
-
- // Should use Tuple1 to warp around for calling toDF
- val dataset = data.map { case (k, v) =>
- var vector: Array[Double] = new Array[Double](v.get().size)
- for (i <- 0 until v.get().size) vector(i) = v.get().get(i)
- Tuple1(Vectors.dense(vector))
- }.toDF("features")
-
- val initMode = params.initializationMode match {
- case Random => "random"
- case Parallel => "k-means||"
- }
-
- val model = new KMeans()
- .setInitMode(initMode)
- .setK(params.k)
- .setMaxIter(params.numIterations)
- .setSeed(1L)
- .fit(dataset)
-
- spark.stop()
- }
-}
-
diff --git a/examples/kmeans-pyspark/run.sh b/examples/kmeans-pyspark/run.sh
index d029bf294..0fa2a7bcb 100755
--- a/examples/kmeans-pyspark/run.sh
+++ b/examples/kmeans-pyspark/run.sh
@@ -1,62 +1,14 @@
#!/usr/bin/env bash
-# == User to customize the following environments ======= #
+source ../../conf/env.sh
-# Set user Spark and Hadoop home directory
-export SPARK_HOME=/path/to/your/spark/home
-export HADOOP_HOME=/path/to/your/hadoop/home
-# Set user HDFS Root
-export HDFS_ROOT=hdfs://your_hostname:8020
-# Set user Intel MLlib Root directory
-export OAP_MLLIB_ROOT=/path/to/your/OAP/oap-mllib
-# Set IP and Port for oneCCL KVS, you can select any one of the worker nodes and set CCL_KVS_IP_PORT to its IP and Port
-# IP can be got with `hostname -I`, if multiple IPs are returned, the first IP should be used. Port can be any available port.
-# For example, if one of the worker IP is 192.168.0.1 and an available port is 51234.
-# CCL_KVS_IP_PORT can be set in the format of 192.168.0.1_51234
-# Incorrectly setting this value will result in hanging when oneCCL initialize
-export CCL_KVS_IP_PORT=192.168.0.1_51234
-
-# Data file is from Spark Examples (data/mllib/sample_kmeans_data.txt), the data file should be copied to HDFS
+# Data file is from Spark Examples (data/mllib/sample_kmeans_data.txt) and is placed in examples/data
+# The data file should be copied to $HDFS_ROOT before running examples
DATA_FILE=data/sample_kmeans_data.txt
-# == User to customize Spark executor cores and memory == #
-
-# User should check the requested resources are acturally allocated by cluster manager or Intel MLlib will behave incorrectly
-SPARK_MASTER=yarn
-SPARK_DRIVER_MEMORY=1G
-SPARK_NUM_EXECUTORS=2
-SPARK_EXECUTOR_CORES=1
-SPARK_EXECUTOR_MEMORY=1G
-
-SPARK_DEFAULT_PARALLELISM=$(expr $SPARK_NUM_EXECUTORS '*' $SPARK_EXECUTOR_CORES '*' 2)
-
-# ======================================================= #
-
-# Check env
-if [[ -z $SPARK_HOME ]]; then
- echo SPARK_HOME not defined!
- exit 1
-fi
-
-if [[ -z $HADOOP_HOME ]]; then
- echo HADOOP_HOME not defined!
- exit 1
-fi
-
-export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
-
-# Target jar built
-OAP_MLLIB_JAR_NAME=oap-mllib-0.9.0-with-spark-3.0.0.jar
-OAP_MLLIB_JAR=$OAP_MLLIB_ROOT/mllib-dal/target/$OAP_MLLIB_JAR_NAME
-
-# Use absolute path
-SPARK_DRIVER_CLASSPATH=$OAP_MLLIB_JAR
-# Use relative path
-SPARK_EXECUTOR_CLASSPATH=./$OAP_MLLIB_JAR_NAME
-
APP_PY=kmeans-pyspark.py
-/usr/bin/time -p $SPARK_HOME/bin/spark-submit --master $SPARK_MASTER -v \
+time $SPARK_HOME/bin/spark-submit --master $SPARK_MASTER -v \
--num-executors $SPARK_NUM_EXECUTORS \
--driver-memory $SPARK_DRIVER_MEMORY \
--executor-cores $SPARK_EXECUTOR_CORES \
@@ -66,7 +18,6 @@ APP_PY=kmeans-pyspark.py
--conf "spark.sql.shuffle.partitions=$SPARK_DEFAULT_PARALLELISM" \
--conf "spark.driver.extraClassPath=$SPARK_DRIVER_CLASSPATH" \
--conf "spark.executor.extraClassPath=$SPARK_EXECUTOR_CLASSPATH" \
- --conf "spark.executorEnv.CCL_KVS_IP_PORT=$CCL_KVS_IP_PORT" \
--conf "spark.shuffle.reduceLocality.enabled=false" \
--conf "spark.network.timeout=1200s" \
--conf "spark.task.maxFailures=1" \
diff --git a/examples/kmeans/pom.xml b/examples/kmeans/pom.xml
index 71c4ea6d7..476a98ce6 100644
--- a/examples/kmeans/pom.xml
+++ b/examples/kmeans/pom.xml
@@ -4,7 +4,7 @@
com.intel.oap
oap-mllib-examples
- 1.1.0-with-spark-3.0.0
+ ${oap.version}-with-spark-${spark.version}
jar
KMeansExample
@@ -12,6 +12,7 @@
UTF-8
+ 1.1.0
2.12.10
2.12
3.0.0
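The example POMs now build their version from properties rather than the hard-coded 1.1.0-with-spark-3.0.0 string. With oap.version=1.1.0 and spark.version=3.0.0 (the values added in this hunk), the artifact keeps the name the run scripts expect; a quick check after building (a sketch, assuming Maven's default finalName of artifactId-version):

    cd examples/kmeans
    mvn clean package
    ls target/oap-mllib-examples-1.1.0-with-spark-3.0.0.jar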
diff --git a/examples/kmeans/run.sh b/examples/kmeans/run.sh
index bb0f9ac78..00b782464 100755
--- a/examples/kmeans/run.sh
+++ b/examples/kmeans/run.sh
@@ -1,64 +1,15 @@
#!/usr/bin/env bash
-# == User to customize the following environments ======= #
+source ../../conf/env.sh
-# Set user Spark and Hadoop home directory
-export SPARK_HOME=/path/to/your/spark/home
-export HADOOP_HOME=/path/to/your/hadoop/home
-# Set user HDFS Root
-export HDFS_ROOT=hdfs://your_hostname:8020
-# Set user Intel MLlib Root directory
-export OAP_MLLIB_ROOT=/path/to/your/OAP/oap-mllib
-# Set IP and Port for oneCCL KVS, you can select any one of the worker nodes and set CCL_KVS_IP_PORT to its IP and Port
-# IP can be got with `hostname -I`, if multiple IPs are returned, the first IP should be used. Port can be any available port.
-# For example, if one of the worker IP is 192.168.0.1 and an available port is 51234.
-# CCL_KVS_IP_PORT can be set in the format of 192.168.0.1_51234
-# Incorrectly setting this value will result in hanging when oneCCL initialize
-export CCL_KVS_IP_PORT=192.168.0.1_51234
-
-# Data file is from Spark examples' data (data/mllib/sample_kmeans_data.txt)
-# This data file should be copied to HDFS hdfs://your_hostname:8020/user//data/sample_kmeans_data.txt
+# Data file is from Spark Examples (data/mllib/sample_kmeans_data.txt) and is placed in examples/data
+# The data file should be copied to $HDFS_ROOT before running examples
DATA_FILE=data/sample_kmeans_data.txt
-# == User to customize Spark executor cores and memory == #
-
-# User should check the requested resources are acturally allocated by cluster manager or Intel MLlib will behave incorrectly
-SPARK_MASTER=yarn
-SPARK_DRIVER_MEMORY=1G
-SPARK_NUM_EXECUTORS=2
-SPARK_EXECUTOR_CORES=1
-SPARK_EXECUTOR_MEMORY=1G
-
-SPARK_DEFAULT_PARALLELISM=$(expr $SPARK_NUM_EXECUTORS '*' $SPARK_EXECUTOR_CORES '*' 2)
-
-# ======================================================= #
-
-# Check envs
-if [[ -z $SPARK_HOME ]]; then
- echo SPARK_HOME not defined!
- exit 1
-fi
-
-if [[ -z $HADOOP_HOME ]]; then
- echo HADOOP_HOME not defined!
- exit 1
-fi
-
-export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
-
-# Target jar built
-OAP_MLLIB_JAR_NAME=oap-mllib-0.9.0-with-spark-3.0.0.jar
-OAP_MLLIB_JAR=$OAP_MLLIB_ROOT/mllib-dal/target/$OAP_MLLIB_JAR_NAME
-
-# Use absolute path
-SPARK_DRIVER_CLASSPATH=$OAP_MLLIB_JAR
-# Use relative path
-SPARK_EXECUTOR_CLASSPATH=./$OAP_MLLIB_JAR_NAME
-
-APP_JAR=target/oap-mllib-examples-0.9.0-with-spark-3.0.0.jar
+APP_JAR=target/oap-mllib-examples-$OAP_MLLIB_VERSION-with-spark-3.0.0.jar
APP_CLASS=org.apache.spark.examples.ml.KMeansExample
-/usr/bin/time -p $SPARK_HOME/bin/spark-submit --master $SPARK_MASTER -v \
+time $SPARK_HOME/bin/spark-submit --master $SPARK_MASTER -v \
--num-executors $SPARK_NUM_EXECUTORS \
--driver-memory $SPARK_DRIVER_MEMORY \
--executor-cores $SPARK_EXECUTOR_CORES \
@@ -68,7 +19,6 @@ APP_CLASS=org.apache.spark.examples.ml.KMeansExample
--conf "spark.sql.shuffle.partitions=$SPARK_DEFAULT_PARALLELISM" \
--conf "spark.driver.extraClassPath=$SPARK_DRIVER_CLASSPATH" \
--conf "spark.executor.extraClassPath=$SPARK_EXECUTOR_CLASSPATH" \
- --conf "spark.executorEnv.CCL_KVS_IP_PORT=$CCL_KVS_IP_PORT" \
--conf "spark.shuffle.reduceLocality.enabled=false" \
--conf "spark.network.timeout=1200s" \
--conf "spark.task.maxFailures=1" \
diff --git a/examples/pca-pyspark/run.sh b/examples/pca-pyspark/run.sh
index 20010ce6b..b3776e877 100755
--- a/examples/pca-pyspark/run.sh
+++ b/examples/pca-pyspark/run.sh
@@ -1,57 +1,15 @@
#!/usr/bin/env bash
-# == User to customize the following environments ======= #
+source ../../conf/env.sh
-# Set user Spark and Hadoop home directory
-export SPARK_HOME=/path/to/your/spark/home
-export HADOOP_HOME=/path/to/your/hadoop/home
-# Set user HDFS Root
-export HDFS_ROOT=hdfs://your_hostname:8020
-# Set user Intel MLlib Root directory
-export OAP_MLLIB_ROOT=/path/to/your/OAP/oap-mllib
-
-# CSV data is the same as in Spark example "ml/pca_example.py", the data file should be copied to HDFS
+# CSV data is the same as in Spark example "ml/pca_example.py"
+# The data file should be copied to $HDFS_ROOT before running examples
DATA_FILE=data/pca_data.csv
-# == User to customize Spark executor cores and memory == #
-
-# User should check the requested resources are acturally allocated by cluster manager or Intel MLlib will behave incorrectly
-SPARK_MASTER=yarn
-SPARK_DRIVER_MEMORY=1G
-SPARK_NUM_EXECUTORS=2
-SPARK_EXECUTOR_CORES=1
-SPARK_EXECUTOR_MEMORY=1G
-
-SPARK_DEFAULT_PARALLELISM=$(expr $SPARK_NUM_EXECUTORS '*' $SPARK_EXECUTOR_CORES '*' 2)
-
-# ======================================================= #
-
-# Check env
-if [[ -z $SPARK_HOME ]]; then
- echo SPARK_HOME not defined!
- exit 1
-fi
-
-if [[ -z $HADOOP_HOME ]]; then
- echo HADOOP_HOME not defined!
- exit 1
-fi
-
-export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
-
-# Target jar built
-OAP_MLLIB_JAR_NAME=oap-mllib-0.9.0-with-spark-3.0.0.jar
-OAP_MLLIB_JAR=$OAP_MLLIB_ROOT/mllib-dal/target/$OAP_MLLIB_JAR_NAME
-
-# Use absolute path
-SPARK_DRIVER_CLASSPATH=$OAP_MLLIB_JAR
-# Use relative path
-SPARK_EXECUTOR_CLASSPATH=./$OAP_MLLIB_JAR_NAME
-
APP_PY=pca-pyspark.py
K=3
-/usr/bin/time -p $SPARK_HOME/bin/spark-submit --master $SPARK_MASTER -v \
+time $SPARK_HOME/bin/spark-submit --master $SPARK_MASTER -v \
--num-executors $SPARK_NUM_EXECUTORS \
--driver-memory $SPARK_DRIVER_MEMORY \
--executor-cores $SPARK_EXECUTOR_CORES \
diff --git a/examples/kmeans-hibench/build.sh b/examples/pca/build.sh
similarity index 100%
rename from examples/kmeans-hibench/build.sh
rename to examples/pca/build.sh
diff --git a/examples/pca/pom.xml b/examples/pca/pom.xml
index 06cf2343c..75641da81 100644
--- a/examples/pca/pom.xml
+++ b/examples/pca/pom.xml
@@ -4,7 +4,7 @@
com.intel.oap
oap-mllib-examples
- 1.1.0-with-spark-3.0.0
+ ${oap.version}-with-spark-${spark.version}
jar
PCAExample
@@ -12,6 +12,7 @@
UTF-8
+ 1.1.0
2.12.10
2.12
3.0.0
diff --git a/examples/pca/run.sh b/examples/pca/run.sh
index da373645b..64d22c6fe 100755
--- a/examples/pca/run.sh
+++ b/examples/pca/run.sh
@@ -1,3 +1,24 @@
#!/usr/bin/env bash
-mvn clean package
+source ../../conf/env.sh
+
+APP_JAR=target/oap-mllib-examples-$OAP_MLLIB_VERSION-with-spark-3.0.0.jar
+APP_CLASS=org.apache.spark.examples.ml.PCAExample
+
+time $SPARK_HOME/bin/spark-submit --master $SPARK_MASTER -v \
+ --num-executors $SPARK_NUM_EXECUTORS \
+ --driver-memory $SPARK_DRIVER_MEMORY \
+ --executor-cores $SPARK_EXECUTOR_CORES \
+ --executor-memory $SPARK_EXECUTOR_MEMORY \
+ --conf "spark.serializer=org.apache.spark.serializer.KryoSerializer" \
+ --conf "spark.default.parallelism=$SPARK_DEFAULT_PARALLELISM" \
+ --conf "spark.sql.shuffle.partitions=$SPARK_DEFAULT_PARALLELISM" \
+ --conf "spark.driver.extraClassPath=$SPARK_DRIVER_CLASSPATH" \
+ --conf "spark.executor.extraClassPath=$SPARK_EXECUTOR_CLASSPATH" \
+ --conf "spark.shuffle.reduceLocality.enabled=false" \
+ --conf "spark.network.timeout=1200s" \
+ --conf "spark.task.maxFailures=1" \
+ --jars $OAP_MLLIB_JAR \
+ --class $APP_CLASS \
+ $APP_JAR $DATA_FILE $K \
+ 2>&1 | tee PCA-$(date +%m%d_%H_%M_%S).log
diff --git a/examples/run-all-pyspark.sh b/examples/run-all-pyspark.sh
new file mode 100755
index 000000000..06decd242
--- /dev/null
+++ b/examples/run-all-pyspark.sh
@@ -0,0 +1,8 @@
+#!/usr/bin/env bash
+
+for dir in kmeans-pyspark pca-pyspark als-pyspark
+do
+ cd $dir
+ ./run.sh
+ cd ..
+done
diff --git a/examples/run-all-scala.sh b/examples/run-all-scala.sh
new file mode 100755
index 000000000..65636c250
--- /dev/null
+++ b/examples/run-all-scala.sh
@@ -0,0 +1,8 @@
+#!/usr/bin/env bash
+
+for dir in kmeans pca als
+do
+ cd $dir
+ ./run.sh
+ cd ..
+done
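The two new driver scripts just loop over the per-example directories, so a full smoke test of both language bindings is a couple of commands (assuming env.sh is configured and the Scala examples are built); note the loops ignore exit codes, so a failing example does not stop the sweep:

    cd examples
    ./run-all-scala.sh      # kmeans, pca, als
    ./run-all-pyspark.sh    # kmeans-pyspark, pca-pyspark, als-pyspark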
diff --git a/mllib-dal/build.sh b/mllib-dal/build.sh
index da1d8df75..bd88c998c 100755
--- a/mllib-dal/build.sh
+++ b/mllib-dal/build.sh
@@ -2,7 +2,12 @@
# Check envs for building
if [[ -z $JAVA_HOME ]]; then
- echo $JAVA_HOME not defined!
+ echo JAVA_HOME not defined!
+ exit 1
+fi
+
+if [[ -z $(which mvn) ]]; then
+ echo Maven not found!
exit 1
fi
@@ -26,7 +31,8 @@ echo JAVA_HOME=$JAVA_HOME
echo DAALROOT=$DAALROOT
echo TBBROOT=$TBBROOT
echo CCL_ROOT=$CCL_ROOT
-echo GCC Version: $(gcc -dumpversion)
+echo Maven Version: $(mvn -v | head -n 1 | cut -f3 -d" ")
+echo Clang Version: $(clang -dumpversion)
echo =============================
mvn -DskipTests clean package
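build.sh now fails fast when JAVA_HOME or Maven is missing and reports Maven and Clang versions instead of GCC, matching the Makefile's switch to clang. A pre-flight check along the same lines (a sketch, not part of the repo):

    for v in JAVA_HOME DAALROOT TBBROOT CCL_ROOT; do
        [ -n "${!v}" ] || echo "$v is not set"
    done
    command -v mvn   >/dev/null || echo "Maven not found"
    command -v clang >/dev/null || echo "clang not found (the native build now uses clang/clang++)"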
diff --git a/mllib-dal/pom.xml b/mllib-dal/pom.xml
index 4e51f9157..979c53d06 100644
--- a/mllib-dal/pom.xml
+++ b/mllib-dal/pom.xml
@@ -17,6 +17,13 @@
2.12.10
2.12
3.0.0
+ 2021.2.0
+ libtbb.so.12.2
+ libtbbmalloc.so.2.2
+ libJavaAPI.so.1.1
+ libccl.so
+ libfabric.so.1
+ libmpi.so.12.0.0
@@ -55,11 +62,11 @@
- com.intel.daal
- daal
- 2021.1
+ com.intel.onedal
+ onedal
+ ${oneapi.version}
system
- ${env.ONEAPI_ROOT}/dal/latest/lib/onedal.jar
+ ${env.DAALROOT}/lib/onedal.jar
@@ -218,14 +225,11 @@
${env.CCL_ROOT}/lib
-
-
- libmpi.so.12.0.0
- libfabric.so.1
- libccl.so
-
+ ${ccl.lib}
+ ${ccl.mpi.lib}
+ ${ccl.fabric.lib}
-
+
${env.CCL_ROOT}/lib/prov
@@ -236,8 +240,8 @@
${env.TBBROOT}/lib/intel64/gcc4.8
- libtbb.so.12.1
- libtbbmalloc.so.2.1
+ ${tbb.lib}
+ ${tbb.malloc.lib}
@@ -263,23 +267,19 @@
+ rename to workaround. See https://github.com/oneapi-src/oneDAL/issues/1254 -->
- ${project.build.testOutputDirectory}/lib/libtbb.so.12.1
+ ${project.build.testOutputDirectory}/lib/${tbb.lib}
${project.build.testOutputDirectory}/lib/libtbb.so.2
- ${project.build.testOutputDirectory}/lib/libtbbmalloc.so.2.1
+ ${project.build.testOutputDirectory}/lib/${tbb.malloc.lib}
${project.build.testOutputDirectory}/lib/libtbbmalloc.so.2
- ${project.build.testOutputDirectory}/lib/libmpi.so.12.0.0
+ ${project.build.testOutputDirectory}/lib/${ccl.mpi.lib}
${project.build.testOutputDirectory}/lib/libmpi.so.12
-
-
-
-
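The native library file names bundled into the jar (libtbb.so.12.2, libtbbmalloc.so.2.2, libJavaAPI.so.1.1, libccl.so, libfabric.so.1, libmpi.so.12.0.0) are now pom properties tied to oneAPI 2021.2.0, so a future oneAPI upgrade should only require bumping those properties. A quick way to confirm the names match your install (directory layout inferred from the paths this pom references; libJavaAPI's location is not shown here):

    ls $CCL_ROOT/lib/libccl.so $CCL_ROOT/lib/libfabric.so.1 $CCL_ROOT/lib/libmpi.so.12.0.0
    ls $TBBROOT/lib/intel64/gcc4.8/libtbb.so.12.2 $TBBROOT/lib/intel64/gcc4.8/libtbbmalloc.so.2.2
    ls $DAALROOT/lib/onedal.jar   # systemPath of the onedal dependency above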
diff --git a/mllib-dal/src/assembly/assembly.xml b/mllib-dal/src/assembly/assembly.xml
index 498b90e02..541c4f2bd 100644
--- a/mllib-dal/src/assembly/assembly.xml
+++ b/mllib-dal/src/assembly/assembly.xml
@@ -41,28 +41,28 @@
-
+
lib
libtbb.so.2
-
+
lib
libtbbmalloc.so.2
-
+
lib
libJavaAPI.so
-
+
lib
-
+
lib
libmpi.so.12
@@ -75,4 +75,4 @@
lib
-
\ No newline at end of file
+
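assembly.xml keeps placing the native libraries under lib/ inside the packaged jar; after a build the bundled set can be verified with something like (jar name assumed from the versions used elsewhere in this PR):

    jar tf mllib-dal/target/oap-mllib-1.1.0-with-spark-3.0.0.jar | grep '^lib/'
    # expected entries include libtbb.so.2, libtbbmalloc.so.2, libJavaAPI.so and libmpi.so.12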
diff --git a/mllib-dal/src/main/native/ALSDALImpl.cpp b/mllib-dal/src/main/native/ALSDALImpl.cpp
index 53212dc1d..29162fddd 100644
--- a/mllib-dal/src/main/native/ALSDALImpl.cpp
+++ b/mllib-dal/src/main/native/ALSDALImpl.cpp
@@ -613,8 +613,8 @@ JNIEXPORT jlong JNICALL Java_org_apache_spark_ml_recommendation_ALSDALImpl_cDALI
std::cout << "\n=== Results for Rank " << rankId << "===\n" << std::endl;
// std::cout << "Partition ID: " << partitionId << std::endl;
- printNumericTable(pUser, "User Factors (first 10 rows):", 10);
- printNumericTable(pItem, "Item Factors (first 10 rows):", 10);
+ printNumericTable(pUser, "User Factors (first 10 rows x 20 columns):", 10, 20);
+ printNumericTable(pItem, "Item Factors (first 10 rows x 20 columns):", 10, 20);
std::cout << "User Offset: " << getOffsetFromOffsetTable(userOffset) << std::endl;
std::cout << "Item Offset: " << getOffsetFromOffsetTable(itemOffset) << std::endl;
std::cout << std::endl;
diff --git a/mllib-dal/src/main/native/Makefile b/mllib-dal/src/main/native/Makefile
index 23222e646..e3a7e2161 100644
--- a/mllib-dal/src/main/native/Makefile
+++ b/mllib-dal/src/main/native/Makefile
@@ -12,11 +12,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-CC := gcc
-CXX := g++
+CC := clang
+CXX := clang++
RM := rm -rf
-CFLAGS := -g -Wall -fPIC -std=c++11
+CFLAGS := -g -Wall -Wno-deprecated-declarations -fPIC -std=c++11
# The following paths setting works for self-built libs from source code
# https://github.com/oneapi-src/oneCCL. If oneCCL package in oneAPI Toolkit is used,
@@ -33,7 +33,7 @@ INCS := -I $(JAVA_HOME)/include \
LIBS := -L${CCL_ROOT}/lib -lccl \
-L$(DAALROOT)/lib/intel64 -l:libdaal_core.a -l:libdaal_thread.a \
- -L$(TBBROOT)/lib -ltbb -ltbbmalloc
+ -L$(TBBROOT)/lib/intel64/gcc4.8 -ltbb -ltbbmalloc
# TODO: Add signal chaining support, should fix linking, package so and loading
# -L$(JAVA_HOME)/jre/lib/amd64 -ljsig
diff --git a/mllib-dal/src/main/native/PCADALImpl.cpp b/mllib-dal/src/main/native/PCADALImpl.cpp
index 33e2bc95d..95172d05f 100644
--- a/mllib-dal/src/main/native/PCADALImpl.cpp
+++ b/mllib-dal/src/main/native/PCADALImpl.cpp
@@ -125,8 +125,8 @@ JNIEXPORT jlong JNICALL Java_org_apache_spark_ml_feature_PCADALImpl_cPCATrainDAL
std::cout << "PCA (native): master step took " << duration << " secs" << std::endl;
/* Print the results */
- printNumericTable(result->get(pca::eigenvalues), "First 10 Eigenvalues:", 10);
- printNumericTable(result->get(pca::eigenvectors), "First 10 Eigenvectors:", 10);
+ printNumericTable(result->get(pca::eigenvalues), "First 10 eigenvalues with first 20 dimensions:", 10, 20);
+ printNumericTable(result->get(pca::eigenvectors), "First 10 eigenvectors with first 20 dimensions:", 10, 20);
// Return all eigenvalues & eigenvectors
diff --git a/mllib-dal/src/main/scala/org/apache/spark/ml/util/Utils.scala b/mllib-dal/src/main/scala/org/apache/spark/ml/util/Utils.scala
index aa8eb8979..0dd43d24f 100644
--- a/mllib-dal/src/main/scala/org/apache/spark/ml/util/Utils.scala
+++ b/mllib-dal/src/main/scala/org/apache/spark/ml/util/Utils.scala
@@ -72,6 +72,12 @@ object Utils {
}
def checkExecutorAvailPort(data: RDD[_], localIP: String) : Int = {
+
+ if (localIP == "127.0.0.1" || localIP == "127.0.1.1") {
+ println(s"\nOneCCL: Error: loopback IP ${localIP} is not supported, please assign an IP address to your host.\n")
+ System.exit(-1)
+ }
+
val sc = data.sparkContext
val result = data.mapPartitions { p =>
LibLoader.loadLibraries()
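checkExecutorAvailPort now aborts early when the detected executor IP is a loopback address, since oneCCL cannot form a communicator over 127.0.0.1. Before submitting you can check what each worker will report, using the same command the removed script comments suggested:

    hostname -I | awk '{print $1}'   # first address; must not be 127.0.0.1 or 127.0.1.1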
diff --git a/mllib-dal/test-cluster.sh b/mllib-dal/test-cluster.sh
deleted file mode 100755
index 4f5a6132a..000000000
--- a/mllib-dal/test-cluster.sh
+++ /dev/null
@@ -1,5 +0,0 @@
-#!/usr/bin/env bash
-
-cd ../dev/test-cluster/workloads
-
-./run-kmeans-pyspark.sh
diff --git a/mllib-dal/test.sh b/mllib-dal/test.sh
index 0157c22a4..befd66495 100755
--- a/mllib-dal/test.sh
+++ b/mllib-dal/test.sh
@@ -2,7 +2,12 @@
# Check envs for building
if [[ -z $JAVA_HOME ]]; then
- echo $JAVA_HOME not defined!
+ echo JAVA_HOME not defined!
+ exit 1
+fi
+
+if [[ -z $(which mvn) ]]; then
+ echo Maven not found!
exit 1
fi
@@ -21,16 +26,17 @@ if [[ -z $CCL_ROOT ]]; then
exit 1
fi
-echo === Building Environments ===
+echo === Testing Environments ===
echo JAVA_HOME=$JAVA_HOME
echo DAALROOT=$DAALROOT
echo TBBROOT=$TBBROOT
echo CCL_ROOT=$CCL_ROOT
-echo GCC Version: $(gcc -dumpversion)
+echo Maven Version: $(mvn -v | head -n 1 | cut -f3 -d" ")
+echo Clang Version: $(clang -dumpversion)
echo =============================
# Enable signal chaining support for JNI
-export LD_PRELOAD=$JAVA_HOME/jre/lib/amd64/libjsig.so
+# export LD_PRELOAD=$JAVA_HOME/jre/lib/amd64/libjsig.so
# -Dtest=none to turn off the Java tests