diff --git a/.github/workflows/oap-mllib-ci.yml b/.github/workflows/oap-mllib-ci.yml
index 73700a96d..2c6973321 100644
--- a/.github/workflows/oap-mllib-ci.yml
+++ b/.github/workflows/oap-mllib-ci.yml
@@ -38,4 +38,5 @@ jobs:
source /opt/intel/oneapi/dal/latest/env/vars.sh
source /opt/intel/oneapi/tbb/latest/env/vars.sh
source /tmp/oneCCL/build/_install/env/setvars.sh
- ./test.sh
+ # Temporarily disabled; will be re-enabled for the next oneCCL release
+ #./build.sh
diff --git a/.gitignore b/.gitignore
index 21a5d40c2..1d621bdd4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,7 @@
*.o
*.log
.vscode
+*.iml
target/
.idea/
.idea_modules/
diff --git a/README.md b/README.md
index d217becee..477c74b1f 100644
--- a/README.md
+++ b/README.md
@@ -17,7 +17,7 @@ You can find the all the OAP MLlib documents on the [project web page](https://o
### Java/Scala Users Preferred
-Use a pre-built OAP MLlib JAR to get started. You can firstly download OAP package from [OAP-JARs-Tarball](https://github.com/oap-mllib/releases/download/v1.1.0-spark-3.0.0/oap-1.1.0-bin-spark-3.0.0.tar.gz) and extract this Tarball to get `oap-mllib-x.x.x-with-spark-x.x.x.jar` under `oap-1.1.0-bin-spark-3.0.0/jars`.
+Use a pre-built OAP MLlib JAR to get started. You can first download the OAP package from [OAP-JARs-Tarball](https://github.com/Intel-bigdata/OAP/releases/download/v1.0.0-spark-3.0.0/oap-1.0.0-bin-spark-3.0.0.tar.gz) and extract the tarball to get `oap-mllib-x.x.x-with-spark-x.x.x.jar` under `oap-1.0.0-bin-spark-3.0.0/jars`.
Then you can refer to the following [Running](#running) section to try out.
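+
+For reference, attaching the JAR to a Spark application typically looks like the following sketch (paths and the application name are placeholders; the bundled run scripts show complete configurations):
+```
+ $ spark-submit --jars /path/to/oap-mllib-x.x.x-with-spark-x.x.x.jar \
+     --conf "spark.driver.extraClassPath=/path/to/oap-mllib-x.x.x-with-spark-x.x.x.jar" \
+     --conf "spark.executor.extraClassPath=./oap-mllib-x.x.x-with-spark-x.x.x.jar" \
+     <other options> <your application>
+```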
@@ -65,6 +65,14 @@ To use K-means example for sanity check, you need to upload a data file to your
$ ./run.sh
```
+### Benchmark with HiBench
+Use [HiBench](https://github.com/Intel-bigdata/HiBench) to generate datasets with various profiles, and change the related variables in the `run-XXX.sh` script as applicable (the variables to customize are sketched after the commands below). Then run the following commands:
+```
+ $ cd oap-mllib/examples/kmeans-hibench
+ $ ./build.sh
+ $ ./run-hibench-oap-mllib.sh
+```
+
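+The variables to customize typically include the following (a sketch taken from `run-hibench-oap-mllib.sh`; adjust the values to your cluster):
+```
+ export SPARK_HOME=/path/to/your/spark/home
+ export HADOOP_HOME=/path/to/your/hadoop/home
+ export HDFS_ROOT=hdfs://your_hostname:8020
+ export OAP_MLLIB_ROOT=/path/to/your/OAP/oap-mllib
+ export CCL_KVS_IP_PORT=192.168.0.1_51234
+```
+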
### PySpark Support
As PySpark-based applications call their Scala counterparts, they shall be supported out-of-box. An example can be found in the [Examples](#examples) section.
@@ -87,7 +95,7 @@ Intel® oneAPI Toolkits and its components can be downloaded and install from [h
More details about oneAPI can be found [here](https://software.intel.com/content/www/us/en/develop/tools/oneapi.html).
-You can also refer to [this script and comments in it](https://github.com/oap-project/oap-mllib/blob/branch-1.1-spark-3.x/dev/install-build-deps-centos.sh) to install correct oneAPI version and manually setup the environments.
+You can also refer to [this script and comments in it](https://github.com/Intel-bigdata/OAP/blob/branch-1.0-spark-3.x/oap-mllib/dev/install-build-deps-centos.sh) to install the correct oneAPI version and manually set up the environment.
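+
+For reference, the oneCCL portion of that script amounts to the following (a sketch; the tag to check out matches `dev/install-build-deps-*.sh` in this repository):
+```
+ $ git clone https://github.com/oneapi-src/oneCCL
+ $ cd oneCCL
+ $ git checkout 2021.1
+ $ mkdir build && cd build
+ $ cmake ..
+ $ make -j 2 install
+```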
Scala and Java dependency descriptions are already included in Maven POM file.
@@ -130,7 +138,7 @@ CCL_ROOT | Path to oneCCL home directory
We suggest you to source `setvars.sh` script into current shell to setup building environments as following:
```
- $ source /opt/intel/oneapi/setvars.sh
+ $ source /opt/intel/inteloneapi/setvars.sh
$ source /your/oneCCL_source_code/build/_install/env/setvars.sh
```
@@ -152,11 +160,8 @@ Example | Description
----------------|---------------------------
kmeans | K-means example for Scala
kmeans-pyspark | K-means example for PySpark
-pca | PCA example for Scala
-pca-pyspark | PCA example for PySpark
+kmeans-hibench | Use HiBench-generated input dataset to benchmark K-means performance
## List of Accelerated Algorithms
* K-Means (CPU, Experimental)
-* PCA (CPU, Experimental)
-
diff --git a/dev/install-build-deps-centos.sh b/dev/install-build-deps-centos.sh
index 7b27736ae..8a347fdef 100755
--- a/dev/install-build-deps-centos.sh
+++ b/dev/install-build-deps-centos.sh
@@ -23,7 +23,7 @@ cd /tmp
rm -rf oneCCL
git clone https://github.com/oneapi-src/oneCCL
cd oneCCL
-git checkout beta08
+git checkout 2021.1
mkdir -p build && cd build
cmake ..
make -j 2 install
diff --git a/dev/install-build-deps-ubuntu.sh b/dev/install-build-deps-ubuntu.sh
index 07019b834..d43e35b89 100755
--- a/dev/install-build-deps-ubuntu.sh
+++ b/dev/install-build-deps-ubuntu.sh
@@ -17,7 +17,7 @@ echo "Building oneCCL ..."
cd /tmp
git clone https://github.com/oneapi-src/oneCCL
cd oneCCL
-git checkout beta08
+git checkout 2021.1
mkdir build && cd build
cmake ..
make -j 2 install
diff --git a/dev/test-cluster/config-ssh.sh b/dev/test-cluster/config-ssh.sh
new file mode 100755
index 000000000..d093fa17a
--- /dev/null
+++ b/dev/test-cluster/config-ssh.sh
@@ -0,0 +1,6 @@
+#!/usr/bin/env bash
+
+ssh-keygen -q -N "" -t rsa -f ~/.ssh/id_rsa
+cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
+echo " StrictHostKeyChecking no " | sudo tee -a /etc/ssh/ssh_config
+sudo service ssh restart
diff --git a/dev/test-cluster/core-site.xml b/dev/test-cluster/core-site.xml
new file mode 100644
index 000000000..7016e477e
--- /dev/null
+++ b/dev/test-cluster/core-site.xml
@@ -0,0 +1,24 @@
+<?xml version="1.0"?>
+<configuration>
+    <property>
+        <name>fs.default.name</name>
+        <value>hdfs://localhost:8020</value>
+    </property>
+</configuration>
diff --git a/dev/test-cluster/envs.sh b/dev/test-cluster/envs.sh
new file mode 100644
index 000000000..71e8506e6
--- /dev/null
+++ b/dev/test-cluster/envs.sh
@@ -0,0 +1,22 @@
+# Set user Spark and Hadoop home directory
+export HADOOP_HOME=~/opt/hadoop-2.7.7
+export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
+export SPARK_HOME=~/opt/spark-3.0.0-bin-hadoop2.7
+
+export PYTHONPATH=$SPARK_HOME/python:$PYTHONPATH
+export PYSPARK_PYTHON=python3
+
+# Set user HDFS Root
+export HDFS_ROOT=hdfs://localhost:8020
+export OAP_MLLIB_DATA_ROOT=OAPMLlib/Data
+# Set user Intel MLlib Root directory
+export OAP_MLLIB_ROOT=${GITHUB_WORKSPACE}
+
+# Target jar built
+OAP_MLLIB_JAR_NAME=oap-mllib-1.1.0.jar
+OAP_MLLIB_JAR=$OAP_MLLIB_ROOT/mllib-dal/target/$OAP_MLLIB_JAR_NAME
+
+# Use absolute path
+SPARK_DRIVER_CLASSPATH=$OAP_MLLIB_JAR
+# Use relative path
+SPARK_EXECUTOR_CLASSPATH=./$OAP_MLLIB_JAR_NAME
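+
+# Illustration only: these variables are meant to be consumed by the run scripts
+# (see dev/test-cluster/workloads/run-kmeans-pyspark.sh), roughly as follows:
+#
+#   source ./envs.sh
+#   $SPARK_HOME/bin/spark-submit --master yarn \
+#       --conf "spark.driver.extraClassPath=$SPARK_DRIVER_CLASSPATH" \
+#       --conf "spark.executor.extraClassPath=$SPARK_EXECUTOR_CLASSPATH" \
+#       --jars $OAP_MLLIB_JAR \
+#       <your application> <args>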
diff --git a/dev/test-cluster/hadoop-env.sh b/dev/test-cluster/hadoop-env.sh
new file mode 100755
index 000000000..bee6c1f69
--- /dev/null
+++ b/dev/test-cluster/hadoop-env.sh
@@ -0,0 +1,99 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Set Hadoop-specific environment variables here.
+
+# The only required environment variable is JAVA_HOME. All others are
+# optional. When running a distributed configuration it is best to
+# set JAVA_HOME in this file, so that it is correctly defined on
+# remote nodes.
+
+# The java implementation to use.
+# export JAVA_HOME=${JAVA_HOME}
+export JAVA_HOME=/usr/local/lib/jvm/openjdk8
+
+# The jsvc implementation to use. Jsvc is required to run secure datanodes
+# that bind to privileged ports to provide authentication of data transfer
+# protocol. Jsvc is not required if SASL is configured for authentication of
+# data transfer protocol using non-privileged ports.
+#export JSVC_HOME=${JSVC_HOME}
+
+export HADOOP_CONF_DIR=${HADOOP_CONF_DIR:-"/etc/hadoop"}
+
+# Extra Java CLASSPATH elements. Automatically insert capacity-scheduler.
+for f in $HADOOP_HOME/contrib/capacity-scheduler/*.jar; do
+ if [ "$HADOOP_CLASSPATH" ]; then
+ export HADOOP_CLASSPATH=$HADOOP_CLASSPATH:$f
+ else
+ export HADOOP_CLASSPATH=$f
+ fi
+done
+
+# The maximum amount of heap to use, in MB. Default is 1000.
+#export HADOOP_HEAPSIZE=
+#export HADOOP_NAMENODE_INIT_HEAPSIZE=""
+
+# Extra Java runtime options. Empty by default.
+export HADOOP_OPTS="$HADOOP_OPTS -Djava.net.preferIPv4Stack=true"
+
+# Command specific options appended to HADOOP_OPTS when specified
+export HADOOP_NAMENODE_OPTS="-Dhadoop.security.logger=${HADOOP_SECURITY_LOGGER:-INFO,RFAS} -Dhdfs.audit.logger=${HDFS_AUDIT_LOGGER:-INFO,NullAppender} $HADOOP_NAMENODE_OPTS"
+export HADOOP_DATANODE_OPTS="-Dhadoop.security.logger=ERROR,RFAS $HADOOP_DATANODE_OPTS"
+
+export HADOOP_SECONDARYNAMENODE_OPTS="-Dhadoop.security.logger=${HADOOP_SECURITY_LOGGER:-INFO,RFAS} -Dhdfs.audit.logger=${HDFS_AUDIT_LOGGER:-INFO,NullAppender} $HADOOP_SECONDARYNAMENODE_OPTS"
+
+export HADOOP_NFS3_OPTS="$HADOOP_NFS3_OPTS"
+export HADOOP_PORTMAP_OPTS="-Xmx512m $HADOOP_PORTMAP_OPTS"
+
+# The following applies to multiple commands (fs, dfs, fsck, distcp etc)
+export HADOOP_CLIENT_OPTS="-Xmx512m $HADOOP_CLIENT_OPTS"
+#HADOOP_JAVA_PLATFORM_OPTS="-XX:-UsePerfData $HADOOP_JAVA_PLATFORM_OPTS"
+
+# On secure datanodes, user to run the datanode as after dropping privileges.
+# This **MUST** be uncommented to enable secure HDFS if using privileged ports
+# to provide authentication of data transfer protocol. This **MUST NOT** be
+# defined if SASL is configured for authentication of data transfer protocol
+# using non-privileged ports.
+export HADOOP_SECURE_DN_USER=${HADOOP_SECURE_DN_USER}
+
+# Where log files are stored. $HADOOP_HOME/logs by default.
+#export HADOOP_LOG_DIR=${HADOOP_LOG_DIR}/$USER
+
+# Where log files are stored in the secure data environment.
+export HADOOP_SECURE_DN_LOG_DIR=${HADOOP_LOG_DIR}/${HADOOP_HDFS_USER}
+
+###
+# HDFS Mover specific parameters
+###
+# Specify the JVM options to be used when starting the HDFS Mover.
+# These options will be appended to the options specified as HADOOP_OPTS
+# and therefore may override any similar flags set in HADOOP_OPTS
+#
+# export HADOOP_MOVER_OPTS=""
+
+###
+# Advanced Users Only!
+###
+
+# The directory where pid files are stored. /tmp by default.
+# NOTE: this should be set to a directory that can only be written to by
+# the user that will run the hadoop daemons. Otherwise there is the
+# potential for a symlink attack.
+export HADOOP_PID_DIR=${HADOOP_PID_DIR}
+export HADOOP_SECURE_DN_PID_DIR=${HADOOP_PID_DIR}
+
+# A string representing this instance of hadoop. $USER by default.
+export HADOOP_IDENT_STRING=$USER
diff --git a/dev/test-cluster/hdfs-site.xml b/dev/test-cluster/hdfs-site.xml
new file mode 100644
index 000000000..40fcbb5d6
--- /dev/null
+++ b/dev/test-cluster/hdfs-site.xml
@@ -0,0 +1,32 @@
+<?xml version="1.0"?>
+<configuration>
+    <property>
+        <name>dfs.replication</name>
+        <value>1</value>
+    </property>
+    <property>
+        <name>dfs.namenode.name.dir</name>
+        <value>/tmp/run/hdfs/namenode</value>
+    </property>
+    <property>
+        <name>dfs.datanode.data.dir</name>
+        <value>/tmp/run/hdfs/datanode</value>
+    </property>
+</configuration>
diff --git a/dev/test-cluster/setup-cluster.sh b/dev/test-cluster/setup-cluster.sh
new file mode 100755
index 000000000..eea058f80
--- /dev/null
+++ b/dev/test-cluster/setup-cluster.sh
@@ -0,0 +1,42 @@
+#!/usr/bin/env bash
+
+WORK_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+
+cd $WORK_DIR
+
+echo JAVA_HOME is $JAVA_HOME
+
+mkdir ~/opt
+cd ~/opt
+wget https://archive.apache.org/dist/spark/spark-3.0.0/spark-3.0.0-bin-hadoop2.7.tgz
+tar -xzf spark-3.0.0-bin-hadoop2.7.tgz
+wget https://archive.apache.org/dist/hadoop/core/hadoop-2.7.7/hadoop-2.7.7.tar.gz
+tar -xzf hadoop-2.7.7.tar.gz
+
+cd $WORK_DIR
+
+cp ./core-site.xml ~/opt/hadoop-2.7.7/etc/hadoop/
+cp ./hdfs-site.xml ~/opt/hadoop-2.7.7/etc/hadoop/
+cp ./yarn-site.xml ~/opt/hadoop-2.7.7/etc/hadoop/
+cp ./hadoop-env.sh ~/opt/hadoop-2.7.7/etc/hadoop/
+cp ./spark-defaults.conf ~/opt/spark-3.0.0-bin-hadoop2.7/conf
+
+# create directories
+mkdir -p /tmp/run/hdfs/namenode
+mkdir -p /tmp/run/hdfs/datanode
+
+# hdfs format
+~/opt/hadoop-2.7.7/bin/hdfs namenode -format
+
+export HADOOP_HOME=~/opt/hadoop-2.7.7
+export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
+export SPARK_HOME=~/opt/spark-3.0.0-bin-hadoop2.7
+
+export PATH=$HADOOP_HOME/bin:$SPARK_HOME/bin:$PATH
+
+# start hdfs and yarn
+$HADOOP_HOME/sbin/start-dfs.sh
+$HADOOP_HOME/sbin/start-yarn.sh
+
+hadoop fs -ls /
+yarn node -list
diff --git a/dev/test-cluster/setup-python3-env.sh b/dev/test-cluster/setup-python3-env.sh
new file mode 100755
index 000000000..29208dc5e
--- /dev/null
+++ b/dev/test-cluster/setup-python3-env.sh
@@ -0,0 +1,12 @@
+#!/usr/bin/env bash
+
+sudo apt-get update
+sudo apt-get install -y python3-pip python3-setuptools python3-wheel
+
+pip3 install --user numpy
+
+echo python is in $(which python)
+python --version
+
+echo python3 is in $(which python3)
+python3 --version
diff --git a/dev/test-cluster/spark-defaults.conf b/dev/test-cluster/spark-defaults.conf
new file mode 100644
index 000000000..1c25bb2ec
--- /dev/null
+++ b/dev/test-cluster/spark-defaults.conf
@@ -0,0 +1,34 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Default system properties included when running spark-submit.
+# This is useful for setting default environmental settings.
+
+# Example:
+# spark.master spark://master:7077
+# spark.eventLog.enabled true
+# spark.eventLog.dir hdfs://namenode:8021/directory
+# spark.serializer org.apache.spark.serializer.KryoSerializer
+# spark.driver.memory 5g
+# spark.executor.extraJavaOptions -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three"
+
+spark.master yarn
+spark.serializer org.apache.spark.serializer.KryoSerializer
+spark.driver.memory 3g
+spark.executor.num 2
+spark.executor.cores 1
+spark.executor.memory 4g
diff --git a/dev/test-cluster/workloads/kmeans-pyspark.py b/dev/test-cluster/workloads/kmeans-pyspark.py
new file mode 100644
index 000000000..cf93e6034
--- /dev/null
+++ b/dev/test-cluster/workloads/kmeans-pyspark.py
@@ -0,0 +1,70 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+"""
+An example demonstrating k-means clustering.
+Run with:
+  bin/spark-submit kmeans-pyspark.py <path to libsvm-format data file>
+
+This example requires NumPy (http://www.numpy.org/).
+"""
+from __future__ import print_function
+import sys
+
+# $example on$
+from pyspark.ml.clustering import KMeans
+from pyspark.ml.evaluation import ClusteringEvaluator
+# $example off$
+
+from pyspark.sql import SparkSession
+
+if __name__ == "__main__":
+ spark = SparkSession\
+ .builder\
+ .appName("KMeansExample")\
+ .getOrCreate()
+
+ if (len(sys.argv) != 2) :
+ print("Require data file path as input parameter")
+ sys.exit(1)
+
+ # $example on$
+ # Loads data.
+ dataset = spark.read.format("libsvm").load(sys.argv[1])
+
+ # Trains a k-means model.
+ kmeans = KMeans().setK(2).setSeed(1)
+ model = kmeans.fit(dataset)
+
+ # Make predictions
+ predictions = model.transform(dataset)
+
+ # Evaluate clustering by computing Silhouette score
+ evaluator = ClusteringEvaluator()
+
+ silhouette = evaluator.evaluate(predictions)
+ print("Silhouette with squared euclidean distance = " + str(silhouette))
+
+ # Shows the result.
+ centers = model.clusterCenters()
+ print("Cluster Centers: ")
+ for center in centers:
+ print(center)
+ # $example off$
+
+ spark.stop()
+
diff --git a/dev/test-cluster/workloads/run-kmeans-pyspark.sh b/dev/test-cluster/workloads/run-kmeans-pyspark.sh
new file mode 100755
index 000000000..e07f3f7b6
--- /dev/null
+++ b/dev/test-cluster/workloads/run-kmeans-pyspark.sh
@@ -0,0 +1,48 @@
+#!/usr/bin/env bash
+
+source ../envs.sh
+
+# The data file comes from the Spark examples (data/mllib/sample_kmeans_data.txt); the commands below copy it to HDFS
+$HADOOP_HOME/bin/hadoop fs -mkdir -p $OAP_MLLIB_DATA_ROOT
+$HADOOP_HOME/bin/hadoop fs -copyFromLocal $SPARK_HOME/data/mllib/sample_kmeans_data.txt $OAP_MLLIB_DATA_ROOT
+
+# Check that the requested resources are actually allocated by the cluster manager, otherwise Intel MLlib will behave incorrectly (a quick check is sketched below)
+SPARK_MASTER=yarn
+SPARK_DRIVER_MEMORY=1G
+SPARK_NUM_EXECUTORS=2
+SPARK_EXECUTOR_CORES=1
+SPARK_EXECUTOR_MEMORY=1G
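+
+# Optional sanity check (illustration only, not part of the workload): after submitting,
+# the actual allocation can be confirmed from YARN, e.g.:
+#   yarn node -list           # NodeManagers and the resources they offer
+#   yarn application -list    # running applications and their state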
+
+SPARK_DEFAULT_PARALLELISM=$(expr $SPARK_NUM_EXECUTORS '*' $SPARK_EXECUTOR_CORES '*' 2)
+
+# ======================================================= #
+
+# Check env
+if [[ -z $SPARK_HOME ]]; then
+ echo SPARK_HOME not defined!
+ exit 1
+fi
+
+if [[ -z $HADOOP_HOME ]]; then
+ echo HADOOP_HOME not defined!
+ exit 1
+fi
+
+APP_PY="$OAP_MLLIB_ROOT/dev/test-cluster/workloads/kmeans-pyspark.py"
+DATA_FILE=$OAP_MLLIB_DATA_ROOT/sample_kmeans_data.txt
+
+$SPARK_HOME/bin/spark-submit --master $SPARK_MASTER -v \
+ --num-executors $SPARK_NUM_EXECUTORS \
+ --driver-memory $SPARK_DRIVER_MEMORY \
+ --executor-cores $SPARK_EXECUTOR_CORES \
+ --executor-memory $SPARK_EXECUTOR_MEMORY \
+ --conf "spark.serializer=org.apache.spark.serializer.KryoSerializer" \
+ --conf "spark.default.parallelism=$SPARK_DEFAULT_PARALLELISM" \
+ --conf "spark.sql.shuffle.partitions=$SPARK_DEFAULT_PARALLELISM" \
+ --conf "spark.driver.extraClassPath=$SPARK_DRIVER_CLASSPATH" \
+ --conf "spark.executor.extraClassPath=$SPARK_EXECUTOR_CLASSPATH" \
+ --conf "spark.shuffle.reduceLocality.enabled=false" \
+ --conf "spark.network.timeout=1200s" \
+ --conf "spark.task.maxFailures=1" \
+ --jars $OAP_MLLIB_JAR \
+ $APP_PY $DATA_FILE
diff --git a/dev/test-cluster/yarn-site.xml b/dev/test-cluster/yarn-site.xml
new file mode 100644
index 000000000..ff74d23a7
--- /dev/null
+++ b/dev/test-cluster/yarn-site.xml
@@ -0,0 +1,67 @@
+<?xml version="1.0"?>
+<configuration>
+    <property>
+        <name>yarn.nodemanager.aux-services</name>
+        <value>mapreduce_shuffle</value>
+    </property>
+    <property>
+        <name>yarn.nodemanager.aux-services.mapreduce.shuffle.class</name>
+        <value>org.apache.hadoop.mapred.ShuffleHandler</value>
+    </property>
+    <property>
+        <name>yarn.resourcemanager.hostname</name>
+        <value>localhost</value>
+    </property>
+    <property>
+        <name>yarn.resourcemanager.address</name>
+        <value>localhost:8032</value>
+    </property>
+    <property>
+        <name>yarn.nodemanager.resource.memory-mb</name>
+        <value>7168</value>
+    </property>
+    <property>
+        <name>yarn.nodemanager.resource.cpu-vcores</name>
+        <value>2</value>
+    </property>
+    <property>
+        <name>yarn.nodemanager.vmem-check-enabled</name>
+        <value>false</value>
+    </property>
+    <property>
+        <name>yarn.nodemanager.vmem-pmem-ratio</name>
+        <value>2</value>
+    </property>
+    <property>
+        <name>yarn.scheduler.minimum-allocation-mb</name>
+        <value>1024</value>
+    </property>
+    <property>
+        <name>yarn.scheduler.maximum-allocation-mb</name>
+        <value>7168</value>
+    </property>
+    <property>
+        <name>yarn.scheduler.minimum-allocation-vcores</name>
+        <value>1</value>
+    </property>
+    <property>
+        <name>yarn.scheduler.maximum-allocation-vcores</name>
+        <value>2</value>
+    </property>
+</configuration>
diff --git a/examples/als-hibench/build.sh b/examples/als-hibench/build.sh
new file mode 100755
index 000000000..8cbc692be
--- /dev/null
+++ b/examples/als-hibench/build.sh
@@ -0,0 +1,3 @@
+#!/usr/bin/env bash
+
+mvn clean package
\ No newline at end of file
diff --git a/examples/als-hibench/pom.xml b/examples/als-hibench/pom.xml
new file mode 100644
index 000000000..68e02c256
--- /dev/null
+++ b/examples/als-hibench/pom.xml
@@ -0,0 +1,100 @@
+
+ 4.0.0
+
+ com.intel.oap
+ oap-mllib-examples
+ 0.9.0-with-spark-3.0.0
+ jar
+
+ ALSHiBenchExample
+ https://github.com/Intel-bigdata/OAP
+
+
+ UTF-8
+ 2.12.10
+ 2.12
+ 3.0.0
+
+
+
+
+
+ org.scala-lang
+ scala-library
+ 2.12.10
+
+
+
+ com.github.scopt
+ scopt_2.12
+ 3.7.0
+
+
+
+
+
+
+
+
+
+
+ org.apache.spark
+ spark-sql_2.12
+ ${spark.version}
+ provided
+
+
+
+ org.apache.spark
+ spark-mllib_2.12
+ ${spark.version}
+ provided
+
+
+
+
+
+
+
+ org.scala-tools
+ maven-scala-plugin
+ 2.15.2
+
+
+
+ compile
+ testCompile
+
+
+
+
+ ${scala.version}
+
+ -target:jvm-1.8
+
+
+
+
+ maven-assembly-plugin
+ 3.0.0
+
+ false
+
+ jar-with-dependencies
+
+
+
+
+ assembly
+ package
+
+ single
+
+
+
+
+
+
+
+
diff --git a/examples/als-hibench/run-hibench-oap-mllib.sh b/examples/als-hibench/run-hibench-oap-mllib.sh
new file mode 100755
index 000000000..050b80558
--- /dev/null
+++ b/examples/als-hibench/run-hibench-oap-mllib.sh
@@ -0,0 +1,73 @@
+#!/usr/bin/env bash
+
+export HDFS_ROOT=hdfs://sr591:8020
+export OAP_MLLIB_ROOT=/home/xiaochang/Works/OAP-xwu99-als/oap-mllib
+
+SPARK_MASTER=yarn
+SPARK_DRIVER_MEMORY=16G
+SPARK_NUM_EXECUTORS=6
+SPARK_EXECUTOR_CORES=28
+SPARK_EXECUTOR_MEMORY_OVERHEAD=25G
+SPARK_EXECUTOR_MEMORY=100G
+
+SPARK_DEFAULT_PARALLELISM=$(expr $SPARK_NUM_EXECUTORS '*' $SPARK_EXECUTOR_CORES '*' 2)
+#SPARK_DEFAULT_PARALLELISM=$(expr $SPARK_NUM_EXECUTORS '*' $SPARK_EXECUTOR_CORES)
+
+# ======================================================= #
+
+# for log suffix
+SUFFIX=$( basename -s .sh "${BASH_SOURCE[0]}" )
+
+# Check envs
+if [[ -z $SPARK_HOME ]]; then
+ echo SPARK_HOME not defined!
+ exit 1
+fi
+
+if [[ -z $HADOOP_HOME ]]; then
+ echo HADOOP_HOME not defined!
+ exit 1
+fi
+
+export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
+
+# Target jar built
+OAP_MLLIB_JAR_NAME=oap-mllib-0.9.0-with-spark-3.0.0.jar
+OAP_MLLIB_JAR=$OAP_MLLIB_ROOT/mllib-dal/target/$OAP_MLLIB_JAR_NAME
+
+# Use absolute path
+SPARK_DRIVER_CLASSPATH=$OAP_MLLIB_JAR
+# Use relative path
+SPARK_EXECUTOR_CLASSPATH=./$OAP_MLLIB_JAR_NAME
+
+APP_JAR=target/oap-mllib-examples-0.9.0-with-spark-3.0.0.jar
+APP_CLASS=com.intel.hibench.sparkbench.ml.ALSExample
+
+HDFS_INPUT=hdfs://sr591:8020/HiBench/ALS/Input
+RANK=10
+NUM_ITERATIONS=1
+LAMBDA=0.1
+IMPLICIT=true
+
+/usr/bin/time -p $SPARK_HOME/bin/spark-submit --master $SPARK_MASTER -v \
+ --num-executors $SPARK_NUM_EXECUTORS \
+ --driver-memory $SPARK_DRIVER_MEMORY \
+ --executor-cores $SPARK_EXECUTOR_CORES \
+ --executor-memory $SPARK_EXECUTOR_MEMORY \
+ --conf "spark.serializer=org.apache.spark.serializer.KryoSerializer" \
+ --conf "spark.default.parallelism=$SPARK_DEFAULT_PARALLELISM" \
+ --conf "spark.sql.shuffle.partitions=$SPARK_DEFAULT_PARALLELISM" \
+ --conf "spark.driver.extraClassPath=$SPARK_DRIVER_CLASSPATH" \
+ --conf "spark.executor.extraClassPath=$SPARK_EXECUTOR_CLASSPATH" \
+ --conf "spark.shuffle.reduceLocality.enabled=false" \
+ --conf "spark.executor.memoryOverhead=$SPARK_EXECUTOR_MEMORY_OVERHEAD" \
+ --conf "spark.network.timeout=1200s" \
+ --conf "spark.task.maxFailures=1" \
+ --jars $OAP_MLLIB_JAR \
+ --class $APP_CLASS \
+ $APP_JAR \
+ --rank $RANK --numIterations $NUM_ITERATIONS --implicitPrefs $IMPLICIT --lambda $LAMBDA \
+ --numProductBlocks $SPARK_DEFAULT_PARALLELISM --numUserBlocks $SPARK_DEFAULT_PARALLELISM \
+ $HDFS_INPUT \
+ 2>&1 | tee ALS-$SUFFIX-$(date +%m%d_%H_%M_%S).log
+
diff --git a/examples/als-hibench/run-hibench-vanilla.sh b/examples/als-hibench/run-hibench-vanilla.sh
new file mode 100755
index 000000000..6cb6b3ae7
--- /dev/null
+++ b/examples/als-hibench/run-hibench-vanilla.sh
@@ -0,0 +1,61 @@
+#!/usr/bin/env bash
+
+export HDFS_ROOT=hdfs://sr591:8020
+
+SPARK_MASTER=yarn
+SPARK_DRIVER_MEMORY=16G
+SPARK_NUM_EXECUTORS=6
+SPARK_EXECUTOR_CORES=28
+SPARK_EXECUTOR_MEMORY_OVERHEAD=25G
+SPARK_EXECUTOR_MEMORY=100G
+
+SPARK_DEFAULT_PARALLELISM=$(expr $SPARK_NUM_EXECUTORS '*' $SPARK_EXECUTOR_CORES '*' 2)
+
+# ======================================================= #
+
+# for log suffix
+SUFFIX=$( basename -s .sh "${BASH_SOURCE[0]}" )
+
+# Check envs
+if [[ -z $SPARK_HOME ]]; then
+ echo SPARK_HOME not defined!
+ exit 1
+fi
+
+if [[ -z $HADOOP_HOME ]]; then
+ echo HADOOP_HOME not defined!
+ exit 1
+fi
+
+export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
+
+APP_JAR=target/oap-mllib-examples-0.9.0-with-spark-3.0.0.jar
+APP_CLASS=com.intel.hibench.sparkbench.ml.ALSExample
+
+HDFS_INPUT=hdfs://sr591:8020/HiBench/ALS/Input
+RANK=10
+NUM_ITERATIONS=1
+LAMBDA=0.1
+IMPLICIT=true
+
+/usr/bin/time -p $SPARK_HOME/bin/spark-submit --master $SPARK_MASTER -v \
+ --num-executors $SPARK_NUM_EXECUTORS \
+ --driver-memory $SPARK_DRIVER_MEMORY \
+ --executor-cores $SPARK_EXECUTOR_CORES \
+ --executor-memory $SPARK_EXECUTOR_MEMORY \
+ --conf "spark.serializer=org.apache.spark.serializer.KryoSerializer" \
+ --conf "spark.default.parallelism=$SPARK_DEFAULT_PARALLELISM" \
+ --conf "spark.sql.shuffle.partitions=$SPARK_DEFAULT_PARALLELISM" \
+ --conf "spark.driver.extraClassPath=$SPARK_DRIVER_CLASSPATH" \
+ --conf "spark.executor.extraClassPath=$SPARK_EXECUTOR_CLASSPATH" \
+ --conf "spark.shuffle.reduceLocality.enabled=false" \
+ --conf "spark.executor.memoryOverhead=$SPARK_EXECUTOR_MEMORY_OVERHEAD" \
+ --conf "spark.network.timeout=1200s" \
+ --conf "spark.task.maxFailures=1" \
+ --class $APP_CLASS \
+ $APP_JAR \
+ --rank $RANK --numIterations $NUM_ITERATIONS --implicitPrefs $IMPLICIT --lambda $LAMBDA \
+ --numProductBlocks $SPARK_DEFAULT_PARALLELISM --numUserBlocks $SPARK_DEFAULT_PARALLELISM \
+ $HDFS_INPUT \
+ 2>&1 | tee ALS-$SUFFIX-$(date +%m%d_%H_%M_%S).log
+
diff --git a/examples/als-hibench/src/main/scala/com/intel/hibench/sparkbench/ml/ALSExample.scala b/examples/als-hibench/src/main/scala/com/intel/hibench/sparkbench/ml/ALSExample.scala
new file mode 100644
index 000000000..5a29bcc80
--- /dev/null
+++ b/examples/als-hibench/src/main/scala/com/intel/hibench/sparkbench/ml/ALSExample.scala
@@ -0,0 +1,111 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.intel.hibench.sparkbench.ml
+
+import org.apache.spark.ml.evaluation.RegressionEvaluator
+import org.apache.spark.ml.recommendation.ALS
+import org.apache.spark.ml.recommendation.ALS.Rating
+import org.apache.spark.sql.SparkSession
+import scopt.OptionParser
+
+object ALSExample {
+
+ case class Params(
+ dataPath: String = null,
+ numIterations: Int = 10,
+ lambda: Double = 0.1,
+ rank: Int = 10,
+ numUserBlocks: Int = 10,
+ numItemBlocks: Int = 10,
+ implicitPrefs: Boolean = false)
+
+ def main(args: Array[String]) {
+ val defaultParams = Params()
+
+ val parser = new OptionParser[Params]("ALS") {
+ head("ALS: an example app for ALS on User-Item data.")
+ opt[Int]("rank")
+ .text(s"rank, default: ${defaultParams.rank}")
+ .action((x, c) => c.copy(rank = x))
+ opt[Int]("numIterations")
+ .text(s"number of iterations, default: ${defaultParams.numIterations}")
+ .action((x, c) => c.copy(numIterations = x))
+ opt[Double]("lambda")
+ .text(s"regularization parameter, default: ${defaultParams.lambda}")
+ .action((x, c) => c.copy(lambda = x))
+ opt[Int]("numUserBlocks")
+ .text(s"number of user blocks, default: ${defaultParams.numUserBlocks}")
+ .action((x, c) => c.copy(numUserBlocks = x))
+ opt[Int]("numProductBlocks")
+ .text(s"number of product blocks, default: ${defaultParams.numItemBlocks}")
+ .action((x, c) => c.copy(numItemBlocks = x))
+ opt[Boolean]("implicitPrefs")
+ .text(s"implicit preference, default: ${defaultParams.implicitPrefs}")
+ .action((x, c) => c.copy(implicitPrefs = x))
+ arg[String]("")
+ .required()
+ .text("Input paths to a User-Product dataset of ratings")
+ .action((x, c) => c.copy(dataPath = x))
+ }
+ parser.parse(args, defaultParams) match {
+ case Some(params) => run(params)
+ case _ => sys.exit(1)
+ }
+ }
+
+ def run(params: Params): Unit = {
+ val spark = SparkSession
+ .builder
+ .appName(s"ALS with $params")
+ .getOrCreate()
+ val sc = spark.sparkContext
+
+ import spark.implicits._
+
+ val ratings = sc.objectFile[Rating[Int]](params.dataPath).toDF()
+
+ val Array(training, test) = ratings.randomSplit(Array(0.8, 0.2), 1L)
+
+ // Build the recommendation model using ALS on the training data
+ val als = new ALS()
+ .setRank(params.rank)
+ .setMaxIter(params.numIterations)
+ .setRegParam(params.lambda)
+ .setImplicitPrefs(params.implicitPrefs)
+ .setNumUserBlocks(params.numUserBlocks)
+ .setNumItemBlocks(params.numItemBlocks)
+ .setUserCol("user")
+ .setItemCol("item")
+ .setRatingCol("rating")
+ val model = als.fit(training)
+
+ // Evaluate the model by computing the RMSE on the test data
+ // Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
+ model.setColdStartStrategy("drop")
+ val predictions = model.transform(test)
+
+ val evaluator = new RegressionEvaluator()
+ .setMetricName("rmse")
+ .setLabelCol("rating")
+ .setPredictionCol("prediction")
+ val rmse = evaluator.evaluate(predictions)
+ println(s"Root-mean-square error = $rmse")
+
+ spark.stop()
+ }
+}
diff --git a/examples/als-pyspark/als-pyspark.py b/examples/als-pyspark/als-pyspark.py
new file mode 100644
index 000000000..8847ca2b9
--- /dev/null
+++ b/examples/als-pyspark/als-pyspark.py
@@ -0,0 +1,67 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+import sys
+if sys.version >= '3':
+ long = int
+
+from pyspark.sql import SparkSession
+
+# $example on$
+from pyspark.ml.evaluation import RegressionEvaluator
+from pyspark.ml.recommendation import ALS
+from pyspark.sql import Row
+# $example off$
+
+if __name__ == "__main__":
+ spark = SparkSession\
+ .builder\
+ .appName("ALSExample")\
+ .getOrCreate()
+
+ if (len(sys.argv) != 2) :
+ print("Require data file path as input parameter")
+ sys.exit(1)
+
+ # $example on$
+ lines = spark.read.text(sys.argv[1]).rdd
+ parts = lines.map(lambda row: row.value.split("::"))
+ ratingsRDD = parts.map(lambda p: Row(userId=int(p[0]), movieId=int(p[1]),
+ rating=float(p[2])))
+ ratings = spark.createDataFrame(ratingsRDD)
+ # (training, test) = ratings.randomSplit([0.8, 0.2])
+
+ # Build the recommendation model using ALS on the training data
+ # Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
+ als = ALS(rank=10, maxIter=5, regParam=0.01, implicitPrefs=True, alpha=40.0,
+ userCol="userId", itemCol="movieId", ratingCol="rating",
+ coldStartStrategy="drop")
+ print("\nALS training with implicitPrefs={}, rank={}, maxIter={}, regParam={}, alpha={}, seed={}\n".format(
+ als.getImplicitPrefs(), als.getRank(), als.getMaxIter(), als.getRegParam(), als.getAlpha(), als.getSeed()
+ ))
+ model = als.fit(ratings)
+
+ # Evaluate the model by computing the RMSE on the test data
+ # predictions = model.transform(test)
+ # evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
+ # predictionCol="prediction")
+ # rmse = evaluator.evaluate(predictions)
+ # print("Root-mean-square error = " + str(rmse))
+
+ spark.stop()
diff --git a/examples/als-pyspark/run.sh b/examples/als-pyspark/run.sh
new file mode 100755
index 000000000..b3ba1b6d2
--- /dev/null
+++ b/examples/als-pyspark/run.sh
@@ -0,0 +1,75 @@
+#!/usr/bin/env bash
+
+# == User to customize the following environments ======= #
+
+# Set user Spark and Hadoop home directory
+#export SPARK_HOME=/path/to/your/spark/home
+#export HADOOP_HOME=/path/to/your/hadoop/home
+# Set user HDFS Root
+export HDFS_ROOT=hdfs://sr549:8020
+# Set user Intel MLlib Root directory
+export OAP_MLLIB_ROOT=/home/xiaochang/Works/OAP-xwu99-als/oap-mllib
+# Set the IP and port for the oneCCL KVS: pick any one of the worker nodes and set CCL_KVS_IP_PORT to its IP and an available port.
+# The IP can be obtained with `hostname -I`; if multiple IPs are returned, use the first one. The port can be any available port.
+# For example, if one of the worker IPs is 192.168.0.1 and an available port is 51234,
+# CCL_KVS_IP_PORT should be set to 192.168.0.1_51234.
+# Setting this value incorrectly will cause oneCCL to hang during initialization.
+export CCL_KVS_IP_PORT=10.0.2.149_51234
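+# Illustration only: one way to derive the value on the chosen worker node, assuming the
+# first address reported by `hostname -I` is the desired one and port 51234 is free:
+# export CCL_KVS_IP_PORT=$(hostname -I | awk '{print $1}')_51234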
+
+# Input ratings data file; copy it to HDFS before running
+DATA_FILE=data/onedal_als_csr_ratings.txt
+
+# == User to customize Spark executor cores and memory == #
+
+# Check that the requested resources are actually allocated by the cluster manager, otherwise Intel MLlib will behave incorrectly
+SPARK_MASTER=yarn
+SPARK_DRIVER_MEMORY=1G
+SPARK_NUM_EXECUTORS=2
+SPARK_EXECUTOR_CORES=1
+SPARK_EXECUTOR_MEMORY=1G
+
+SPARK_DEFAULT_PARALLELISM=$(expr $SPARK_NUM_EXECUTORS '*' $SPARK_EXECUTOR_CORES '*' 2)
+
+# ======================================================= #
+
+# Check env
+if [[ -z $SPARK_HOME ]]; then
+ echo SPARK_HOME not defined!
+ exit 1
+fi
+
+if [[ -z $HADOOP_HOME ]]; then
+ echo HADOOP_HOME not defined!
+ exit 1
+fi
+
+export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
+
+# Target jar built
+OAP_MLLIB_JAR_NAME=oap-mllib-0.9.0-with-spark-3.0.0.jar
+OAP_MLLIB_JAR=$OAP_MLLIB_ROOT/mllib-dal/target/$OAP_MLLIB_JAR_NAME
+
+# Use absolute path
+SPARK_DRIVER_CLASSPATH=$OAP_MLLIB_JAR
+# Use relative path
+SPARK_EXECUTOR_CLASSPATH=./$OAP_MLLIB_JAR_NAME
+
+APP_PY=als-pyspark.py
+
+/usr/bin/time -p $SPARK_HOME/bin/spark-submit --master $SPARK_MASTER -v \
+ --num-executors $SPARK_NUM_EXECUTORS \
+ --driver-memory $SPARK_DRIVER_MEMORY \
+ --executor-cores $SPARK_EXECUTOR_CORES \
+ --executor-memory $SPARK_EXECUTOR_MEMORY \
+ --conf "spark.serializer=org.apache.spark.serializer.KryoSerializer" \
+ --conf "spark.default.parallelism=$SPARK_DEFAULT_PARALLELISM" \
+ --conf "spark.sql.shuffle.partitions=$SPARK_DEFAULT_PARALLELISM" \
+ --conf "spark.driver.extraClassPath=$SPARK_DRIVER_CLASSPATH" \
+ --conf "spark.executor.extraClassPath=$SPARK_EXECUTOR_CLASSPATH" \
+ --conf "spark.executorEnv.CCL_KVS_IP_PORT=$CCL_KVS_IP_PORT" \
+ --conf "spark.shuffle.reduceLocality.enabled=false" \
+ --conf "spark.network.timeout=1200s" \
+ --conf "spark.task.maxFailures=1" \
+ --jars $OAP_MLLIB_JAR \
+ $APP_PY $DATA_FILE \
+ 2>&1 | tee ALS-$(date +%m%d_%H_%M_%S).log
diff --git a/examples/kmeans-hibench/build.sh b/examples/kmeans-hibench/build.sh
new file mode 100755
index 000000000..da373645b
--- /dev/null
+++ b/examples/kmeans-hibench/build.sh
@@ -0,0 +1,3 @@
+#!/usr/bin/env bash
+
+mvn clean package
diff --git a/examples/kmeans-hibench/pom.xml b/examples/kmeans-hibench/pom.xml
new file mode 100644
index 000000000..3f5b56e29
--- /dev/null
+++ b/examples/kmeans-hibench/pom.xml
@@ -0,0 +1,99 @@
+
+ 4.0.0
+
+ com.intel.oap
+ oap-mllib-examples
+ 1.1.0-with-spark-3.0.0
+ jar
+
+ KMeansHiBenchExample
+ https://github.com/oap-project/oap-mllib.git
+
+
+ UTF-8
+ 2.12.10
+ 2.12
+ 3.0.0
+
+
+
+
+
+ org.scala-lang
+ scala-library
+ 2.12.10
+
+
+
+ com.github.scopt
+ scopt_2.12
+ 3.7.0
+
+
+
+ org.apache.mahout
+ mahout-hdfs
+ 14.1
+
+
+
+ org.apache.spark
+ spark-sql_2.12
+ ${spark.version}
+ provided
+
+
+
+ org.apache.spark
+ spark-mllib_2.12
+ ${spark.version}
+ provided
+
+
+
+
+
+
+
+ org.scala-tools
+ maven-scala-plugin
+ 2.15.2
+
+
+
+ compile
+ testCompile
+
+
+
+
+ ${scala.version}
+
+ -target:jvm-1.8
+
+
+
+
+ maven-assembly-plugin
+ 3.0.0
+
+ false
+
+ jar-with-dependencies
+
+
+
+
+ assembly
+ package
+
+ single
+
+
+
+
+
+
+
+
diff --git a/examples/kmeans-hibench/run-hibench-oap-mllib.sh b/examples/kmeans-hibench/run-hibench-oap-mllib.sh
new file mode 100755
index 000000000..caa42584f
--- /dev/null
+++ b/examples/kmeans-hibench/run-hibench-oap-mllib.sh
@@ -0,0 +1,86 @@
+#!/usr/bin/env bash
+
+# == User to customize the following environments ======= #
+
+# Set user Spark and Hadoop home directory
+export SPARK_HOME=/path/to/your/spark/home
+export HADOOP_HOME=/path/to/your/hadoop/home
+# Set user HDFS Root
+export HDFS_ROOT=hdfs://your_hostname:8020
+# Set user Intel MLlib Root directory
+export OAP_MLLIB_ROOT=/path/to/your/OAP/oap-mllib
+# Set the IP and port for the oneCCL KVS: pick any one of the worker nodes and set CCL_KVS_IP_PORT to its IP and an available port.
+# The IP can be obtained with `hostname -I`; if multiple IPs are returned, use the first one. The port can be any available port.
+# For example, if one of the worker IPs is 192.168.0.1 and an available port is 51234,
+# CCL_KVS_IP_PORT should be set to 192.168.0.1_51234.
+# Setting this value incorrectly will cause oneCCL to hang during initialization.
+export CCL_KVS_IP_PORT=192.168.0.1_51234
+
+# == User to customize Spark executor cores and memory == #
+
+# Check that the requested resources are actually allocated by the cluster manager, otherwise Intel MLlib will behave incorrectly
+SPARK_MASTER=yarn
+SPARK_DRIVER_MEMORY=8G
+SPARK_NUM_EXECUTORS=6
+SPARK_EXECUTOR_CORES=15
+SPARK_EXECUTOR_MEMORY_OVERHEAD=25G
+SPARK_EXECUTOR_MEMORY=50G
+
+SPARK_DEFAULT_PARALLELISM=$(expr $SPARK_NUM_EXECUTORS '*' $SPARK_EXECUTOR_CORES '*' 2)
+
+# ======================================================= #
+
+# for log suffix
+SUFFIX=$( basename -s .sh "${BASH_SOURCE[0]}" )
+
+# Check envs
+if [[ -z $SPARK_HOME ]]; then
+ echo SPARK_HOME not defined!
+ exit 1
+fi
+
+if [[ -z $HADOOP_HOME ]]; then
+ echo HADOOP_HOME not defined!
+ exit 1
+fi
+
+export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
+
+# Target jar built
+OAP_MLLIB_JAR_NAME=oap-mllib-0.9.0-with-spark-3.0.0.jar
+OAP_MLLIB_JAR=$OAP_MLLIB_ROOT/mllib-dal/target/$OAP_MLLIB_JAR_NAME
+
+# Use absolute path
+SPARK_DRIVER_CLASSPATH=$OAP_MLLIB_JAR
+# Use relative path
+SPARK_EXECUTOR_CLASSPATH=./$OAP_MLLIB_JAR_NAME
+
+APP_JAR=target/oap-mllib-examples-0.9.0-with-spark-3.0.0.jar
+APP_CLASS=com.intel.hibench.sparkbench.ml.DenseKMeansDS
+
+K=200
+INIT_MODE=Random
+MAX_ITERATION=20
+INPUT_HDFS=$HDFS_ROOT/HiBench/Kmeans/Input/samples
+
+/usr/bin/time -p $SPARK_HOME/bin/spark-submit --master $SPARK_MASTER -v \
+ --num-executors $SPARK_NUM_EXECUTORS \
+ --driver-memory $SPARK_DRIVER_MEMORY \
+ --executor-cores $SPARK_EXECUTOR_CORES \
+ --executor-memory $SPARK_EXECUTOR_MEMORY \
+ --conf "spark.serializer=org.apache.spark.serializer.KryoSerializer" \
+ --conf "spark.default.parallelism=$SPARK_DEFAULT_PARALLELISM" \
+ --conf "spark.sql.shuffle.partitions=$SPARK_DEFAULT_PARALLELISM" \
+ --conf "spark.driver.extraClassPath=$SPARK_DRIVER_CLASSPATH" \
+ --conf "spark.executor.extraClassPath=$SPARK_EXECUTOR_CLASSPATH" \
+ --conf "spark.executorEnv.CCL_KVS_IP_PORT=$CCL_KVS_IP_PORT" \
+ --conf "spark.shuffle.reduceLocality.enabled=false" \
+ --conf "spark.executor.memoryOverhead=$SPARK_EXECUTOR_MEMORY_OVERHEAD" \
+ --conf "spark.memory.fraction=0.8" \
+ --conf "spark.network.timeout=1200s" \
+ --conf "spark.task.maxFailures=1" \
+ --jars $OAP_MLLIB_JAR \
+ --class $APP_CLASS \
+ $APP_JAR \
+ -k $K --initMode $INIT_MODE --numIterations $MAX_ITERATION $INPUT_HDFS \
+ 2>&1 | tee KMeansHiBench-$SUFFIX-$(date +%m%d_%H_%M_%S).log
diff --git a/examples/kmeans-hibench/run-hibench-vanilla.sh b/examples/kmeans-hibench/run-hibench-vanilla.sh
new file mode 100755
index 000000000..475c25aff
--- /dev/null
+++ b/examples/kmeans-hibench/run-hibench-vanilla.sh
@@ -0,0 +1,58 @@
+#!/usr/bin/env bash
+
+# == User to customize the following environments ======= #
+
+# Set user Spark and Hadoop home directory
+export SPARK_HOME=/path/to/your/spark/home
+export HADOOP_HOME=/path/to/your/hadoop/home
+# Set user HDFS Root
+export HDFS_ROOT=hdfs://your_hostname:8020
+
+# == User to customize Spark executor cores and memory == #
+
+SPARK_MASTER=yarn
+SPARK_DRIVER_MEMORY=8G
+SPARK_NUM_EXECUTORS=6
+SPARK_EXECUTOR_CORES=15
+SPARK_EXECUTOR_MEMORY=75G
+
+SPARK_DEFAULT_PARALLELISM=$(expr $SPARK_NUM_EXECUTORS '*' $SPARK_EXECUTOR_CORES '*' 2)
+
+# ======================================================= #
+
+# for log suffix
+SUFFIX=$( basename -s .sh "${BASH_SOURCE[0]}" )
+
+# Check envs
+if [[ -z $SPARK_HOME ]]; then
+ echo SPARK_HOME not defined!
+ exit 1
+fi
+
+if [[ -z $HADOOP_HOME ]]; then
+ echo HADOOP_HOME not defined!
+ exit 1
+fi
+
+export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
+
+APP_JAR=target/oap-mllib-examples-0.9.0-with-spark-3.0.0.jar
+APP_CLASS=com.intel.hibench.sparkbench.ml.DenseKMeansDS
+
+K=200
+INIT_MODE=Random
+MAX_ITERATION=20
+INPUT_HDFS=$HDFS_ROOT/HiBench/Kmeans/Input/samples
+
+/usr/bin/time -p $SPARK_HOME/bin/spark-submit --master $SPARK_MASTER -v \
+ --num-executors $SPARK_NUM_EXECUTORS \
+ --driver-memory $SPARK_DRIVER_MEMORY \
+ --executor-cores $SPARK_EXECUTOR_CORES \
+ --executor-memory $SPARK_EXECUTOR_MEMORY \
+ --conf "spark.serializer=org.apache.spark.serializer.KryoSerializer" \
+ --conf "spark.default.parallelism=$SPARK_DEFAULT_PARALLELISM" \
+ --conf "spark.sql.shuffle.partitions=$SPARK_DEFAULT_PARALLELISM" \
+ --class $APP_CLASS \
+ $APP_JAR \
+ -k $K --initMode $INIT_MODE --numIterations $MAX_ITERATION $INPUT_HDFS \
+ 2>&1 | tee KMeansHiBench-$SUFFIX-$(date +%m%d_%H_%M_%S).log
diff --git a/examples/kmeans-hibench/src/main/scala/com/intel/hibench/sparkbench/ml/DenseKMeansDS.scala b/examples/kmeans-hibench/src/main/scala/com/intel/hibench/sparkbench/ml/DenseKMeansDS.scala
new file mode 100644
index 000000000..3a949bb1c
--- /dev/null
+++ b/examples/kmeans-hibench/src/main/scala/com/intel/hibench/sparkbench/ml/DenseKMeansDS.scala
@@ -0,0 +1,107 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.intel.hibench.sparkbench.ml
+
+import breeze.linalg.DenseVector
+import org.apache.hadoop.io.LongWritable
+import org.apache.mahout.math.VectorWritable
+import org.apache.spark.ml.clustering.KMeans
+import org.apache.spark.ml.evaluation.ClusteringEvaluator
+import org.apache.spark.ml.linalg.{Vector, Vectors}
+import org.apache.spark.{SparkConf, SparkContext}
+import org.apache.spark.sql._
+import scopt.OptionParser
+import org.apache.spark.sql.SparkSession
+
+object DenseKMeansDS {
+
+ object InitializationMode extends Enumeration {
+ type InitializationMode = Value
+ val Random, Parallel = Value
+ }
+
+ import com.intel.hibench.sparkbench.ml.DenseKMeansDS.InitializationMode._
+
+ case class Params(input: String = null,
+ k: Int = -1,
+ numIterations: Int = 10,
+ initializationMode: InitializationMode = Random)
+
+ def main(args: Array[String]) {
+ val defaultParams = Params()
+
+ val parser = new OptionParser[Params]("DenseKMeans") {
+ head("DenseKMeans: an example k-means app for dense data.")
+ opt[Int]('k', "k")
+ .required()
+ .text(s"number of clusters, required")
+ .action((x, c) => c.copy(k = x))
+ opt[Int]("numIterations")
+ .text(s"number of iterations, default: ${defaultParams.numIterations}")
+ .action((x, c) => c.copy(numIterations = x))
+ opt[String]("initMode")
+ .text(s"initialization mode (${InitializationMode.values.mkString(",")}), " +
+ s"default: ${defaultParams.initializationMode}")
+ .action((x, c) => c.copy(initializationMode = InitializationMode.withName(x)))
+ arg[String]("")
+ .text("input paths to examples")
+ .required()
+ .action((x, c) => c.copy(input = x))
+ }
+
+ parser.parse(args, defaultParams).map { params =>
+ run(params)
+ }.getOrElse {
+ sys.exit(1)
+ }
+ }
+
+ def run(params: Params) {
+ val spark = SparkSession
+ .builder
+ .appName(s"DenseKMeansDS with $params")
+ .getOrCreate()
+ import spark.implicits._
+
+ val sc = spark.sparkContext
+
+ val data = sc.sequenceFile[LongWritable, VectorWritable](params.input)
+
+ // Wrap each vector in Tuple1 so that toDF can be called to build a single-column DataFrame
+ val dataset = data.map { case (k, v) =>
+ var vector: Array[Double] = new Array[Double](v.get().size)
+ for (i <- 0 until v.get().size) vector(i) = v.get().get(i)
+ Tuple1(Vectors.dense(vector))
+ }.toDF("features")
+
+ val initMode = params.initializationMode match {
+ case Random => "random"
+ case Parallel => "k-means||"
+ }
+
+ val model = new KMeans()
+ .setInitMode(initMode)
+ .setK(params.k)
+ .setMaxIter(params.numIterations)
+ .setSeed(1L)
+ .fit(dataset)
+
+ spark.stop()
+ }
+}
+
diff --git a/mllib-dal/pom.xml b/mllib-dal/pom.xml
index 01e002830..4e51f9157 100644
--- a/mllib-dal/pom.xml
+++ b/mllib-dal/pom.xml
@@ -218,10 +218,12 @@
${env.CCL_ROOT}/lib
- libpmi.so.1
- libresizable_pmi.so.1
+
+
+ libmpi.so.12.0.0
libfabric.so.1
- libccl_atl_ofi.so.1
+ libccl.so
+
@@ -271,9 +273,13 @@
${project.build.testOutputDirectory}/lib/libtbbmalloc.so.2
- ${project.build.testOutputDirectory}/lib/libccl_atl_ofi.so.1
- ${project.build.testOutputDirectory}/lib/libccl_atl_ofi.so
+ ${project.build.testOutputDirectory}/lib/libmpi.so.12.0.0
+ ${project.build.testOutputDirectory}/lib/libmpi.so.12
+
+
+
+
diff --git a/mllib-dal/src/assembly/assembly.xml b/mllib-dal/src/assembly/assembly.xml
index 137f19b81..498b90e02 100644
--- a/mllib-dal/src/assembly/assembly.xml
+++ b/mllib-dal/src/assembly/assembly.xml
@@ -58,26 +58,21 @@
-
+
lib
-
+
lib
+ libmpi.so.12
-
+
lib
lib
-
-
-
- lib
- libccl_atl_ofi.so
-
\ No newline at end of file
diff --git a/mllib-dal/src/main/java/org/apache/spark/ml/recommendation/ALSPartitionInfo.java b/mllib-dal/src/main/java/org/apache/spark/ml/recommendation/ALSPartitionInfo.java
new file mode 100644
index 000000000..5f2561772
--- /dev/null
+++ b/mllib-dal/src/main/java/org/apache/spark/ml/recommendation/ALSPartitionInfo.java
@@ -0,0 +1,6 @@
+package org.apache.spark.ml.recommendation;
+
+public class ALSPartitionInfo {
+ public int ratingsNum;
+ public int csrRowNum;
+}
diff --git a/mllib-dal/src/main/java/org/apache/spark/ml/recommendation/ALSResult.java b/mllib-dal/src/main/java/org/apache/spark/ml/recommendation/ALSResult.java
new file mode 100644
index 000000000..67173a0c5
--- /dev/null
+++ b/mllib-dal/src/main/java/org/apache/spark/ml/recommendation/ALSResult.java
@@ -0,0 +1,9 @@
+package org.apache.spark.ml.recommendation;
+
+public class ALSResult {
+ public long rankId = -1;
+ public long cUsersFactorsNumTab;
+ public long cItemsFactorsNumTab;
+ public long cUserOffset;
+ public long cItemOffset;
+}
diff --git a/mllib-dal/src/main/java/org/apache/spark/ml/util/LibLoader.java b/mllib-dal/src/main/java/org/apache/spark/ml/util/LibLoader.java
index 5b51451ae..d8ea09a23 100644
--- a/mllib-dal/src/main/java/org/apache/spark/ml/util/LibLoader.java
+++ b/mllib-dal/src/main/java/org/apache/spark/ml/util/LibLoader.java
@@ -21,7 +21,8 @@
import java.io.*;
import java.util.UUID;
import java.util.logging.Level;
-import java.util.logging.Logger;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
import com.intel.daal.utils.LibUtils;
@@ -30,8 +31,7 @@ public final class LibLoader {
// Make sure loading libraries from different temp directory for each process
private final static String subDir = "MLlibDAL_" + UUID.randomUUID();
- private static final Logger logger = Logger.getLogger(LibLoader.class.getName());
- private static final Level logLevel = Level.INFO;
+ private static final Logger log = LoggerFactory.getLogger("LibLoader");
/**
* Get temp dir for exacting lib files
@@ -54,24 +54,23 @@ public static synchronized void loadLibraries() throws IOException {
/**
* Load oneCCL libs in dependency order
*/
- public static synchronized void loadLibCCL() throws IOException {
- loadFromJar(subDir, "libpmi.so.1");
- loadFromJar(subDir, "libresizable_pmi.so.1");
+ private static synchronized void loadLibCCL() throws IOException {
loadFromJar(subDir, "libfabric.so.1");
+ loadFromJar(subDir, "libmpi.so.12");
+ loadFromJar(subDir, "libccl.so");
loadFromJar(subDir, "libsockets-fi.so");
- loadFromJar(subDir, "libccl_atl_ofi.so");
}
/**
* Load MLlibDAL lib, it depends TBB libs that are loaded by oneDAL,
* so this function should be called after oneDAL loadLibrary
*/
- public static synchronized void loadLibMLlibDAL() throws IOException {
+ private static synchronized void loadLibMLlibDAL() throws IOException {
// oneDAL Java API doesn't load correct libtbb version for oneAPI Beta 10
// Rename in pom.xml and assembly.xml to workaround.
// See https://github.com/oneapi-src/oneDAL/issues/1254 -->
LibUtils.loadLibrary();
-
+
loadFromJar(subDir, "libMLlibDAL.so");
}
@@ -82,12 +81,12 @@ public static synchronized void loadLibMLlibDAL() throws IOException {
* @param name library name
*/
private static void loadFromJar(String path, String name) throws IOException {
- logger.log(logLevel, "Loading " + name + " ...");
+ log.debug("Loading " + name + " ...");
File fileOut = createTempFile(path, name);
// File exists already
if (fileOut == null) {
- logger.log(logLevel, "DONE: Loading library as resource.");
+ log.debug("DONE: Loading library as resource.");
return;
}
@@ -97,7 +96,7 @@ private static void loadFromJar(String path, String name) throws IOException {
}
try (OutputStream streamOut = new FileOutputStream(fileOut)) {
- logger.log(logLevel, "Writing resource to temp file.");
+ log.debug("Writing resource to temp file.");
byte[] buffer = new byte[32768];
while (true) {
@@ -116,7 +115,7 @@ private static void loadFromJar(String path, String name) throws IOException {
}
System.load(fileOut.toString());
- logger.log(logLevel, "DONE: Loading library as resource.");
+ log.debug("DONE: Loading library as resource.");
}
/**
diff --git a/mllib-dal/src/main/native/ALSDALImpl.cpp b/mllib-dal/src/main/native/ALSDALImpl.cpp
new file mode 100644
index 000000000..53212dc1d
--- /dev/null
+++ b/mllib-dal/src/main/native/ALSDALImpl.cpp
@@ -0,0 +1,668 @@
+#include
+#include
+#include
+#include
+
+#include "OneCCL.h"
+#include "ALSShuffle.h"
+#include "org_apache_spark_ml_recommendation_ALSDALImpl.h"
+#include "service.h"
+
+using namespace std;
+using namespace daal;
+using namespace daal::algorithms;
+using namespace daal::algorithms::implicit_als;
+
+const int ccl_root = 0;
+
+typedef float algorithmFPType; /* Algorithm floating-point type */
+
+NumericTablePtr userOffset;
+NumericTablePtr itemOffset;
+
+// KeyValueDataCollectionPtr userOffsetsOnMaster;
+// KeyValueDataCollectionPtr itemOffsetsOnMaster;
+
+CSRNumericTablePtr dataTable;
+CSRNumericTablePtr transposedDataTable;
+
+KeyValueDataCollectionPtr userStep3LocalInput;
+KeyValueDataCollectionPtr itemStep3LocalInput;
+
+training::DistributedPartialResultStep4Ptr itemsPartialResultLocal;
+training::DistributedPartialResultStep4Ptr usersPartialResultLocal;
+std::vector<training::DistributedPartialResultStep4Ptr> itemsPartialResultsMaster;
+std::vector<training::DistributedPartialResultStep4Ptr> usersPartialResultsMaster;
+
+template <typename T>
+void gather(size_t rankId, ccl::communicator &comm, size_t nBlocks, const ByteBuffer& nodeResults, T* result) {
+ vector<size_t> perNodeArchLengthMaster(nBlocks);
+ size_t perNodeArchLength = nodeResults.size();
+ ByteBuffer serializedData;
+
+ vector<size_t> recv_counts(nBlocks);
+ for (size_t i = 0; i < nBlocks; i++) recv_counts[i] = sizeof(size_t);
+
+ // MPI_Gather(&perNodeArchLength, sizeof(int), MPI_CHAR, perNodeArchLengthMaster,
+ // sizeof(int), MPI_CHAR, ccl_root, MPI_COMM_WORLD);
+ ccl::allgatherv(&perNodeArchLength, sizeof(size_t), perNodeArchLengthMaster.data(), recv_counts,
+ ccl::datatype::uint8, comm).wait();
+
+ // should resize for all ranks for ccl_allgatherv
+ size_t memoryBuf = 0;
+ for (size_t i = 0; i < nBlocks; i++) {
+ memoryBuf += perNodeArchLengthMaster[i];
+ }
+ serializedData.resize(memoryBuf);
+
+ std::vector<size_t> displs(nBlocks);
+ if (rankId == ccl_root) {
+ size_t shift = 0;
+ for (size_t i = 0; i < nBlocks; i++) {
+ displs[i] = shift;
+ shift += perNodeArchLengthMaster[i];
+ }
+ }
+
+ /* Transfer partial results to step 2 on the root node */
+ // MPI_Gatherv(&nodeResults[0], perNodeArchLength, MPI_CHAR, &serializedData[0],
+ // perNodeArchLengthMaster, displs, MPI_CHAR, ccl_root,
+ // MPI_COMM_WORLD);
+ ccl::allgatherv(&nodeResults[0], perNodeArchLength, &serializedData[0],
+ perNodeArchLengthMaster, ccl::datatype::uint8, comm).wait();
+
+ if (rankId == ccl_root) {
+ for (size_t i = 0; i < nBlocks; i++) {
+ /* Deserialize partial results from step 1 */
+ result[i] = result[i]->cast(deserializeDAALObject(&serializedData[0] + displs[i],
+ perNodeArchLengthMaster[i]));
+ }
+ }
+}
+
+// void gatherUsers(const ByteBuffer & nodeResults, int nBlocks)
+// {
+// size_t perNodeArchLengthMaster[nBlocks];
+// size_t perNodeArchLength = nodeResults.size();
+// ByteBuffer serializedData;
+// size_t recv_counts[nBlocks];
+// for (int i = 0; i < nBlocks; i++) {
+// recv_counts[i] = sizeof(size_t);
+// }
+
+// ccl_request_t request;
+// // MPI_Allgather(&perNodeArchLength, sizeof(int), MPI_CHAR,
+// perNodeArchLengthMaster, sizeof(int), MPI_CHAR, MPI_COMM_WORLD);
+// ccl_allgatherv(&perNodeArchLength, sizeof(size_t), perNodeArchLengthMaster,
+// recv_counts, ccl_dtype_char, NULL, NULL, NULL, &request); ccl_wait(request);
+
+// size_t memoryBuf = 0;
+// for (int i = 0; i < nBlocks; i++)
+// {
+// memoryBuf += perNodeArchLengthMaster[i];
+// }
+// serializedData.resize(memoryBuf);
+
+// size_t shift = 0;
+// std::vector displs(nBlocks);
+// for (int i = 0; i < nBlocks; i++)
+// {
+// displs[i] = shift;
+// shift += perNodeArchLengthMaster[i];
+// }
+
+// /* Transfer partial results to step 2 on the root node */
+// // MPI_Allgatherv(&nodeResults[0], perNodeArchLength, MPI_CHAR, &serializedData[0],
+// perNodeArchLengthMaster, displs, MPI_CHAR, MPI_COMM_WORLD);
+// ccl_allgatherv(&nodeResults[0], perNodeArchLength, &serializedData[0],
+// perNodeArchLengthMaster, ccl_dtype_char, NULL, NULL, NULL, &request);
+// ccl_wait(request);
+
+// usersPartialResultsMaster.resize(nBlocks);
+// for (int i = 0; i < nBlocks; i++)
+// {
+// /* Deserialize partial results from step 4 */
+// usersPartialResultsMaster[i] =
+// training::DistributedPartialResultStep4::cast(deserializeDAALObject(&serializedData[0]
+// + displs[i], perNodeArchLengthMaster[i]));
+// }
+// }
+
+// void gatherItems(const ByteBuffer & nodeResults, size_t nBlocks)
+// {
+// size_t perNodeArchLengthMaster[nBlocks];
+// size_t perNodeArchLength = nodeResults.size();
+// ByteBuffer serializedData;
+// size_t recv_counts[nBlocks];
+// for (size_t i = 0; i < nBlocks; i++) {
+// recv_counts[i] = sizeof(size_t);
+// }
+
+// ccl_request_t request;
+// // MPI_Allgather(&perNodeArchLength, sizeof(int), MPI_CHAR,
+// perNodeArchLengthMaster, sizeof(int), MPI_CHAR, MPI_COMM_WORLD);
+// ccl_allgatherv(&perNodeArchLength, sizeof(size_t), perNodeArchLengthMaster,
+// recv_counts, ccl_dtype_char, NULL, NULL, NULL, &request); ccl_wait(request);
+
+// size_t memoryBuf = 0;
+// for (size_t i = 0; i < nBlocks; i++)
+// {
+// memoryBuf += perNodeArchLengthMaster[i];
+// }
+// serializedData.resize(memoryBuf);
+
+// size_t shift = 0;
+// std::vector displs(nBlocks);
+// for (size_t i = 0; i < nBlocks; i++)
+// {
+// displs[i] = shift;
+// shift += perNodeArchLengthMaster[i];
+// }
+
+// /* Transfer partial results to step 2 on the root node */
+// // MPI_Allgatherv(&nodeResults[0], perNodeArchLength, MPI_CHAR, &serializedData[0],
+// perNodeArchLengthMaster, displs, MPI_CHAR, MPI_COMM_WORLD);
+// ccl_allgatherv(&nodeResults[0], perNodeArchLength, &serializedData[0],
+// perNodeArchLengthMaster, ccl_dtype_char, NULL, NULL, NULL, &request);
+// ccl_wait(request);
+
+// itemsPartialResultsMaster.resize(nBlocks);
+// for (size_t i = 0; i < nBlocks; i++)
+// {
+// /* Deserialize partial results from step 4 */
+// itemsPartialResultsMaster[i] =
+// training::DistributedPartialResultStep4::cast(deserializeDAALObject(&serializedData[0]
+// + displs[i], perNodeArchLengthMaster[i]));
+// }
+// }
+
+template <typename T>
+void all2all(ccl::communicator &comm, ByteBuffer* nodeResults, size_t nBlocks, KeyValueDataCollectionPtr result) {
+ size_t memoryBuf = 0;
+ size_t shift = 0;
+    vector<size_t> perNodeArchLengths(nBlocks);
+    vector<size_t> perNodeArchLengthsRecv(nBlocks);
+    std::vector<size_t> sdispls(nBlocks);
+ ByteBuffer serializedSendData;
+ ByteBuffer serializedRecvData;
+
+ for (size_t i = 0; i < nBlocks; i++) {
+ perNodeArchLengths[i] = nodeResults[i].size();
+ memoryBuf += perNodeArchLengths[i];
+ sdispls[i] = shift;
+ shift += perNodeArchLengths[i];
+ }
+ serializedSendData.resize(memoryBuf);
+
+ /* memcpy to avoid double compute */
+ memoryBuf = 0;
+ for (size_t i = 0; i < nBlocks; i++) {
+ for (size_t j = 0; j < perNodeArchLengths[i]; j++)
+ serializedSendData[memoryBuf + j] = nodeResults[i][j];
+ memoryBuf += perNodeArchLengths[i];
+ }
+
+ // MPI_Alltoall(perNodeArchLengths, sizeof(int), MPI_CHAR, perNodeArchLengthsRecv,
+ // sizeof(int), MPI_CHAR, MPI_COMM_WORLD);
+ ccl::alltoall(perNodeArchLengths.data(), perNodeArchLengthsRecv.data(), sizeof(size_t), ccl::datatype::uint8, comm).wait();
+
+ memoryBuf = 0;
+ shift = 0;
+    std::vector<size_t> rdispls(nBlocks);
+ for (size_t i = 0; i < nBlocks; i++) {
+ memoryBuf += perNodeArchLengthsRecv[i];
+ rdispls[i] = shift;
+ shift += perNodeArchLengthsRecv[i];
+ }
+
+ serializedRecvData.resize(memoryBuf);
+
+ /* Transfer partial results to step 2 on the root node */
+ // MPI_Alltoallv(&serializedSendData[0], perNodeArchLengths, sdispls, MPI_CHAR,
+ // &serializedRecvData[0], perNodeArchLengthsRecv, rdispls, MPI_CHAR,
+ // MPI_COMM_WORLD);
+ ccl::alltoallv(&serializedSendData[0], perNodeArchLengths, &serializedRecvData[0],
+ perNodeArchLengthsRecv, ccl::datatype::uint8, comm).wait();
+
+ for (size_t i = 0; i < nBlocks; i++) {
+ (*result)[i] = T::cast(deserializeDAALObject(&serializedRecvData[rdispls[i]],
+ perNodeArchLengthsRecv[i]));
+ }
+}
+
+KeyValueDataCollectionPtr initializeStep1Local(size_t rankId, size_t partitionId,
+ size_t nBlocks, size_t nUsers,
+ size_t nFactors) {
+ int usersPartition[1] = {(int)nBlocks};
+
+ /* Create an algorithm object to initialize the implicit ALS model with the default
+ * method */
+    training::init::Distributed<step1Local, algorithmFPType, training::init::fastCSR>
+        initAlgorithm;
+ initAlgorithm.parameter.fullNUsers = nUsers;
+ initAlgorithm.parameter.nFactors = nFactors;
+ initAlgorithm.parameter.seed += rankId;
+ initAlgorithm.parameter.partition.reset(
+        new HomogenNumericTable<int>((int*)usersPartition, 1, 1));
+ /* Pass a training data set and dependent values to the algorithm */
+ initAlgorithm.input.set(training::init::data, dataTable);
+
+ /* Initialize the implicit ALS model */
+ initAlgorithm.compute();
+
+ training::init::PartialResultPtr partialResult = initAlgorithm.getPartialResult();
+ itemStep3LocalInput = partialResult->get(training::init::outputOfInitForComputeStep3);
+ userOffset = partialResult->get(training::init::offsets, (size_t)rankId);
+ // if (rankId == ccl_root)
+ // {
+ // userOffsetsOnMaster = partialResult->get(training::init::offsets);
+ // }
+ PartialModelPtr partialModelLocal = partialResult->get(training::init::partialModel);
+
+ itemsPartialResultLocal.reset(new training::DistributedPartialResultStep4());
+ itemsPartialResultLocal->set(training::outputOfStep4ForStep1, partialModelLocal);
+
+ return partialResult->get(training::init::outputOfStep1ForStep2);
+}
+
+void initializeStep2Local(size_t rankId, size_t partitionId,
+ const KeyValueDataCollectionPtr& initStep2LocalInput) {
+ /* Create an algorithm object to perform the second step of the implicit ALS
+ * initialization algorithm */
+    training::init::Distributed<step2Local, algorithmFPType, training::init::fastCSR>
+        initAlgorithm;
+
+ initAlgorithm.input.set(training::init::inputOfStep2FromStep1, initStep2LocalInput);
+
+ /* Compute partial results of the second step on local nodes */
+ initAlgorithm.compute();
+
+ training::init::DistributedPartialResultStep2Ptr partialResult =
+ initAlgorithm.getPartialResult();
+ transposedDataTable =
+ CSRNumericTable::cast(partialResult->get(training::init::transposedData));
+ userStep3LocalInput = partialResult->get(training::init::outputOfInitForComputeStep3);
+ itemOffset = partialResult->get(training::init::offsets, (size_t)rankId);
+ // if (rankId == ccl_root)
+ // {
+ // itemOffsetsOnMaster = partialResult->get(training::init::offsets);
+ // }
+}
+
+void initializeModel(size_t rankId, ccl::communicator &comm, size_t partitionId, size_t nBlocks, size_t nUsers,
+ size_t nFactors) {
+ std::cout << "ALS (native): initializeModel " << std::endl;
+
+ auto t1 = std::chrono::high_resolution_clock::now();
+
+ KeyValueDataCollectionPtr initStep1LocalResult =
+ initializeStep1Local(rankId, partitionId, nBlocks, nUsers, nFactors);
+
+ /* MPI_Alltoallv to populate initStep2LocalInput */
+ ByteBuffer nodeCPs[nBlocks];
+ for (size_t i = 0; i < nBlocks; i++) {
+ serializeDAALObject((*initStep1LocalResult)[i].get(), nodeCPs[i]);
+ }
+ KeyValueDataCollectionPtr initStep2LocalInput(new KeyValueDataCollection());
+ all2all(comm, nodeCPs, nBlocks, initStep2LocalInput);
+
+ initializeStep2Local(rankId, partitionId, initStep2LocalInput);
+
+ auto t2 = std::chrono::high_resolution_clock::now();
+    auto duration = std::chrono::duration_cast<std::chrono::seconds>(t2 - t1).count();
+ std::cout << "ALS (native): initializeModel took " << duration << " secs" << std::endl;
+}
+
+training::DistributedPartialResultStep1Ptr computeStep1Local(
+ const training::DistributedPartialResultStep4Ptr& partialResultLocal,
+ size_t nFactors) {
+ /* Create algorithm objects to compute implicit ALS algorithm in the distributed
+ * processing mode on the local node using the default method */
+    training::Distributed<step1Local, algorithmFPType> algorithm;
+ algorithm.parameter.nFactors = nFactors;
+
+ /* Set input objects for the algorithm */
+ algorithm.input.set(training::partialModel,
+ partialResultLocal->get(training::outputOfStep4ForStep1));
+
+ /* Compute partial estimates on local nodes */
+ algorithm.compute();
+
+ /* Get the computed partial estimates */
+ return algorithm.getPartialResult();
+}
+
+NumericTablePtr computeStep2Master(
+ const training::DistributedPartialResultStep1Ptr* step1LocalResultsOnMaster,
+ size_t nFactors, size_t nBlocks) {
+ /* Create algorithm objects to compute implicit ALS algorithm in the distributed
+ * processing mode on the master node using the default method */
+    training::Distributed<step2Master, algorithmFPType> algorithm;
+ algorithm.parameter.nFactors = nFactors;
+
+ /* Set input objects for the algorithm */
+ for (size_t i = 0; i < nBlocks; i++) {
+ algorithm.input.add(training::inputOfStep2FromStep1, step1LocalResultsOnMaster[i]);
+ }
+
+ /* Compute a partial estimate on the master node from the partial estimates on local
+ * nodes */
+ algorithm.compute();
+
+ return algorithm.getPartialResult()->get(training::outputOfStep2ForStep4);
+}
+
+KeyValueDataCollectionPtr computeStep3Local(
+ const NumericTablePtr& offset,
+ const training::DistributedPartialResultStep4Ptr& partialResultLocal,
+ const KeyValueDataCollectionPtr& step3LocalInput, size_t nFactors) {
+    training::Distributed<step3Local, algorithmFPType> algorithm;
+ algorithm.parameter.nFactors = nFactors;
+
+ algorithm.input.set(training::partialModel,
+ partialResultLocal->get(training::outputOfStep4ForStep3));
+ algorithm.input.set(training::inputOfStep3FromInit, step3LocalInput);
+ algorithm.input.set(training::offset, offset);
+
+ algorithm.compute();
+
+ return algorithm.getPartialResult()->get(training::outputOfStep3ForStep4);
+}
+
+training::DistributedPartialResultStep4Ptr computeStep4Local(
+ const CSRNumericTablePtr& dataTable, const NumericTablePtr& step2MasterResult,
+ const KeyValueDataCollectionPtr& step4LocalInput, size_t nFactors) {
+    training::Distributed<step4Local, algorithmFPType, training::fastCSR> algorithm;
+ algorithm.parameter.nFactors = nFactors;
+
+ algorithm.input.set(training::partialModels, step4LocalInput);
+ algorithm.input.set(training::partialData, dataTable);
+ algorithm.input.set(training::inputOfStep4FromStep2, step2MasterResult);
+
+ algorithm.compute();
+
+ return algorithm.getPartialResult();
+}
+
+void trainModel(size_t rankId, ccl::communicator &comm, size_t partitionId, size_t nBlocks, size_t nFactors,
+ size_t maxIterations) {
+ std::cout << "ALS (native): trainModel" << std::endl;
+
+ auto tStart = std::chrono::high_resolution_clock::now();
+
+ training::DistributedPartialResultStep1Ptr step1LocalResultsOnMaster[nBlocks];
+ training::DistributedPartialResultStep1Ptr step1LocalResult;
+ NumericTablePtr step2MasterResult;
+ KeyValueDataCollectionPtr step3LocalResult;
+ KeyValueDataCollectionPtr step4LocalInput(new KeyValueDataCollection());
+
+ ByteBuffer nodeCPs[nBlocks];
+ ByteBuffer nodeResults;
+ ByteBuffer crossProductBuf;
+ int crossProductLen;
+
+ for (size_t iteration = 0; iteration < maxIterations; iteration++) {
+ auto t1 = std::chrono::high_resolution_clock::now();
+
+ //
+ // Update partial users factors
+ //
+ step1LocalResult = computeStep1Local(itemsPartialResultLocal, nFactors);
+
+ serializeDAALObject(step1LocalResult.get(), nodeResults);
+
+ /* Gathering step1LocalResult on the master */
+ gather(rankId, comm, nBlocks, nodeResults, step1LocalResultsOnMaster);
+
+ if (rankId == ccl_root) {
+ step2MasterResult =
+ computeStep2Master(step1LocalResultsOnMaster, nFactors, nBlocks);
+ serializeDAALObject(step2MasterResult.get(), crossProductBuf);
+ crossProductLen = crossProductBuf.size();
+ }
+
+ // MPI_Bcast(&crossProductLen, sizeof(int), MPI_CHAR, ccl_root, MPI_COMM_WORLD);
+ ccl::broadcast(&crossProductLen, sizeof(int), ccl::datatype::uint8, ccl_root, comm).wait();
+
+ if (rankId != ccl_root) {
+ crossProductBuf.resize(crossProductLen);
+ }
+ // MPI_Bcast(&crossProductBuf[0], crossProductLen, MPI_CHAR, ccl_root,
+ // MPI_COMM_WORLD);
+ ccl::broadcast(&crossProductBuf[0], crossProductLen, ccl::datatype::uint8, ccl_root, comm).wait();
+
+ step2MasterResult =
+ NumericTable::cast(deserializeDAALObject(&crossProductBuf[0], crossProductLen));
+
+ step3LocalResult = computeStep3Local(itemOffset, itemsPartialResultLocal,
+ itemStep3LocalInput, nFactors);
+
+ /* MPI_Alltoallv to populate step4LocalInput */
+ for (size_t i = 0; i < nBlocks; i++) {
+ serializeDAALObject((*step3LocalResult)[i].get(), nodeCPs[i]);
+ }
+ all2all(comm, nodeCPs, nBlocks, step4LocalInput);
+
+ usersPartialResultLocal = computeStep4Local(transposedDataTable, step2MasterResult,
+ step4LocalInput, nFactors);
+
+ //
+ // Update partial items factors
+ //
+ step1LocalResult = computeStep1Local(usersPartialResultLocal, nFactors);
+
+ serializeDAALObject(step1LocalResult.get(), nodeResults);
+
+ /* Gathering step1LocalResult on the master */
+ gather(rankId, comm, nBlocks, nodeResults, step1LocalResultsOnMaster);
+
+ if (rankId == ccl_root) {
+ step2MasterResult =
+ computeStep2Master(step1LocalResultsOnMaster, nFactors, nBlocks);
+ serializeDAALObject(step2MasterResult.get(), crossProductBuf);
+ crossProductLen = crossProductBuf.size();
+ }
+
+ // MPI_Bcast(&crossProductLen, sizeof(int), MPI_CHAR, ccl_root, MPI_COMM_WORLD);
+ ccl::broadcast(&crossProductLen, sizeof(int), ccl::datatype::uint8, ccl_root, comm).wait();
+
+ if (rankId != ccl_root) {
+ crossProductBuf.resize(crossProductLen);
+ }
+
+ // MPI_Bcast(&crossProductBuf[0], crossProductLen, MPI_CHAR, ccl_root,
+ // MPI_COMM_WORLD);
+ ccl::broadcast(&crossProductBuf[0], crossProductLen, ccl::datatype::uint8, ccl_root, comm).wait();
+
+ step2MasterResult =
+ NumericTable::cast(deserializeDAALObject(&crossProductBuf[0], crossProductLen));
+
+ step3LocalResult = computeStep3Local(userOffset, usersPartialResultLocal,
+ userStep3LocalInput, nFactors);
+
+ /* MPI_Alltoallv to populate step4LocalInput */
+ for (size_t i = 0; i < nBlocks; i++) {
+ serializeDAALObject((*step3LocalResult)[i].get(), nodeCPs[i]);
+ }
+ all2all(comm, nodeCPs, nBlocks, step4LocalInput);
+
+ itemsPartialResultLocal =
+ computeStep4Local(dataTable, step2MasterResult, step4LocalInput, nFactors);
+
+ auto t2 = std::chrono::high_resolution_clock::now();
+        auto duration = std::chrono::duration_cast<std::chrono::seconds>(t2 - t1).count();
+ std::cout << "ALS (native): iteration " << iteration << " took " << duration
+ << " secs" << std::endl;
+ }
+
+ auto tEnd = std::chrono::high_resolution_clock::now();
+    auto durationTotal =
+        std::chrono::duration_cast<std::chrono::seconds>(tEnd - tStart).count();
+ std::cout << "ALS (native): trainModel took " << durationTotal << " secs" << std::endl;
+
+ /*Gather all itemsPartialResultLocal to itemsPartialResultsMaster on the master and
+ * distributing the result over other ranks*/
+ // serializeDAALObject(itemsPartialResultLocal.get(), nodeResults);
+ // gatherItems(nodeResults, nBlocks);
+
+ // serializeDAALObject(usersPartialResultLocal.get(), nodeResults);
+ // gatherUsers(nodeResults, nBlocks);
+}
+
+static size_t getOffsetFromOffsetTable(NumericTablePtr offsetTable) {
+ size_t ret;
+    BlockDescriptor<> block;
+ offsetTable->getBlockOfRows(0, 1, readOnly, block);
+ ret = (size_t)((block.getBlockPtr())[0]);
+ offsetTable->releaseBlockOfRows(block);
+
+ return ret;
+}
+
+/*
+ * Class: org_apache_spark_ml_recommendation_ALSDALImpl
+ * Method: cShuffleData
+ * Signature:
+ * (Ljava/nio/ByteBuffer;IILorg/apache/spark/ml/recommendation/ALSPartitionInfo;)Ljava/nio/ByteBuffer;
+ */
+JNIEXPORT jobject JNICALL Java_org_apache_spark_ml_recommendation_ALSDALImpl_cShuffleData(
+ JNIEnv* env, jobject obj, jobject dataBuffer, jint nTotalKeys, jint nBlocks,
+ jobject infoObj) {
+ // cout << "cShuffleData: rank " << rankId << endl;
+ cout << "RATING_SIZE: " << RATING_SIZE << endl;
+
+ ccl::communicator &comm = getComm();
+
+ jbyte* ratingsBuf = (jbyte*)env->GetDirectBufferAddress(dataBuffer);
+
+ jlong ratingsNum = env->GetDirectBufferCapacity(dataBuffer) / RATING_SIZE;
+
+    std::vector<RatingPartition> ratingPartitions(nBlocks);
+
+ for (int i = 0; i < ratingsNum; i++) {
+ Rating* rating = (Rating*)(ratingsBuf + RATING_SIZE * i);
+ int partition = getPartiton(rating->user, nTotalKeys, nBlocks);
+ ratingPartitions[partition].push_back(*rating);
+ }
+
+ // for (int i = 0; i < nBlocks; i++) {
+ // cout << "Partition " << i << endl;
+ // for (auto r : ratingPartitions[i]) {
+ // cout << r.user << " " << r.item << " " << r.rating << endl;
+ // }
+ // }
+
+ size_t newRatingsNum = 0;
+ size_t newCsrRowNum = 0;
+ Rating* ratings = shuffle_all2all(comm, ratingPartitions, nBlocks, newRatingsNum, newCsrRowNum);
+
+ // Get the class of the input object
+ jclass clazz = env->GetObjectClass(infoObj);
+ // Get Field references
+ jfieldID ratingsNumField = env->GetFieldID(clazz, "ratingsNum", "I");
+ jfieldID csrRowNumField = env->GetFieldID(clazz, "csrRowNum", "I");
+
+ env->SetIntField(infoObj, ratingsNumField, newRatingsNum);
+ env->SetIntField(infoObj, csrRowNumField, newCsrRowNum);
+
+ return env->NewDirectByteBuffer(ratings, newRatingsNum*RATING_SIZE);
+}
+
+/*
+ * Class: org_apache_spark_ml_recommendation_ALSDALImpl
+ * Method: cDALImplictALS
+ * Signature: (JJIIDDIIILorg/apache/spark/ml/recommendation/ALSResult;)J
+ */
+
+JNIEXPORT jlong JNICALL Java_org_apache_spark_ml_recommendation_ALSDALImpl_cDALImplictALS(
+ JNIEnv* env, jobject obj, jlong numTableAddr, jlong nUsers, jint nFactors,
+ jint maxIter, jdouble regParam, jdouble alpha, jint executor_num, jint executor_cores,
+ jint partitionId, jobject resultObj) {
+
+ ccl::communicator &comm = getComm();
+ size_t rankId = comm.rank();
+
+ dataTable = *((CSRNumericTablePtr*)numTableAddr);
+ // dataTable.reset(createFloatSparseTable("/home/xiaochang/github/oneDAL-upstream/samples/daal/cpp/mpi/data/distributed/implicit_als_csr_1.csv"));
+
+ // printNumericTable(dataTable, "cDALImplictALS", 10);
+ cout << "ALS (native): Input info: " << endl;
+ cout << "- NumberOfRows: " << dataTable->getNumberOfRows() << endl;
+ cout << "- NumberOfColumns: " << dataTable->getNumberOfColumns() << endl;
+ cout << "- NumberOfRatings: " << dataTable->getDataSize() << endl;
+ cout << "- fullNUsers: " << nUsers << endl;
+ cout << "- nFactors: " << nFactors << endl;
+
+ // Set number of threads for oneDAL to use for each rank
+ services::Environment::getInstance()->setNumberOfThreads(executor_cores);
+ int nThreadsNew = services::Environment::getInstance()->getNumberOfThreads();
+ cout << "oneDAL (native): Number of threads used: " << nThreadsNew << endl;
+
+ int nBlocks = executor_num;
+ initializeModel(rankId, comm, partitionId, nBlocks, nUsers, nFactors);
+ trainModel(rankId, comm, partitionId, executor_num, nFactors, maxIter);
+
+ auto pUser =
+ usersPartialResultLocal->get(training::outputOfStep4ForStep1)->getFactors();
+ // auto pUserIndices =
+ // usersPartialResultLocal->get(training::outputOfStep4ForStep1)->getIndices();
+ auto pItem =
+ itemsPartialResultLocal->get(training::outputOfStep4ForStep1)->getFactors();
+ // auto pItemIndices =
+ // itemsPartialResultsMaster[i]->get(training::outputOfStep4ForStep1)->getIndices();
+
+ std::cout << "\n=== Results for Rank " << rankId << "===\n" << std::endl;
+ // std::cout << "Partition ID: " << partitionId << std::endl;
+ printNumericTable(pUser, "User Factors (first 10 rows):", 10);
+ printNumericTable(pItem, "Item Factors (first 10 rows):", 10);
+ std::cout << "User Offset: " << getOffsetFromOffsetTable(userOffset) << std::endl;
+ std::cout << "Item Offset: " << getOffsetFromOffsetTable(itemOffset) << std::endl;
+ std::cout << std::endl;
+
+ // printNumericTable(userOffset, "userOffset");
+ // printNumericTable(itemOffset, "itemOffset");
+
+ // if (rankId == ccl_root) {
+ // for (int i = 0; i < nBlocks; i++) {
+ // printNumericTable(NumericTable::cast((*userOffsetsOnMaster)[i]),
+ // "userOffsetsOnMaster");
+ // }
+
+ // for (int i = 0; i < nBlocks; i++) {
+ // printNumericTable(NumericTable::cast((*itemOffsetsOnMaster)[i]),
+ // "itemOffsetsOnMaster");
+ // }
+ // }
+
+ // printf("native pUser %ld, pItem %ld", (jlong)&pUser, (jlong)&pItem);
+
+ // Get the class of the input object
+ jclass clazz = env->GetObjectClass(resultObj);
+
+ // Fill in rankId
+ jfieldID cRankIdField = env->GetFieldID(clazz, "rankId", "J");
+ env->SetLongField(resultObj, cRankIdField, (jlong)rankId);
+
+ // Fill in cUsersFactorsNumTab & cItemsFactorsNumTab
+ // Get Field references
+ jfieldID cUsersFactorsNumTabField = env->GetFieldID(clazz, "cUsersFactorsNumTab", "J");
+ jfieldID cItemsFactorsNumTabField = env->GetFieldID(clazz, "cItemsFactorsNumTab", "J");
+ // Set factors as result, should use heap memory
+ NumericTablePtr* retUser = new NumericTablePtr(pUser);
+ NumericTablePtr* retItem = new NumericTablePtr(pItem);
+ env->SetLongField(resultObj, cUsersFactorsNumTabField, (jlong)retUser);
+ env->SetLongField(resultObj, cItemsFactorsNumTabField, (jlong)retItem);
+
+ // Fill in cUserOffset & cItemOffset
+ jfieldID cUserOffsetField = env->GetFieldID(clazz, "cUserOffset", "J");
+ assert(cUserOffsetField != NULL);
+ env->SetLongField(resultObj, cUserOffsetField,
+ (jlong)getOffsetFromOffsetTable(userOffset));
+
+ jfieldID cItemOffsetField = env->GetFieldID(clazz, "cItemOffset", "J");
+ assert(cItemOffsetField != NULL);
+ env->SetLongField(resultObj, cItemOffsetField,
+ (jlong)getOffsetFromOffsetTable(itemOffset));
+
+ return 0;
+}
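The gather() and all2all() helpers above follow the same two-phase pattern: exchange the serialized payload lengths first, derive per-block displacements, then move the variable-sized payloads through one flat buffer. The standalone sketch below (illustrative only, not part of the patch; nodeResults and the local ByteBuffer alias are placeholders) shows just the displacement bookkeeping, with comments marking where the oneCCL collectives would sit.

// Illustrative sketch: the displacement bookkeeping behind gather()/all2all(),
// shown without the oneCCL collectives so it runs on its own.
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

using ByteBuffer = std::vector<char>;   // stand-in for the project's ByteBuffer typedef

int main() {
    // Pretend these are the serialized partial results produced by 3 ranks.
    std::vector<ByteBuffer> nodeResults = {{'a', 'b'}, {'c', 'd', 'e'}, {'f'}};
    const size_t nBlocks = nodeResults.size();

    // Phase 1 equivalent: every rank learns every payload length
    // (done with ccl::allgatherv / ccl::alltoall on sizeof(size_t) counts in the patch).
    std::vector<size_t> lengths(nBlocks), displs(nBlocks);
    for (size_t i = 0; i < nBlocks; i++) lengths[i] = nodeResults[i].size();

    // Exclusive prefix sum gives each block's offset in the flat buffer.
    size_t shift = 0;
    for (size_t i = 0; i < nBlocks; i++) { displs[i] = shift; shift += lengths[i]; }

    // Phase 2 equivalent: pack into one contiguous buffer, which is what
    // ccl::allgatherv / ccl::alltoallv actually move.
    ByteBuffer flat(shift);
    for (size_t i = 0; i < nBlocks; i++)
        std::copy(nodeResults[i].begin(), nodeResults[i].end(), flat.begin() + displs[i]);

    // Receiver side: slice the flat buffer per block before calling deserializeDAALObject.
    for (size_t i = 0; i < nBlocks; i++)
        std::cout << "block " << i << " -> "
                  << std::string(flat.data() + displs[i], lengths[i]) << std::endl;
    return 0;
}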
diff --git a/mllib-dal/src/main/native/ALSShuffle.cpp b/mllib-dal/src/main/native/ALSShuffle.cpp
new file mode 100644
index 000000000..73440d253
--- /dev/null
+++ b/mllib-dal/src/main/native/ALSShuffle.cpp
@@ -0,0 +1,102 @@
+#include <iostream>
+#include <algorithm>
+#include <cstring>
+#include <vector>
+#include <jni.h>
+#include <oneapi/ccl.hpp>
+
+#include "ALSShuffle.h"
+
+using namespace std;
+
+std::vector<Rating> recvData;
+
+jlong getPartiton(jlong key, jlong totalKeys, long nBlocks) {
+
+ jlong itemsInBlock = totalKeys / nBlocks;
+
+ return min(key / itemsInBlock, nBlocks - 1);
+}
+
+// Compares two Rating according to userId.
+bool compareRatingByUser(Rating r1, Rating r2)
+{
+ if (r1.user < r2.user)
+ return true;
+ if (r1.user == r2.user && r1.item < r2.item)
+ return true;
+ return false;
+}
+
+bool compareRatingUserEquality(Rating &r1, Rating &r2) {
+ return r1.user == r2.user;
+}
+
+int distinct_count(std::vector<Rating> &data) {
+ long curUser = -1;
+ long count = 0;
+ for (auto i : data) {
+ if (i.user > curUser) {
+ curUser = i.user;
+ count += 1;
+ }
+ }
+ return count;
+}
+
+Rating * shuffle_all2all(ccl::communicator &comm, std::vector<RatingPartition> &partitions, size_t nBlocks, size_t &newRatingsNum, size_t &newCsrRowNum) {
+ size_t sendBufSize = 0;
+ size_t recvBufSize = 0;
+    vector<size_t> perNodeSendLens(nBlocks);
+    vector<size_t> perNodeRecvLens(nBlocks);
+
+ ByteBuffer sendData;
+
+ // Calculate send buffer size
+ for (size_t i = 0; i < nBlocks; i++) {
+ perNodeSendLens[i] = partitions[i].size() * RATING_SIZE;
+ // cout << "rank " << rankId << " Send partition " << i << " size " << perNodeSendLens[i] << endl;
+ sendBufSize += perNodeSendLens[i];
+ }
+ cout << "sendData size " << sendBufSize << endl;
+ sendData.resize(sendBufSize);
+
+ // Fill in send buffer
+ size_t offset = 0;
+ for (size_t i = 0; i < nBlocks; i++)
+ {
+ memcpy(sendData.data()+offset, partitions[i].data(), perNodeSendLens[i]);
+ offset += perNodeSendLens[i];
+ }
+
+ // Send lens first
+ ccl::alltoall(perNodeSendLens.data(), perNodeRecvLens.data(), sizeof(size_t), ccl::datatype::uint8, comm).wait();
+
+ // Calculate recv buffer size
+ for (size_t i = 0; i < nBlocks; i++) {
+ // cout << "rank " << rankId << " Recv partition " << i << " size " << perNodeRecvLens[i] << endl;
+ recvBufSize += perNodeRecvLens[i];
+ }
+
+ int ratingsNum = recvBufSize / RATING_SIZE;
+ recvData.resize(ratingsNum);
+
+ // Send data
+ ccl::alltoallv(sendData.data(), perNodeSendLens, recvData.data(), perNodeRecvLens, ccl::datatype::uint8, comm).wait();
+
+ sort(recvData.begin(), recvData.end(), compareRatingByUser);
+
+ // for (auto r : recvData) {
+ // cout << r.user << " " << r.item << " " << r.rating << endl;
+ // }
+
+ newRatingsNum = recvData.size();
+ // RatingPartition::iterator iter = std::unique(recvData.begin(), recvData.end(), compareRatingUserEquality);
+ // newCsrRowNum = std::distance(recvData.begin(), iter);
+ newCsrRowNum = distinct_count(recvData);
+
+ cout << "newRatingsNum: " << newRatingsNum << " newCsrRowNum: " << newCsrRowNum << endl;
+
+ return recvData.data();
+}
+
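A small standalone sketch of the two helpers above (illustrative only, not part of the patch; partitionOf re-states getPartiton's formula under hypothetical inputs): it shows how user ids map to blocks, and why distinct_count() only works on input already sorted by user, which is exactly what shuffle_all2all() guarantees before calling it.

// Illustrative sketch: block mapping and distinct-user counting used by the shuffle.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

static int64_t partitionOf(int64_t key, int64_t totalKeys, int64_t nBlocks) {
    int64_t itemsInBlock = totalKeys / nBlocks;         // same formula as getPartiton()
    return std::min(key / itemsInBlock, nBlocks - 1);   // last block absorbs the remainder
}

int main() {
    // 10 user ids over 3 blocks: ids 0-2 -> block 0, 3-5 -> block 1, 6-9 -> block 2.
    for (int64_t user = 0; user < 10; user++)
        std::cout << "user " << user << " -> block " << partitionOf(user, 10, 3) << "\n";

    // distinct_count() bumps its counter whenever the user id increases while walking
    // the vector once, so it relies on the user-sorted order produced by shuffle_all2all().
    std::vector<int64_t> users = {1, 1, 2, 5, 5, 5, 7};   // already sorted by user
    int64_t cur = -1, distinct = 0;
    for (int64_t u : users)
        if (u > cur) { cur = u; distinct++; }
    std::cout << "distinct users (CSR rows): " << distinct << "\n";   // prints 4
    return 0;
}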
diff --git a/mllib-dal/src/main/native/ALSShuffle.h b/mllib-dal/src/main/native/ALSShuffle.h
new file mode 100644
index 000000000..dbe864978
--- /dev/null
+++ b/mllib-dal/src/main/native/ALSShuffle.h
@@ -0,0 +1,17 @@
+#pragma once
+
+#include <jni.h>
+
+struct Rating {
+ jlong user;
+ jlong item;
+ jfloat rating;
+} __attribute__((packed));
+
+const int RATING_SIZE = sizeof(Rating);
+
+typedef std::vector<jbyte> ByteBuffer;
+typedef std::vector<Rating> RatingPartition;
+
+jlong getPartiton(jlong key, jlong totalKeys, long nBlocks);
+Rating * shuffle_all2all(ccl::communicator &comm, std::vector<RatingPartition> &partitions, size_t nBlocks, size_t &ratingsNum, size_t &csrRowNum);
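The packed attribute on Rating matters because cShuffleData divides the direct ByteBuffer's capacity by RATING_SIZE, so the native struct layout has to match the 8-byte user, 8-byte item, 4-byte rating records written on the JVM side. A tiny check (illustrative only, not part of the patch; PackedRating/PaddedRating are hypothetical stand-ins) makes the point:

// Illustrative sketch: why Rating is declared __attribute__((packed)).
#include <cstdint>
#include <iostream>

struct PackedRating {
    int64_t user;
    int64_t item;
    float rating;
} __attribute__((packed));          // same attribute as Rating above

struct PaddedRating {               // what the compiler would do without packing
    int64_t user;
    int64_t item;
    float rating;
};

static_assert(sizeof(PackedRating) == 20, "must match the 20-byte record in the ByteBuffer");

int main() {
    std::cout << "packed:   " << sizeof(PackedRating) << " bytes\n";   // 20
    std::cout << "unpacked: " << sizeof(PaddedRating) << " bytes\n";   // typically 24
    return 0;
}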
diff --git a/mllib-dal/src/main/native/KMeansDALImpl.cpp b/mllib-dal/src/main/native/KMeansDALImpl.cpp
index 8ee123754..d9c7a2f29 100644
--- a/mllib-dal/src/main/native/KMeansDALImpl.cpp
+++ b/mllib-dal/src/main/native/KMeansDALImpl.cpp
@@ -14,13 +14,14 @@
* limitations under the License.
*******************************************************************************/
-#include
+#include
#include
+#include
+#include
#include "service.h"
#include "org_apache_spark_ml_clustering_KMeansDALImpl.h"
-#include
-#include
+#include "OneCCL.h"
using namespace std;
using namespace daal;
@@ -30,7 +31,8 @@ const int ccl_root = 0;
typedef double algorithmFPType; /* Algorithm floating-point type */
-static NumericTablePtr kmeans_compute(int rankId, const NumericTablePtr & pData, const NumericTablePtr & initialCentroids,
+static NumericTablePtr kmeans_compute(int rankId, ccl::communicator &comm,
+ const NumericTablePtr & pData, const NumericTablePtr & initialCentroids,
size_t nClusters, size_t nBlocks, algorithmFPType &ret_cost)
{
const bool isRoot = (rankId == ccl_root);
@@ -43,17 +45,13 @@ static NumericTablePtr kmeans_compute(int rankId, const NumericTablePtr & pData,
CentroidsArchLength = inputArch.getSizeOfArchive();
}
- ccl_request_t request;
-
/* Get partial results from the root node */
- ccl_bcast(&CentroidsArchLength, sizeof(size_t), ccl_dtype_char, ccl_root, NULL, NULL, NULL, &request);
- ccl_wait(request);
+ ccl::broadcast(&CentroidsArchLength, sizeof(size_t), ccl::datatype::uint8, ccl_root, comm).wait();
ByteBuffer nodeCentroids(CentroidsArchLength);
if (isRoot) inputArch.copyArchiveToArray(&nodeCentroids[0], CentroidsArchLength);
- ccl_bcast(&nodeCentroids[0], CentroidsArchLength, ccl_dtype_char, ccl_root, NULL, NULL, NULL, &request);
- ccl_wait(request);
+ ccl::broadcast(&nodeCentroids[0], CentroidsArchLength, ccl::datatype::uint8, ccl_root, comm).wait();
/* Deserialize centroids data */
OutputDataArchive outArch(nodeCentroids.size() ? &nodeCentroids[0] : NULL, CentroidsArchLength);
@@ -79,7 +77,7 @@ static NumericTablePtr kmeans_compute(int rankId, const NumericTablePtr & pData,
ByteBuffer serializedData;
/* Serialized data is of equal size on each node if each node called compute() equal number of times */
- size_t* recvCounts = new size_t[nBlocks];
+    vector<size_t> recvCounts(nBlocks);
for (size_t i = 0; i < nBlocks; i++)
{
recvCounts[i] = perNodeArchLength;
@@ -90,10 +88,7 @@ static NumericTablePtr kmeans_compute(int rankId, const NumericTablePtr & pData,
dataArch.copyArchiveToArray(&nodeResults[0], perNodeArchLength);
/* Transfer partial results to step 2 on the root node */
- ccl_allgatherv(&nodeResults[0], perNodeArchLength, &serializedData[0], recvCounts, ccl_dtype_char, NULL, NULL, NULL, &request);
- ccl_wait(request);
-
- delete [] recvCounts;
+ ccl::allgatherv(&nodeResults[0], perNodeArchLength, &serializedData[0], recvCounts, ccl::datatype::uint8, comm).wait();
if (isRoot)
{
@@ -168,8 +163,8 @@ JNIEXPORT jlong JNICALL Java_org_apache_spark_ml_clustering_KMeansDALImpl_cKMean
jint executor_num, jint executor_cores,
jobject resultObj) {
- size_t rankId;
- ccl_get_comm_rank(NULL, &rankId);
+ ccl::communicator &comm = getComm();
+ size_t rankId = comm.rank();
NumericTablePtr pData = *((NumericTablePtr *)pNumTabData);
NumericTablePtr centroids = *((NumericTablePtr *)pNumTabCenters);
@@ -189,16 +184,14 @@ JNIEXPORT jlong JNICALL Java_org_apache_spark_ml_clustering_KMeansDALImpl_cKMean
for (it = 0; it < iteration_num && !converged; it++) {
auto t1 = std::chrono::high_resolution_clock::now();
- newCentroids = kmeans_compute(rankId, pData, centroids, cluster_num, executor_num, totalCost);
+ newCentroids = kmeans_compute(rankId, comm, pData, centroids, cluster_num, executor_num, totalCost);
if (rankId == ccl_root) {
converged = areAllCentersConverged(centroids, newCentroids, tolerance);
}
// Sync converged status
- ccl_request_t request;
- ccl_bcast(&converged, 1, ccl_dtype_char, ccl_root, NULL, NULL, NULL, &request);
- ccl_wait(request);
+ ccl::broadcast(&converged, 1, ccl::datatype::uint8, ccl_root, comm).wait();
centroids = newCentroids;
@@ -228,4 +221,4 @@ JNIEXPORT jlong JNICALL Java_org_apache_spark_ml_clustering_KMeansDALImpl_cKMean
return (jlong)ret;
} else
return (jlong)0;
-}
+}
\ No newline at end of file
diff --git a/mllib-dal/src/main/native/Makefile b/mllib-dal/src/main/native/Makefile
index dfb5076ff..23222e646 100644
--- a/mllib-dal/src/main/native/Makefile
+++ b/mllib-dal/src/main/native/Makefile
@@ -31,17 +31,17 @@ INCS := -I $(JAVA_HOME)/include \
# Use static link if possible, TBB is only available as dynamic libs
-LIBS := -L${CCL_ROOT}/lib -l:libccl.a \
+LIBS := -L${CCL_ROOT}/lib -lccl \
-L$(DAALROOT)/lib/intel64 -l:libdaal_core.a -l:libdaal_thread.a \
- -L$(TBBROOT)/lib/intel64/gcc4.8 -ltbb -ltbbmalloc
+ -L$(TBBROOT)/lib -ltbb -ltbbmalloc
# TODO: Add signal chaining support, should fix linking, package so and loading
# -L$(JAVA_HOME)/jre/lib/amd64 -ljsig
CPP_SRCS += \
-./OneCCL.cpp ./OneDAL.cpp ./KMeansDALImpl.cpp ./PCADALImpl.cpp ./service.cpp ./error_handling.cpp
+./OneCCL.cpp ./OneDAL.cpp ./KMeansDALImpl.cpp ./PCADALImpl.cpp ./ALSDALImpl.cpp ./ALSShuffle.cpp ./service.cpp ./error_handling.cpp
OBJS += \
-./OneCCL.o ./OneDAL.o ./KMeansDALImpl.o ./PCADALImpl.o ./service.o ./error_handling.o
+./OneCCL.o ./OneDAL.o ./KMeansDALImpl.o ./PCADALImpl.o ./ALSDALImpl.o ./ALSShuffle.o ./service.o ./error_handling.o
# Output Binary
OUTPUT = ../../../target/libMLlibDAL.so
diff --git a/mllib-dal/src/main/native/OneCCL.cpp b/mllib-dal/src/main/native/OneCCL.cpp
index b23be9737..c733c7b33 100644
--- a/mllib-dal/src/main/native/OneCCL.cpp
+++ b/mllib-dal/src/main/native/OneCCL.cpp
@@ -1,26 +1,63 @@
#include <iostream>
-#include <ccl.h>
+#include <oneapi/ccl.hpp>
+
+#include <ifaddrs.h>
+#include <netdb.h>
+#include <sys/socket.h>
+#include <arpa/inet.h>
+
+#include <chrono>
+#include <list>
+#include <vector>
+
+#include <unistd.h>
+
#include "org_apache_spark_ml_util_OneCCL__.h"
+// todo: fill initial comm_size and rank_id
+size_t comm_size;
+size_t rank_id;
+
+std::vector<ccl::communicator> g_comms;
+
+ccl::communicator &getComm() {
+ return g_comms[0];
+}
+
JNIEXPORT jint JNICALL Java_org_apache_spark_ml_util_OneCCL_00024_c_1init
- (JNIEnv *env, jobject obj, jobject param) {
+ (JNIEnv *env, jobject obj, jint size, jint rank, jstring ip_port, jobject param) {
- std::cout << "oneCCL (native): init" << std::endl;
+ std::cerr << "OneCCL (native): init" << std::endl;
- ccl_init();
+ auto t1 = std::chrono::high_resolution_clock::now();
- jclass cls = env->GetObjectClass(param);
- jfieldID fid_comm_size = env->GetFieldID(cls, "commSize", "J");
- jfieldID fid_rank_id = env->GetFieldID(cls, "rankId", "J");
+ ccl::init();
+
+ const char *str = env->GetStringUTFChars(ip_port, 0);
+ ccl::string ccl_ip_port(str);
- size_t comm_size;
- size_t rank_id;
+ auto kvs_attr = ccl::create_kvs_attr();
+ kvs_attr.set(ccl_ip_port);
- ccl_get_comm_size(NULL, &comm_size);
- ccl_get_comm_rank(NULL, &rank_id);
+  ccl::shared_ptr_class<ccl::kvs> kvs;
+ kvs = ccl::create_main_kvs(kvs_attr);
+
+ g_comms.push_back(ccl::create_communicator(size, rank, kvs));
+
+ auto t2 = std::chrono::high_resolution_clock::now();
+  auto duration = std::chrono::duration_cast<std::chrono::seconds>( t2 - t1 ).count();
+ std::cerr << "OneCCL (native): init took " << duration << " secs" << std::endl;
+
+ rank_id = getComm().rank();
+ comm_size = getComm().size();
+
+ jclass cls = env->GetObjectClass(param);
+ jfieldID fid_comm_size = env->GetFieldID(cls, "commSize", "J");
+ jfieldID fid_rank_id = env->GetFieldID(cls, "rankId", "J");
env->SetLongField(param, fid_comm_size, comm_size);
env->SetLongField(param, fid_rank_id, rank_id);
+ env->ReleaseStringUTFChars(ip_port, str);
return 1;
}
@@ -33,9 +70,10 @@ JNIEXPORT jint JNICALL Java_org_apache_spark_ml_util_OneCCL_00024_c_1init
JNIEXPORT void JNICALL Java_org_apache_spark_ml_util_OneCCL_00024_c_1cleanup
(JNIEnv *env, jobject obj) {
- std::cout << "oneCCL (native): cleanup" << std::endl;
+ g_comms.pop_back();
+
+ std::cerr << "OneCCL (native): cleanup" << std::endl;
- ccl_finalize();
}
/*
@@ -44,12 +82,9 @@ JNIEXPORT void JNICALL Java_org_apache_spark_ml_util_OneCCL_00024_c_1cleanup
* Signature: ()Z
*/
JNIEXPORT jboolean JNICALL Java_org_apache_spark_ml_util_OneCCL_00024_isRoot
- (JNIEnv *env, jobject obj) {
+ (JNIEnv *env, jobject obj) {
- size_t rank_id;
- ccl_get_comm_rank(NULL, &rank_id);
-
- return (rank_id == 0);
+ return getComm().rank() == 0;
}
/*
@@ -59,12 +94,7 @@ JNIEXPORT jboolean JNICALL Java_org_apache_spark_ml_util_OneCCL_00024_isRoot
*/
JNIEXPORT jint JNICALL Java_org_apache_spark_ml_util_OneCCL_00024_rankID
(JNIEnv *env, jobject obj) {
-
- size_t rank_id;
- ccl_get_comm_rank(NULL, &rank_id);
-
- return rank_id;
-
+ return getComm().rank();
}
/*
@@ -85,3 +115,115 @@ JNIEXPORT jint JNICALL Java_org_apache_spark_ml_util_OneCCL_00024_setEnv
return err;
}
+
+static const int CCL_IP_LEN = 128;
+std::list<std::string> local_host_ips;
+
+static int fill_local_host_ip() {
+ struct ifaddrs *ifaddr, *ifa;
+ int family = AF_UNSPEC;
+ char local_ip[CCL_IP_LEN];
+ if (getifaddrs(&ifaddr) < 0) {
+ // LOG_ERROR("fill_local_host_ip: can not get host IP");
+ return -1;
+ }
+
+ const char iface_name[] = "lo";
+ local_host_ips.clear();
+
+ for (ifa = ifaddr; ifa != NULL; ifa = ifa->ifa_next) {
+ if (ifa->ifa_addr == NULL)
+ continue;
+ if (strstr(ifa->ifa_name, iface_name) == NULL) {
+ family = ifa->ifa_addr->sa_family;
+ if (family == AF_INET) {
+ memset(local_ip, 0, CCL_IP_LEN);
+ int res = getnameinfo(
+ ifa->ifa_addr,
+ (family == AF_INET) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6),
+ local_ip,
+ CCL_IP_LEN,
+ NULL,
+ 0,
+ NI_NUMERICHOST);
+ if (res != 0) {
+ std::string s("fill_local_host_ip: getnameinfo error > ");
+ s.append(gai_strerror(res));
+ // LOG_ERROR(s.c_str());
+ return -1;
+ }
+ local_host_ips.push_back(local_ip);
+ }
+ }
+ }
+ if (local_host_ips.empty()) {
+ // LOG_ERROR("fill_local_host_ip: can't find interface to get host IP");
+ return -1;
+ }
+ // memset(local_host_ip, 0, CCL_IP_LEN);
+ // strncpy(local_host_ip, local_host_ips.front().c_str(), CCL_IP_LEN);
+
+ // for (auto &ip : local_host_ips)
+ // cout << ip << endl;
+
+ freeifaddrs(ifaddr);
+ return 0;
+}
+
+static bool is_valid_ip(char ip[]) {
+ if (fill_local_host_ip() == -1) {
+ std::cerr << "fill_local_host_ip error" << std::endl;
+ };
+  for (std::list<std::string>::iterator it = local_host_ips.begin(); it != local_host_ips.end(); ++it) {
+ if (*it == ip) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
+/*
+ * Class: org_apache_spark_ml_util_OneCCL__
+ * Method: getAvailPort
+ * Signature: (Ljava/lang/String;)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_spark_ml_util_OneCCL_00024_c_1getAvailPort
+ (JNIEnv *env, jobject obj, jstring localIP) {
+
+ // start from beginning of dynamic port
+ const int port_start_base = 3000;
+
+ char* local_host_ip = (char *) env->GetStringUTFChars(localIP, NULL);
+
+ // check if the input ip is one of host's ips
+ if (!is_valid_ip(local_host_ip))
+ return -1;
+
+ struct sockaddr_in main_server_address;
+ int server_listen_sock;
+ in_port_t port = port_start_base;
+
+ if ((server_listen_sock = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
+ perror("OneCCL (native) getAvailPort error!");
+ return -1;
+ }
+
+ main_server_address.sin_family = AF_INET;
+ main_server_address.sin_addr.s_addr = inet_addr(local_host_ip);
+ main_server_address.sin_port = htons(port);
+
+ // search for available port
+ while (bind(server_listen_sock,
+ (const struct sockaddr *)&main_server_address,
+ sizeof(main_server_address)) < 0) {
+ port++;
+ main_server_address.sin_port = htons(port);
+ }
+
+ close(server_listen_sock);
+
+ env->ReleaseStringUTFChars(localIP, local_host_ip);
+
+ return port;
+}
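For testing the probe logic outside the JVM, the same loop as c_getAvailPort can be run as a standalone program (illustrative only, not part of the patch; the command-line IP argument is a placeholder). As in the JNI version, the reported port can still be claimed by another process before the oneCCL bootstrap binds it, which is why the Scala side also honors the spark.oap.mllib.oneccl.kvs.port override.

// Illustrative sketch: the port-probing loop from c_getAvailPort as a CLI tool.
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <unistd.h>
#include <cstdio>

int main(int argc, char **argv) {
    const char *ip = (argc > 1) ? argv[1] : "127.0.0.1";
    const in_port_t port_start_base = 3000;            // same base as the JNI code

    int sock = socket(AF_INET, SOCK_STREAM, 0);
    if (sock < 0) { perror("socket"); return 1; }

    sockaddr_in addr{};
    addr.sin_family = AF_INET;
    addr.sin_addr.s_addr = inet_addr(ip);
    in_port_t port = port_start_base;
    addr.sin_port = htons(port);

    // Probe upward until bind() succeeds; the first bindable port is reported as free.
    while (bind(sock, reinterpret_cast<const sockaddr *>(&addr), sizeof(addr)) < 0) {
        port++;
        addr.sin_port = htons(port);
    }
    printf("first available port on %s: %d\n", ip, port);
    close(sock);
    return 0;
}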
diff --git a/mllib-dal/src/main/native/OneCCL.h b/mllib-dal/src/main/native/OneCCL.h
new file mode 100644
index 000000000..b579c4697
--- /dev/null
+++ b/mllib-dal/src/main/native/OneCCL.h
@@ -0,0 +1,5 @@
+#pragma once
+
+#include <oneapi/ccl.hpp>
+
+ccl::communicator &getComm();
diff --git a/mllib-dal/src/main/native/OneDAL.cpp b/mllib-dal/src/main/native/OneDAL.cpp
index 2210ffd1c..792225c3e 100644
--- a/mllib-dal/src/main/native/OneDAL.cpp
+++ b/mllib-dal/src/main/native/OneDAL.cpp
@@ -19,6 +19,8 @@
#include
#include "org_apache_spark_ml_util_OneDAL__.h"
+#include "service.h"
+
using namespace daal;
using namespace daal::data_management;
@@ -123,3 +125,51 @@ JNIEXPORT jboolean JNICALL Java_org_apache_spark_ml_util_OneDAL_00024_cCheckPlat
// Only guarantee compatibility and performance on Intel platforms, use oneDAL lib function
return daal_check_is_intel_cpu();
}
+
+/*
+ * Class: org_apache_spark_ml_util_OneDAL__
+ * Method: cNewCSRNumericTable
+ * Signature: ([F[J[JJJ)J
+ */
+JNIEXPORT jlong JNICALL Java_org_apache_spark_ml_util_OneDAL_00024_cNewCSRNumericTable
+ (JNIEnv *env, jobject, jfloatArray data, jlongArray colIndices, jlongArray rowOffsets, jlong nFeatures, jlong nVectors) {
+
+ long numData = env->GetArrayLength(data);
+ // long numColIndices = numData;
+ // long numRowOffsets = env->GetArrayLength(rowOffsets);
+
+ size_t * resultRowOffsets = NULL;
+ size_t * resultColIndices = NULL;
+ float * resultData = NULL;
+ CSRNumericTable * numericTable = new CSRNumericTable(resultData, resultColIndices, resultRowOffsets, nFeatures, nVectors);
+ numericTable->allocateDataMemory(numData);
+ numericTable->getArrays(&resultData, &resultColIndices, &resultRowOffsets);
+
+ size_t * pRowOffsets = (size_t *)env->GetLongArrayElements(rowOffsets, 0);
+ size_t * pColIndices = (size_t *)env->GetLongArrayElements(colIndices, 0);
+ float * pData = env->GetFloatArrayElements(data, 0);
+
+ // std::memcpy(resultRowOffsets, pRowOffsets, numRowOffsets*sizeof(jlong));
+ // std::memcpy(resultColIndices, pColIndices, numColIndices*sizeof(jlong));
+ // std::memcpy(resultData, pData, numData*sizeof(float));
+
+ for (size_t i = 0; i < (size_t)numData; ++i)
+ {
+ resultData[i] = pData[i];
+ resultColIndices[i] = pColIndices[i];
+ }
+ for (size_t i = 0; i < (size_t)nVectors + 1; ++i)
+ {
+ resultRowOffsets[i] = pRowOffsets[i];
+ }
+
+ env->ReleaseLongArrayElements(rowOffsets, (jlong *)pRowOffsets, 0);
+ env->ReleaseLongArrayElements(colIndices, (jlong *)pColIndices, 0);
+ env->ReleaseFloatArrayElements(data, pData, 0);
+
+ CSRNumericTablePtr *ret = new CSRNumericTablePtr(numericTable);
+
+ //printNumericTable(*ret, "cNewCSRNumericTable", 10);
+
+ return (jlong)ret;
+}
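cNewCSRNumericTable expects the three CSR arrays in the usual shapes: numData values, numData column indices, and nVectors + 1 row offsets; the JNI code copies them through unchanged, so the producer must follow the indexing convention the CSRNumericTable constructor assumes (oneDAL's CSR tables default to one-based indexing, which is worth verifying against the Scala producer). A self-contained sketch of those shapes for a hypothetical 2 x 3 matrix, without any oneDAL dependency:

// Illustrative sketch: CSR array shapes for a 2 x 3 matrix
//   [ 1.0  0   2.5 ]
//   [ 0    3.0 0   ]
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
    std::vector<float>   data       = {1.0f, 2.5f, 3.0f};  // numData non-zero values
    std::vector<int64_t> colIndices = {1, 3, 2};            // numData one-based columns
    std::vector<int64_t> rowOffsets = {1, 3, 4};            // nVectors + 1 one-based offsets

    const int64_t nVectors = rowOffsets.size() - 1;
    for (int64_t row = 0; row < nVectors; row++) {
        std::cout << "row " << row << ":";
        for (int64_t k = rowOffsets[row] - 1; k < rowOffsets[row + 1] - 1; k++)
            std::cout << "  col " << colIndices[k] << " = " << data[k];
        std::cout << "\n";
    }
    return 0;
}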
diff --git a/mllib-dal/src/main/native/PCADALImpl.cpp b/mllib-dal/src/main/native/PCADALImpl.cpp
index 3b06fc0dc..33e2bc95d 100644
--- a/mllib-dal/src/main/native/PCADALImpl.cpp
+++ b/mllib-dal/src/main/native/PCADALImpl.cpp
@@ -1,4 +1,3 @@
-#include
#include
#include "service.h"
@@ -7,6 +6,7 @@
#include
#include "org_apache_spark_ml_feature_PCADALImpl.h"
+#include "OneCCL.h"
using namespace std;
using namespace daal;
@@ -24,8 +24,9 @@ typedef double algorithmFPType; /* Algorithm floating-point type */
JNIEXPORT jlong JNICALL Java_org_apache_spark_ml_feature_PCADALImpl_cPCATrainDAL(
JNIEnv *env, jobject obj, jlong pNumTabData, jint k, jint executor_num, jint executor_cores,
jobject resultObj) {
- size_t rankId;
- ccl_get_comm_rank(NULL, &rankId);
+
+ ccl::communicator &comm = getComm();
+ size_t rankId = comm.rank();
const size_t nBlocks = executor_num;
const int comm_size = executor_num;
@@ -40,6 +41,8 @@ JNIEXPORT jlong JNICALL Java_org_apache_spark_ml_feature_PCADALImpl_cPCATrainDAL
int nThreadsNew = services::Environment::getInstance()->getNumberOfThreads();
cout << "oneDAL (native): Number of threads used: " << nThreadsNew << endl;
+ auto t1 = std::chrono::high_resolution_clock::now();
+
 pca::Distributed<step1Local, algorithmFPType, pca::svdDense> localAlgorithm;
/* Set the input data set to the algorithm */
@@ -48,6 +51,12 @@ JNIEXPORT jlong JNICALL Java_org_apache_spark_ml_feature_PCADALImpl_cPCATrainDAL
/* Compute PCA decomposition */
localAlgorithm.compute();
+ auto t2 = std::chrono::high_resolution_clock::now();
+  auto duration = std::chrono::duration_cast<std::chrono::seconds>( t2 - t1 ).count();
+ std::cout << "PCA (native): local step took " << duration << " secs" << std::endl;
+
+ t1 = std::chrono::high_resolution_clock::now();
+
/* Serialize partial results required by step 2 */
 services::SharedPtr<byte> serializedData;
InputDataArchive dataArch;
@@ -59,31 +68,31 @@ JNIEXPORT jlong JNICALL Java_org_apache_spark_ml_feature_PCADALImpl_cPCATrainDAL
byte* nodeResults = new byte[perNodeArchLength];
dataArch.copyArchiveToArray(nodeResults, perNodeArchLength);
- ccl_request_t request;
+ t2 = std::chrono::high_resolution_clock::now();
+
+  duration = std::chrono::duration_cast<std::chrono::seconds>( t2 - t1 ).count();
+ std::cout << "PCA (native): serializing partial results took " << duration << " secs" << std::endl;
- size_t* recv_counts = new size_t[comm_size * perNodeArchLength];
+  vector<size_t> recv_counts(comm_size * perNodeArchLength);
for (int i = 0; i < comm_size; i++) recv_counts[i] = perNodeArchLength;
cout << "PCA (native): ccl_allgatherv receiving " << perNodeArchLength * nBlocks << " bytes" << endl;
- auto t1 = std::chrono::high_resolution_clock::now();
+ t1 = std::chrono::high_resolution_clock::now();
/* Transfer partial results to step 2 on the root node */
// MPI_Gather(nodeResults, perNodeArchLength, MPI_CHAR, serializedData.get(),
// perNodeArchLength, MPI_CHAR, ccl_root, MPI_COMM_WORLD);
- ccl_allgatherv(nodeResults, perNodeArchLength, serializedData.get(), recv_counts,
- ccl_dtype_char, NULL, NULL, NULL, &request);
- ccl_wait(request);
+ ccl::allgatherv(nodeResults, perNodeArchLength, serializedData.get(), recv_counts,
+ ccl::datatype::uint8, comm).wait();
- auto t2 = std::chrono::high_resolution_clock::now();
+ t2 = std::chrono::high_resolution_clock::now();
-  auto duration = std::chrono::duration_cast<std::chrono::seconds>( t2 - t1 ).count();
+  duration = std::chrono::duration_cast<std::chrono::seconds>( t2 - t1 ).count();
std::cout << "PCA (native): ccl_allgatherv took " << duration << " secs" << std::endl;
- delete[] nodeResults;
-
if (rankId == ccl_root) {
- auto t1 = std::chrono::high_resolution_clock::now();
+ auto t1 = std::chrono::high_resolution_clock::now();
/* Create an algorithm for principal component analysis using the svdDense method
* on the master node */
diff --git a/mllib-dal/src/main/native/build-jni.sh b/mllib-dal/src/main/native/build-jni.sh
index bee614dcd..dacd8382b 100755
--- a/mllib-dal/src/main/native/build-jni.sh
+++ b/mllib-dal/src/main/native/build-jni.sh
@@ -18,4 +18,5 @@ javah -d $WORK_DIR/javah -classpath "$WORK_DIR/../../../target/classes:$DAAL_JAR
org.apache.spark.ml.util.OneCCL$ \
org.apache.spark.ml.util.OneDAL$ \
org.apache.spark.ml.clustering.KMeansDALImpl \
- org.apache.spark.ml.feature.PCADALImpl
+ org.apache.spark.ml.feature.PCADALImpl \
+ org.apache.spark.ml.recommendation.ALSDALImpl
diff --git a/mllib-dal/src/main/native/javah/org_apache_spark_ml_recommendation_ALSDALImpl.h b/mllib-dal/src/main/native/javah/org_apache_spark_ml_recommendation_ALSDALImpl.h
new file mode 100644
index 000000000..73024d05b
--- /dev/null
+++ b/mllib-dal/src/main/native/javah/org_apache_spark_ml_recommendation_ALSDALImpl.h
@@ -0,0 +1,29 @@
+/* DO NOT EDIT THIS FILE - it is machine generated */
+#include <jni.h>
+/* Header for class org_apache_spark_ml_recommendation_ALSDALImpl */
+
+#ifndef _Included_org_apache_spark_ml_recommendation_ALSDALImpl
+#define _Included_org_apache_spark_ml_recommendation_ALSDALImpl
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+ * Class: org_apache_spark_ml_recommendation_ALSDALImpl
+ * Method: cDALImplictALS
+ * Signature: (JJIIDDIIILorg/apache/spark/ml/recommendation/ALSResult;)J
+ */
+JNIEXPORT jlong JNICALL Java_org_apache_spark_ml_recommendation_ALSDALImpl_cDALImplictALS
+ (JNIEnv *, jobject, jlong, jlong, jint, jint, jdouble, jdouble, jint, jint, jint, jobject);
+
+/*
+ * Class: org_apache_spark_ml_recommendation_ALSDALImpl
+ * Method: cShuffleData
+ * Signature: (Ljava/nio/ByteBuffer;IILorg/apache/spark/ml/recommendation/ALSPartitionInfo;)Ljava/nio/ByteBuffer;
+ */
+JNIEXPORT jobject JNICALL Java_org_apache_spark_ml_recommendation_ALSDALImpl_cShuffleData
+ (JNIEnv *, jobject, jobject, jint, jint, jobject);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/mllib-dal/src/main/native/javah/org_apache_spark_ml_util_OneCCL__.h b/mllib-dal/src/main/native/javah/org_apache_spark_ml_util_OneCCL__.h
index 60825ae3f..580c34bf9 100644
--- a/mllib-dal/src/main/native/javah/org_apache_spark_ml_util_OneCCL__.h
+++ b/mllib-dal/src/main/native/javah/org_apache_spark_ml_util_OneCCL__.h
@@ -10,10 +10,10 @@ extern "C" {
/*
* Class: org_apache_spark_ml_util_OneCCL__
* Method: c_init
- * Signature: (Lorg/apache/spark/ml/util/CCLParam;)I
+ * Signature: (IILjava/lang/String;Lorg/apache/spark/ml/util/CCLParam;)I
*/
JNIEXPORT jint JNICALL Java_org_apache_spark_ml_util_OneCCL_00024_c_1init
- (JNIEnv *, jobject, jobject);
+ (JNIEnv *, jobject, jint, jint, jstring, jobject);
/*
* Class: org_apache_spark_ml_util_OneCCL__
@@ -47,6 +47,14 @@ JNIEXPORT jint JNICALL Java_org_apache_spark_ml_util_OneCCL_00024_rankID
JNIEXPORT jint JNICALL Java_org_apache_spark_ml_util_OneCCL_00024_setEnv
(JNIEnv *, jobject, jstring, jstring, jboolean);
+/*
+ * Class: org_apache_spark_ml_util_OneCCL__
+ * Method: c_getAvailPort
+ * Signature: (Ljava/lang/String;)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_spark_ml_util_OneCCL_00024_c_1getAvailPort
+ (JNIEnv *, jobject, jstring);
+
#ifdef __cplusplus
}
#endif
diff --git a/mllib-dal/src/main/native/javah/org_apache_spark_ml_util_OneDAL__.h b/mllib-dal/src/main/native/javah/org_apache_spark_ml_util_OneDAL__.h
index 5f67d9428..d88462d66 100644
--- a/mllib-dal/src/main/native/javah/org_apache_spark_ml_util_OneDAL__.h
+++ b/mllib-dal/src/main/native/javah/org_apache_spark_ml_util_OneDAL__.h
@@ -47,6 +47,14 @@ JNIEXPORT void JNICALL Java_org_apache_spark_ml_util_OneDAL_00024_cFreeDataMemor
JNIEXPORT jboolean JNICALL Java_org_apache_spark_ml_util_OneDAL_00024_cCheckPlatformCompatibility
(JNIEnv *, jobject);
+/*
+ * Class: org_apache_spark_ml_util_OneDAL__
+ * Method: cNewCSRNumericTable
+ * Signature: ([F[J[JJJ)J
+ */
+JNIEXPORT jlong JNICALL Java_org_apache_spark_ml_util_OneDAL_00024_cNewCSRNumericTable
+ (JNIEnv *, jobject, jfloatArray, jlongArray, jlongArray, jlong, jlong);
+
#ifdef __cplusplus
}
#endif
diff --git a/mllib-dal/src/main/native/service.cpp b/mllib-dal/src/main/native/service.cpp
index 9316b3b62..623767406 100644
--- a/mllib-dal/src/main/native/service.cpp
+++ b/mllib-dal/src/main/native/service.cpp
@@ -125,6 +125,10 @@ CSRNumericTable * createSparseTable(const std::string & datasetFileName)
return numericTable;
}
+CSRNumericTable * createFloatSparseTable(const std::string & datasetFileName) {
+ return createSparseTable(datasetFileName);
+}
+
void printAprioriItemsets(NumericTablePtr largeItemsetsTable, NumericTablePtr largeItemsetsSupportTable, size_t nItemsetToPrint = 20)
{
size_t largeItemsetCount = largeItemsetsSupportTable->getNumberOfRows();
diff --git a/mllib-dal/src/main/native/service.h b/mllib-dal/src/main/native/service.h
index 37b702aea..b6a2cc5c5 100644
--- a/mllib-dal/src/main/native/service.h
+++ b/mllib-dal/src/main/native/service.h
@@ -43,5 +43,8 @@ typedef std::vector ByteBuffer;
void printNumericTable(const NumericTablePtr & dataTable, const char * message = "", size_t nPrintedRows = 0, size_t nPrintedCols = 0,
size_t interval = 10);
+size_t serializeDAALObject(SerializationIface * pData, ByteBuffer & buffer);
+SerializationIfacePtr deserializeDAALObject(daal::byte * buff, size_t length);
+CSRNumericTable * createFloatSparseTable(const std::string & datasetFileName);
#endif
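The two helpers declared above are the glue between oneDAL objects and the raw byte buffers that the oneCCL collectives move around. A minimal usage fragment (illustrative only, not part of the patch; it assumes oneDAL is on the include path and the using-declarations that service.h already relies on) mirrors how ALSDALImpl.cpp round-trips a table; gather() and all2all() differ only in which collective carries the buffer and which concrete Ptr type the final cast targets.

// Illustrative fragment: serialize / deserialize round trip via the service.h helpers.
#include "service.h"

NumericTablePtr roundTrip(const NumericTablePtr &table) {
    // Flatten any DAAL SerializationIface object into a byte buffer...
    ByteBuffer buffer;
    serializeDAALObject(table.get(), buffer);

    // ...the buffer would be shipped here via ccl::broadcast / allgatherv / alltoallv...

    // ...then rebuilt on the receiving side and downcast to the expected type.
    return NumericTable::cast(deserializeDAALObject(&buffer[0], buffer.size()));
}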
diff --git a/mllib-dal/src/main/scala/org/apache/spark/ml/clustering/KMeansDALImpl.scala b/mllib-dal/src/main/scala/org/apache/spark/ml/clustering/KMeansDALImpl.scala
index 5f29fe441..e9e7ec36d 100644
--- a/mllib-dal/src/main/scala/org/apache/spark/ml/clustering/KMeansDALImpl.scala
+++ b/mllib-dal/src/main/scala/org/apache/spark/ml/clustering/KMeansDALImpl.scala
@@ -40,8 +40,6 @@ class KMeansDALImpl (
instr.foreach(_.logInfo(s"Processing partitions with $executorNum executors"))
- val executorIPAddress = Utils.sparkFirstExecutorIP(data.sparkContext)
-
// repartition to executorNum if not enough partitions
val dataForConversion = if (data.getNumPartitions < executorNum) {
data.repartition(executorNum).setName("Repartitioned for conversion").cache()
@@ -49,6 +47,13 @@ class KMeansDALImpl (
data
}
+ val executorIPAddress = Utils.sparkFirstExecutorIP(dataForConversion.sparkContext)
+ val kvsIP = dataForConversion.sparkContext.conf.get("spark.oap.mllib.oneccl.kvs.ip", executorIPAddress)
+ val kvsPortDetected = Utils.checkExecutorAvailPort(dataForConversion, kvsIP)
+ val kvsPort = dataForConversion.sparkContext.conf.getInt("spark.oap.mllib.oneccl.kvs.port", kvsPortDetected)
+
+ val kvsIPPort = kvsIP+"_"+kvsPort
+
val partitionDims = Utils.getPartitionDims(dataForConversion)
// filter the empty partitions
@@ -64,14 +69,14 @@ class KMeansDALImpl (
val it = entry._3
val numCols = partitionDims(index)._2
- println(s"KMeansDALImpl: Partition index: $index, numCols: $numCols, numRows: $numRows")
+ logDebug(s"KMeansDALImpl: Partition index: $index, numCols: $numCols, numRows: $numRows")
// Build DALMatrix, this will load libJavaAPI, libtbb, libtbbmalloc
val context = new DaalContext()
val matrix = new DALMatrix(context, classOf[java.lang.Double],
numCols.toLong, numRows.toLong, NumericTable.AllocationFlag.DoAllocate)
- println("KMeansDALImpl: Loading native libraries" )
+ logDebug("KMeansDALImpl: Loading native libraries" )
// oneDAL libs should be loaded by now, extract libMLlibDAL.so to temp file and load
LibLoader.loadLibraries()
@@ -111,10 +116,9 @@ class KMeansDALImpl (
}.cache()
- val results = coalescedTables.mapPartitions { table =>
+ val results = coalescedTables.mapPartitionsWithIndex { (rank, table) =>
val tableArr = table.next()
-
- OneCCL.init(executorNum, executorIPAddress, OneCCL.KVS_PORT)
+ OneCCL.init(executorNum, rank, kvsIPPort)
val initCentroids = OneDAL.makeNumericTable(centers)
val result = new KMeansResult()
diff --git a/mllib-dal/src/main/scala/org/apache/spark/ml/feature/PCADALImpl.scala b/mllib-dal/src/main/scala/org/apache/spark/ml/feature/PCADALImpl.scala
index 1760aa171..e1bba3d37 100644
--- a/mllib-dal/src/main/scala/org/apache/spark/ml/feature/PCADALImpl.scala
+++ b/mllib-dal/src/main/scala/org/apache/spark/ml/feature/PCADALImpl.scala
@@ -18,19 +18,20 @@
package org.apache.spark.ml.feature
import java.util.Arrays
-
import com.intel.daal.data_management.data.{HomogenNumericTable, NumericTable}
+import org.apache.spark.internal.Logging
import org.apache.spark.ml.linalg._
import org.apache.spark.ml.util.{OneCCL, OneDAL, Utils}
import org.apache.spark.mllib.feature.{PCAModel => MLlibPCAModel}
import org.apache.spark.mllib.linalg.{DenseMatrix => OldDenseMatrix, Vectors => OldVectors}
import org.apache.spark.rdd.RDD
-import org.apache.spark.mllib.feature.{ StandardScaler => MLlibStandardScaler }
+import org.apache.spark.mllib.feature.{StandardScaler => MLlibStandardScaler}
class PCADALImpl (
val k: Int,
val executorNum: Int,
- val executorCores: Int) extends Serializable {
+ val executorCores: Int)
+ extends Serializable with Logging {
// Normalize data before apply fitWithDAL
private def normalizeData(input: RDD[Vector]) : RDD[Vector] = {
@@ -40,17 +41,23 @@ class PCADALImpl (
res.map(_.asML)
}
- def fitWithDAL(input: RDD[Vector]) : MLlibPCAModel = {
+ def fitWithDAL(data: RDD[Vector]) : MLlibPCAModel = {
- val normalizedData = normalizeData(input)
+ val normalizedData = normalizeData(data)
val coalescedTables = OneDAL.rddVectorToNumericTables(normalizedData, executorNum)
- val executorIPAddress = Utils.sparkFirstExecutorIP(input.sparkContext)
+ val executorIPAddress = Utils.sparkFirstExecutorIP(coalescedTables.sparkContext)
+ val kvsIP = coalescedTables.sparkContext.conf.get("spark.oap.mllib.oneccl.kvs.ip", executorIPAddress)
+
+ val kvsPortDetected = Utils.checkExecutorAvailPort(coalescedTables, kvsIP)
+ val kvsPort = coalescedTables.sparkContext.conf.getInt("spark.oap.mllib.oneccl.kvs.port", kvsPortDetected)
+
+ val kvsIPPort = kvsIP+"_"+kvsPort
- val results = coalescedTables.mapPartitions { table =>
+ val results = coalescedTables.mapPartitionsWithIndex { (rank, table) =>
val tableArr = table.next()
- OneCCL.init(executorNum, executorIPAddress, OneCCL.KVS_PORT)
+ OneCCL.init(executorNum, rank, kvsIPPort)
val result = new PCAResult()
cPCATrainDAL(
diff --git a/mllib-dal/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala b/mllib-dal/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala
new file mode 100644
index 000000000..9196873fb
--- /dev/null
+++ b/mllib-dal/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala
@@ -0,0 +1,1885 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.recommendation
+
+import java.{util => ju}
+import java.io.IOException
+import java.util.Locale
+
+import scala.collection.mutable
+import scala.reflect.ClassTag
+import scala.util.{Sorting, Try}
+import scala.util.hashing.byteswap64
+
+import com.github.fommil.netlib.BLAS.{getInstance => blas}
+import org.apache.hadoop.fs.Path
+import org.json4s.DefaultFormats
+import org.json4s.JsonDSL._
+
+import org.apache.spark.{Dependency, Partitioner, ShuffleDependency, SparkContext, SparkException}
+import org.apache.spark.annotation.Since
+import org.apache.spark.internal.Logging
+import org.apache.spark.ml.{Estimator, Model}
+import org.apache.spark.ml.linalg.BLAS
+import org.apache.spark.ml.param._
+import org.apache.spark.ml.param.shared._
+import org.apache.spark.ml.util.{Utils => DALImplUtils, _}
+import org.apache.spark.ml.util.Instrumentation.instrumented
+import org.apache.spark.mllib.linalg.CholeskyDecomposition
+import org.apache.spark.mllib.optimization.NNLS
+import org.apache.spark.rdd.{DeterministicLevel, RDD}
+import org.apache.spark.sql.{DataFrame, Dataset}
+import org.apache.spark.sql.functions._
+import org.apache.spark.sql.types._
+import org.apache.spark.storage.StorageLevel
+import org.apache.spark.util.{BoundedPriorityQueue, Utils}
+import org.apache.spark.util.collection.{OpenHashMap, OpenHashSet, SortDataFormat, Sorter}
+import org.apache.spark.util.random.XORShiftRandom
+
+/**
+ * Common params for ALS and ALSModel.
+ */
+private[recommendation] trait ALSModelParams extends Params with HasPredictionCol
+ with HasBlockSize {
+ /**
+ * Param for the column name for user ids. Ids must be integers. Other
+ * numeric types are supported for this column, but will be cast to integers as long as they
+ * fall within the integer value range.
+ * Default: "user"
+ * @group param
+ */
+ val userCol = new Param[String](this, "userCol", "column name for user ids. Ids must be within " +
+ "the integer value range.")
+
+ /** @group getParam */
+ def getUserCol: String = $(userCol)
+
+ /**
+ * Param for the column name for item ids. Ids must be integers. Other
+ * numeric types are supported for this column, but will be cast to integers as long as they
+ * fall within the integer value range.
+ * Default: "item"
+ * @group param
+ */
+ val itemCol = new Param[String](this, "itemCol", "column name for item ids. Ids must be within " +
+ "the integer value range.")
+
+ /** @group getParam */
+ def getItemCol: String = $(itemCol)
+
+ /**
+ * Attempts to safely cast a user/item id to an Int. Throws an exception if the value is
+ * out of integer range or contains a fractional part.
+ */
+ protected[recommendation] val checkedCast = udf { (n: Any) =>
+ n match {
+ case v: Int => v // Avoid unnecessary casting
+ case v: Number =>
+ val intV = v.intValue
+ // Checks if number within Int range and has no fractional part.
+ if (v.doubleValue == intV) {
+ intV
+ } else {
+ throw new IllegalArgumentException(s"ALS only supports values in Integer range " +
+ s"and without fractional part for columns ${$(userCol)} and ${$(itemCol)}. " +
+ s"Value $n was either out of Integer range or contained a fractional part that " +
+ s"could not be converted.")
+ }
+ case _ => throw new IllegalArgumentException(s"ALS only supports values in Integer range " +
+ s"for columns ${$(userCol)} and ${$(itemCol)}. Value $n was not numeric.")
+ }
+ }
+
+ /**
+ * Param for strategy for dealing with unknown or new users/items at prediction time.
+ * This may be useful in cross-validation or production scenarios, for handling user/item ids
+ * the model has not seen in the training data.
+ * Supported values:
+ * - "nan": predicted value for unknown ids will be NaN.
+ * - "drop": rows in the input DataFrame containing unknown ids will be dropped from
+ * the output DataFrame containing predictions.
+ * Default: "nan".
+ * @group expertParam
+ */
+ val coldStartStrategy = new Param[String](this, "coldStartStrategy",
+ "strategy for dealing with unknown or new users/items at prediction time. This may be " +
+ "useful in cross-validation or production scenarios, for handling user/item ids the model " +
+ "has not seen in the training data. Supported values: " +
+ s"${ALSModel.supportedColdStartStrategies.mkString(",")}.",
+ (s: String) =>
+ ALSModel.supportedColdStartStrategies.contains(s.toLowerCase(Locale.ROOT)))
+
+ /** @group expertGetParam */
+ def getColdStartStrategy: String = $(coldStartStrategy).toLowerCase(Locale.ROOT)
+
+ setDefault(blockSize -> 4096)
+}
+
+/**
+ * Common params for ALS.
+ */
+private[recommendation] trait ALSParams extends ALSModelParams with HasMaxIter with HasRegParam
+ with HasCheckpointInterval with HasSeed {
+
+ /**
+ * Param for rank of the matrix factorization (positive).
+ * Default: 10
+ * @group param
+ */
+ val rank = new IntParam(this, "rank", "rank of the factorization", ParamValidators.gtEq(1))
+
+ /** @group getParam */
+ def getRank: Int = $(rank)
+
+ /**
+ * Param for number of user blocks (positive).
+ * Default: 10
+ * @group param
+ */
+ val numUserBlocks = new IntParam(this, "numUserBlocks", "number of user blocks",
+ ParamValidators.gtEq(1))
+
+ /** @group getParam */
+ def getNumUserBlocks: Int = $(numUserBlocks)
+
+ /**
+ * Param for number of item blocks (positive).
+ * Default: 10
+ * @group param
+ */
+ val numItemBlocks = new IntParam(this, "numItemBlocks", "number of item blocks",
+ ParamValidators.gtEq(1))
+
+ /** @group getParam */
+ def getNumItemBlocks: Int = $(numItemBlocks)
+
+ /**
+ * Param to decide whether to use implicit preference.
+ * Default: false
+ * @group param
+ */
+ val implicitPrefs = new BooleanParam(this, "implicitPrefs", "whether to use implicit preference")
+
+ /** @group getParam */
+ def getImplicitPrefs: Boolean = $(implicitPrefs)
+
+ /**
+ * Param for the alpha parameter in the implicit preference formulation (nonnegative).
+ * Default: 1.0
+ * @group param
+ */
+ val alpha = new DoubleParam(this, "alpha", "alpha for implicit preference",
+ ParamValidators.gtEq(0))
+
+ /** @group getParam */
+ def getAlpha: Double = $(alpha)
+
+ /**
+ * Param for the column name for ratings.
+ * Default: "rating"
+ * @group param
+ */
+ val ratingCol = new Param[String](this, "ratingCol", "column name for ratings")
+
+ /** @group getParam */
+ def getRatingCol: String = $(ratingCol)
+
+ /**
+ * Param for whether to apply nonnegativity constraints.
+ * Default: false
+ * @group param
+ */
+ val nonnegative = new BooleanParam(
+ this, "nonnegative", "whether to use nonnegative constraint for least squares")
+
+ /** @group getParam */
+ def getNonnegative: Boolean = $(nonnegative)
+
+ /**
+ * Param for StorageLevel for intermediate datasets. Pass in a string representation of
+ * `StorageLevel`. Cannot be "NONE".
+ * Default: "MEMORY_AND_DISK".
+ *
+ * @group expertParam
+ */
+ val intermediateStorageLevel = new Param[String](this, "intermediateStorageLevel",
+ "StorageLevel for intermediate datasets. Cannot be 'NONE'.",
+ (s: String) => Try(StorageLevel.fromString(s)).isSuccess && s != "NONE")
+
+ /** @group expertGetParam */
+ def getIntermediateStorageLevel: String = $(intermediateStorageLevel)
+
+ /**
+ * Param for StorageLevel for ALS model factors. Pass in a string representation of
+ * `StorageLevel`.
+ * Default: "MEMORY_AND_DISK".
+ *
+ * @group expertParam
+ */
+ val finalStorageLevel = new Param[String](this, "finalStorageLevel",
+ "StorageLevel for ALS model factors.",
+ (s: String) => Try(StorageLevel.fromString(s)).isSuccess)
+
+ /** @group expertGetParam */
+ def getFinalStorageLevel: String = $(finalStorageLevel)
+
+ setDefault(rank -> 10, maxIter -> 10, regParam -> 0.1, numUserBlocks -> 10, numItemBlocks -> 10,
+ implicitPrefs -> false, alpha -> 1.0, userCol -> "user", itemCol -> "item",
+ ratingCol -> "rating", nonnegative -> false, checkpointInterval -> 10,
+ intermediateStorageLevel -> "MEMORY_AND_DISK", finalStorageLevel -> "MEMORY_AND_DISK",
+ coldStartStrategy -> "nan")
+
+ /**
+ * Validates and transforms the input schema.
+ *
+ * @param schema input schema
+ * @return output schema
+ */
+ protected def validateAndTransformSchema(schema: StructType): StructType = {
+ // user and item will be cast to Int
+ SchemaUtils.checkNumericType(schema, $(userCol))
+ SchemaUtils.checkNumericType(schema, $(itemCol))
+ // rating will be cast to Float
+ SchemaUtils.checkNumericType(schema, $(ratingCol))
+ SchemaUtils.appendColumn(schema, $(predictionCol), FloatType)
+ }
+}
+
+/**
+ * Model fitted by ALS.
+ *
+ * @param rank rank of the matrix factorization model
+ * @param userFactors a DataFrame that stores user factors in two columns: `id` and `features`
+ * @param itemFactors a DataFrame that stores item factors in two columns: `id` and `features`
+ */
+@Since("1.3.0")
+class ALSModel private[ml] (
+ @Since("1.4.0") override val uid: String,
+ @Since("1.4.0") val rank: Int,
+ @transient val userFactors: DataFrame,
+ @transient val itemFactors: DataFrame)
+ extends Model[ALSModel] with ALSModelParams with MLWritable {
+
+ /** @group setParam */
+ @Since("1.4.0")
+ def setUserCol(value: String): this.type = set(userCol, value)
+
+ /** @group setParam */
+ @Since("1.4.0")
+ def setItemCol(value: String): this.type = set(itemCol, value)
+
+ /** @group setParam */
+ @Since("1.3.0")
+ def setPredictionCol(value: String): this.type = set(predictionCol, value)
+
+ /** @group expertSetParam */
+ @Since("2.2.0")
+ def setColdStartStrategy(value: String): this.type = set(coldStartStrategy, value)
+
+ /**
+ * Set block size for stacking input data in matrices.
+ * Default is 4096.
+ *
+ * @group expertSetParam
+ */
+ @Since("3.0.0")
+ def setBlockSize(value: Int): this.type = set(blockSize, value)
+
+ private val predict = udf { (featuresA: Seq[Float], featuresB: Seq[Float]) =>
+ if (featuresA != null && featuresB != null) {
+ var dotProduct = 0.0f
+ var i = 0
+ while (i < rank) {
+ dotProduct += featuresA(i) * featuresB(i)
+ i += 1
+ }
+ dotProduct
+ } else {
+ Float.NaN
+ }
+ }
+
+ @Since("2.0.0")
+ override def transform(dataset: Dataset[_]): DataFrame = {
+ transformSchema(dataset.schema)
+    // create a new column named $(predictionCol) by running the predict UDF.
+ val predictions = dataset
+ .join(userFactors,
+ checkedCast(dataset($(userCol))) === userFactors("id"), "left")
+ .join(itemFactors,
+ checkedCast(dataset($(itemCol))) === itemFactors("id"), "left")
+ .select(dataset("*"),
+ predict(userFactors("features"), itemFactors("features")).as($(predictionCol)))
+ getColdStartStrategy match {
+ case ALSModel.Drop =>
+ predictions.na.drop("all", Seq($(predictionCol)))
+ case ALSModel.NaN =>
+ predictions
+ }
+ }
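+
+  /*
+   * Usage sketch (hypothetical `model` and `test`): with coldStartStrategy = "drop", rows whose
+   * user or item was unseen during training (and whose prediction is therefore NaN) are removed
+   * before evaluation; with the default "nan" they are kept with Float.NaN predictions.
+   *
+   * {{{
+   *   val predictions = model.setColdStartStrategy("drop").transform(test)
+   * }}}
+   */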
+
+ @Since("1.3.0")
+ override def transformSchema(schema: StructType): StructType = {
+ // user and item will be cast to Int
+ SchemaUtils.checkNumericType(schema, $(userCol))
+ SchemaUtils.checkNumericType(schema, $(itemCol))
+ SchemaUtils.appendColumn(schema, $(predictionCol), FloatType)
+ }
+
+ @Since("1.5.0")
+ override def copy(extra: ParamMap): ALSModel = {
+ val copied = new ALSModel(uid, rank, userFactors, itemFactors)
+ copyValues(copied, extra).setParent(parent)
+ }
+
+ @Since("1.6.0")
+ override def write: MLWriter = new ALSModel.ALSModelWriter(this)
+
+ @Since("3.0.0")
+ override def toString: String = {
+ s"ALSModel: uid=$uid, rank=$rank"
+ }
+
+ /**
+ * Returns top `numItems` items recommended for each user, for all users.
+ * @param numItems max number of recommendations for each user
+ * @return a DataFrame of (userCol: Int, recommendations), where recommendations are
+ * stored as an array of (itemCol: Int, rating: Float) Rows.
+ */
+ @Since("2.2.0")
+ def recommendForAllUsers(numItems: Int): DataFrame = {
+ recommendForAll(userFactors, itemFactors, $(userCol), $(itemCol), numItems, $(blockSize))
+ }
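+
+  /*
+   * Output sketch (default column names assumed): the returned DataFrame has one row per user
+   * with the top recommendations nested as an array of (item, rating) structs.
+   *
+   * {{{
+   *   model.recommendForAllUsers(3).printSchema()
+   *   // root
+   *   //  |-- user: integer
+   *   //  |-- recommendations: array<struct<item: integer, rating: float>>
+   * }}}
+   */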
+
+ /**
+ * Returns top `numItems` items recommended for each user id in the input data set. Note that if
+ * there are duplicate ids in the input dataset, only one set of recommendations per unique id
+ * will be returned.
+ * @param dataset a Dataset containing a column of user ids. The column name must match `userCol`.
+ * @param numItems max number of recommendations for each user.
+ * @return a DataFrame of (userCol: Int, recommendations), where recommendations are
+ * stored as an array of (itemCol: Int, rating: Float) Rows.
+ */
+ @Since("2.3.0")
+ def recommendForUserSubset(dataset: Dataset[_], numItems: Int): DataFrame = {
+ val srcFactorSubset = getSourceFactorSubset(dataset, userFactors, $(userCol))
+ recommendForAll(srcFactorSubset, itemFactors, $(userCol), $(itemCol), numItems, $(blockSize))
+ }
+
+ /**
+ * Returns top `numUsers` users recommended for each item, for all items.
+ * @param numUsers max number of recommendations for each item
+ * @return a DataFrame of (itemCol: Int, recommendations), where recommendations are
+ * stored as an array of (userCol: Int, rating: Float) Rows.
+ */
+ @Since("2.2.0")
+ def recommendForAllItems(numUsers: Int): DataFrame = {
+ recommendForAll(itemFactors, userFactors, $(itemCol), $(userCol), numUsers, $(blockSize))
+ }
+
+ /**
+ * Returns top `numUsers` users recommended for each item id in the input data set. Note that if
+ * there are duplicate ids in the input dataset, only one set of recommendations per unique id
+ * will be returned.
+ * @param dataset a Dataset containing a column of item ids. The column name must match `itemCol`.
+ * @param numUsers max number of recommendations for each item.
+ * @return a DataFrame of (itemCol: Int, recommendations), where recommendations are
+ * stored as an array of (userCol: Int, rating: Float) Rows.
+ */
+ @Since("2.3.0")
+ def recommendForItemSubset(dataset: Dataset[_], numUsers: Int): DataFrame = {
+ val srcFactorSubset = getSourceFactorSubset(dataset, itemFactors, $(itemCol))
+ recommendForAll(srcFactorSubset, userFactors, $(itemCol), $(userCol), numUsers, $(blockSize))
+ }
+
+ /**
+ * Returns a subset of a factor DataFrame limited to only those unique ids contained
+ * in the input dataset.
+   * @param dataset input Dataset containing the id column used to filter factors.
+ * @param factors factor DataFrame to filter.
+ * @param column column name containing the ids in the input dataset.
+ * @return DataFrame containing factors only for those ids present in both the input dataset and
+ * the factor DataFrame.
+ */
+ private def getSourceFactorSubset(
+ dataset: Dataset[_],
+ factors: DataFrame,
+ column: String): DataFrame = {
+ factors
+ .join(dataset.select(column), factors("id") === dataset(column), joinType = "left_semi")
+ .select(factors("id"), factors("features"))
+ }
+
+ /**
+ * Makes recommendations for all users (or items).
+ *
+   * Note: the previous approach to computing top-k recommendations
+ * used a cross-join followed by predicting a score for each row of the joined dataset.
+ * However, this results in exploding the size of intermediate data. While Spark SQL makes it
+ * relatively efficient, the approach implemented here is significantly more efficient.
+ *
+ * This approach groups factors into blocks and computes the top-k elements per block,
+ * using dot product and an efficient [[BoundedPriorityQueue]] (instead of gemm).
+ * It then computes the global top-k by aggregating the per block top-k elements with
+ * a [[TopByKeyAggregator]]. This significantly reduces the size of intermediate and shuffle data.
+ * This is the DataFrame equivalent to the approach used in
+ * [[org.apache.spark.mllib.recommendation.MatrixFactorizationModel]].
+ *
+ * @param srcFactors src factors for which to generate recommendations
+ * @param dstFactors dst factors used to make recommendations
+ * @param srcOutputColumn name of the column for the source ID in the output DataFrame
+ * @param dstOutputColumn name of the column for the destination ID in the output DataFrame
+ * @param num max number of recommendations for each record
+ * @return a DataFrame of (srcOutputColumn: Int, recommendations), where recommendations are
+ * stored as an array of (dstOutputColumn: Int, rating: Float) Rows.
+ */
+ private def recommendForAll(
+ srcFactors: DataFrame,
+ dstFactors: DataFrame,
+ srcOutputColumn: String,
+ dstOutputColumn: String,
+ num: Int,
+ blockSize: Int): DataFrame = {
+ import srcFactors.sparkSession.implicits._
+
+ val srcFactorsBlocked = blockify(srcFactors.as[(Int, Array[Float])], blockSize)
+ val dstFactorsBlocked = blockify(dstFactors.as[(Int, Array[Float])], blockSize)
+ val ratings = srcFactorsBlocked.crossJoin(dstFactorsBlocked)
+ .as[(Seq[(Int, Array[Float])], Seq[(Int, Array[Float])])]
+ .flatMap { case (srcIter, dstIter) =>
+ val m = srcIter.size
+ val n = math.min(dstIter.size, num)
+ val output = new Array[(Int, Int, Float)](m * n)
+ var i = 0
+ val pq = new BoundedPriorityQueue[(Int, Float)](num)(Ordering.by(_._2))
+ srcIter.foreach { case (srcId, srcFactor) =>
+ dstIter.foreach { case (dstId, dstFactor) =>
+ // We use F2jBLAS which is faster than a call to native BLAS for vector dot product
+ val score = BLAS.f2jBLAS.sdot(rank, srcFactor, 1, dstFactor, 1)
+ pq += dstId -> score
+ }
+ pq.foreach { case (dstId, score) =>
+ output(i) = (srcId, dstId, score)
+ i += 1
+ }
+ pq.clear()
+ }
+ output.toSeq
+ }
+ // We'll force the IDs to be Int. Unfortunately this converts IDs to Int in the output.
+ val topKAggregator = new TopByKeyAggregator[Int, Int, Float](num, Ordering.by(_._2))
+ val recs = ratings.as[(Int, Int, Float)].groupByKey(_._1).agg(topKAggregator.toColumn)
+ .toDF("id", "recommendations")
+
+ val arrayType = ArrayType(
+ new StructType()
+ .add(dstOutputColumn, IntegerType)
+ .add("rating", FloatType)
+ )
+ recs.select($"id".as(srcOutputColumn), $"recommendations".cast(arrayType))
+ }
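+
+  /*
+   * Sketch of the per-block top-k step above (toy numbers): each block pair keeps only its
+   * local top `num` scores in a BoundedPriorityQueue, and TopByKeyAggregator then merges those
+   * local winners into the global top `num` per id, so the full score matrix is never
+   * materialized.
+   *
+   * {{{
+   *   val pq = new BoundedPriorityQueue[(Int, Float)](2)(Ordering.by(_._2))
+   *   pq ++= Seq((10, 0.3f), (11, 0.9f), (12, 0.5f))
+   *   pq.toSeq.sortBy(-_._2)   // Seq((11, 0.9), (12, 0.5)) -- only the top 2 survive
+   * }}}
+   */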
+
+ /**
+ * Blockifies factors to improve the efficiency of cross join
+ */
+ private def blockify(
+ factors: Dataset[(Int, Array[Float])],
+ blockSize: Int): Dataset[Seq[(Int, Array[Float])]] = {
+ import factors.sparkSession.implicits._
+ factors.mapPartitions(_.grouped(blockSize))
+ }
+
+}
+
+@Since("1.6.0")
+object ALSModel extends MLReadable[ALSModel] {
+
+ private val NaN = "nan"
+ private val Drop = "drop"
+ private[recommendation] final val supportedColdStartStrategies = Array(NaN, Drop)
+
+ @Since("1.6.0")
+ override def read: MLReader[ALSModel] = new ALSModelReader
+
+ @Since("1.6.0")
+ override def load(path: String): ALSModel = super.load(path)
+
+ private[ALSModel] class ALSModelWriter(instance: ALSModel) extends MLWriter {
+
+ override protected def saveImpl(path: String): Unit = {
+ val extraMetadata = "rank" -> instance.rank
+ DefaultParamsWriter.saveMetadata(instance, path, sc, Some(extraMetadata))
+ val userPath = new Path(path, "userFactors").toString
+ instance.userFactors.write.format("parquet").save(userPath)
+ val itemPath = new Path(path, "itemFactors").toString
+ instance.itemFactors.write.format("parquet").save(itemPath)
+ }
+ }
+
+ private class ALSModelReader extends MLReader[ALSModel] {
+
+ /** Checked against metadata when loading model */
+ private val className = classOf[ALSModel].getName
+
+ override def load(path: String): ALSModel = {
+ val metadata = DefaultParamsReader.loadMetadata(path, sc, className)
+ implicit val format = DefaultFormats
+ val rank = (metadata.metadata \ "rank").extract[Int]
+ val userPath = new Path(path, "userFactors").toString
+ val userFactors = sparkSession.read.format("parquet").load(userPath)
+ val itemPath = new Path(path, "itemFactors").toString
+ val itemFactors = sparkSession.read.format("parquet").load(itemPath)
+
+ val model = new ALSModel(metadata.uid, rank, userFactors, itemFactors)
+
+ metadata.getAndSetParams(model)
+ model
+ }
+ }
+}
+
+/**
+ * Alternating Least Squares (ALS) matrix factorization.
+ *
+ * ALS attempts to estimate the ratings matrix `R` as the product of two lower-rank matrices,
+ * `X` and `Y`, i.e. `X * Yt = R`. Typically these approximations are called 'factor' matrices.
+ * The general approach is iterative. During each iteration, one of the factor matrices is held
+ * constant, while the other is solved for using least squares. The newly-solved factor matrix is
+ * then held constant while solving for the other factor matrix.
+ *
+ * This is a blocked implementation of the ALS factorization algorithm that groups the two sets
+ * of factors (referred to as "users" and "products") into blocks and reduces communication by only
+ * sending one copy of each user vector to each product block on each iteration, and only for the
+ * product blocks that need that user's feature vector. This is achieved by pre-computing some
+ * information about the ratings matrix to determine the "out-links" of each user (which blocks of
+ * products it will contribute to) and "in-link" information for each product (which of the feature
+ * vectors it receives from each user block it will depend on). This allows us to send only an
+ * array of feature vectors between each user block and product block, and have the product block
+ * find the users' ratings and update the products based on these messages.
+ *
+ * For implicit preference data, the algorithm used is based on
+ * "Collaborative Filtering for Implicit Feedback Datasets", available at
+ * https://doi.org/10.1109/ICDM.2008.22, adapted for the blocked approach used here.
+ *
+ * Essentially instead of finding the low-rank approximations to the rating matrix `R`,
+ * this finds the approximations for a preference matrix `P` where the elements of `P` are 1 if
+ * r is greater than 0 and 0 if r is less than or equal to 0. The ratings then act as 'confidence'
+ * values related to strength of indicated user
+ * preferences rather than explicit ratings given to items.
+ *
+ * Note: the input rating dataset to the ALS implementation should be deterministic.
+ * Nondeterministic data can cause failures when fitting the ALS model.
+ * For example, an order-sensitive operation like sampling after a repartition makes dataset
+ * output nondeterministic, like `dataset.repartition(2).sample(false, 0.5, 1618)`.
+ * Checkpointing the sampled dataset or adding a sort before sampling can help make the dataset
+ * deterministic.
+ */
+@Since("1.3.0")
+class ALS(@Since("1.4.0") override val uid: String) extends Estimator[ALSModel] with ALSParams
+ with DefaultParamsWritable {
+
+ import org.apache.spark.ml.recommendation.ALS.Rating
+
+ @Since("1.4.0")
+ def this() = this(Identifiable.randomUID("als"))
+
+ /** @group setParam */
+ @Since("1.3.0")
+ def setRank(value: Int): this.type = set(rank, value)
+
+ /** @group setParam */
+ @Since("1.3.0")
+ def setNumUserBlocks(value: Int): this.type = set(numUserBlocks, value)
+
+ /** @group setParam */
+ @Since("1.3.0")
+ def setNumItemBlocks(value: Int): this.type = set(numItemBlocks, value)
+
+ /** @group setParam */
+ @Since("1.3.0")
+ def setImplicitPrefs(value: Boolean): this.type = set(implicitPrefs, value)
+
+ /** @group setParam */
+ @Since("1.3.0")
+ def setAlpha(value: Double): this.type = set(alpha, value)
+
+ /** @group setParam */
+ @Since("1.3.0")
+ def setUserCol(value: String): this.type = set(userCol, value)
+
+ /** @group setParam */
+ @Since("1.3.0")
+ def setItemCol(value: String): this.type = set(itemCol, value)
+
+ /** @group setParam */
+ @Since("1.3.0")
+ def setRatingCol(value: String): this.type = set(ratingCol, value)
+
+ /** @group setParam */
+ @Since("1.3.0")
+ def setPredictionCol(value: String): this.type = set(predictionCol, value)
+
+ /** @group setParam */
+ @Since("1.3.0")
+ def setMaxIter(value: Int): this.type = set(maxIter, value)
+
+ /** @group setParam */
+ @Since("1.3.0")
+ def setRegParam(value: Double): this.type = set(regParam, value)
+
+ /** @group setParam */
+ @Since("1.3.0")
+ def setNonnegative(value: Boolean): this.type = set(nonnegative, value)
+
+ /** @group setParam */
+ @Since("1.4.0")
+ def setCheckpointInterval(value: Int): this.type = set(checkpointInterval, value)
+
+ /** @group setParam */
+ @Since("1.3.0")
+ def setSeed(value: Long): this.type = set(seed, value)
+
+ /** @group expertSetParam */
+ @Since("2.0.0")
+ def setIntermediateStorageLevel(value: String): this.type = set(intermediateStorageLevel, value)
+
+ /** @group expertSetParam */
+ @Since("2.0.0")
+ def setFinalStorageLevel(value: String): this.type = set(finalStorageLevel, value)
+
+ /** @group expertSetParam */
+ @Since("2.2.0")
+ def setColdStartStrategy(value: String): this.type = set(coldStartStrategy, value)
+
+ /**
+ * Set block size for stacking input data in matrices.
+ * Default is 4096.
+ *
+ * @group expertSetParam
+ */
+ @Since("3.0.0")
+ def setBlockSize(value: Int): this.type = set(blockSize, value)
+
+ /**
+   * Sets both numUserBlocks and numItemBlocks to the specified value.
+ *
+ * @group setParam
+ */
+ @Since("1.3.0")
+ def setNumBlocks(value: Int): this.type = {
+ setNumUserBlocks(value)
+ setNumItemBlocks(value)
+ this
+ }
+
+ @Since("2.0.0")
+ override def fit(dataset: Dataset[_]): ALSModel = instrumented { instr =>
+ transformSchema(dataset.schema)
+ import dataset.sparkSession.implicits._
+
+ val r = if ($(ratingCol) != "") col($(ratingCol)).cast(FloatType) else lit(1.0f)
+ val ratings = dataset
+ .select(checkedCast(col($(userCol))), checkedCast(col($(itemCol))), r)
+ .rdd
+ .map { row =>
+ Rating(row.getInt(0), row.getInt(1), row.getFloat(2))
+ }
+
+ instr.logPipelineStage(this)
+ instr.logDataset(dataset)
+ instr.logParams(this, rank, numUserBlocks, numItemBlocks, implicitPrefs, alpha, userCol,
+ itemCol, ratingCol, predictionCol, maxIter, regParam, nonnegative, checkpointInterval,
+ seed, intermediateStorageLevel, finalStorageLevel, blockSize)
+
+ val (userFactors, itemFactors) = ALS.train(ratings, rank = $(rank),
+ numUserBlocks = $(numUserBlocks), numItemBlocks = $(numItemBlocks),
+ maxIter = $(maxIter), regParam = $(regParam), implicitPrefs = $(implicitPrefs),
+ alpha = $(alpha), nonnegative = $(nonnegative),
+ intermediateRDDStorageLevel = StorageLevel.fromString($(intermediateStorageLevel)),
+ finalRDDStorageLevel = StorageLevel.fromString($(finalStorageLevel)),
+ checkpointInterval = $(checkpointInterval), seed = $(seed))
+ val userDF = userFactors.toDF("id", "features")
+ val itemDF = itemFactors.toDF("id", "features")
+ val model = new ALSModel(uid, $(rank), userDF, itemDF).setBlockSize($(blockSize))
+ .setParent(this)
+ copyValues(model)
+ }
+
+ @Since("1.3.0")
+ override def transformSchema(schema: StructType): StructType = {
+ validateAndTransformSchema(schema)
+ }
+
+ @Since("1.5.0")
+ override def copy(extra: ParamMap): ALS = defaultCopy(extra)
+}
+
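+/*
+ * Usage sketch (hypothetical `ratings` DataFrame with the default "user", "item" and "rating"
+ * columns):
+ *
+ * {{{
+ *   val als = new ALS()
+ *     .setRank(10)
+ *     .setMaxIter(10)
+ *     .setRegParam(0.1)
+ *   val model = als.fit(ratings)
+ *   val top5 = model.recommendForAllUsers(5)
+ * }}}
+ */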
+
+/**
+ * An implementation of ALS that supports generic ID types, specialized for Int and Long. This is
+ * exposed as a developer API for users who do need other ID types. But it is not recommended
+ * because it increases the shuffle size and memory requirement during training. For simplicity,
+ * users and items must have the same type. The number of distinct users/items should be smaller
+ * than 2 billion.
+ */
+object ALS extends DefaultParamsReadable[ALS] with Logging {
+
+ /**
+ * Rating class for better code readability.
+ */
+ case class Rating[@specialized(Int, Long) ID](user: ID, item: ID, rating: Float)
+
+ @Since("1.6.0")
+ override def load(path: String): ALS = super.load(path)
+
+ /** Trait for least squares solvers applied to the normal equation. */
+ private[recommendation] trait LeastSquaresNESolver extends Serializable {
+ /** Solves a least squares problem with regularization (possibly with other constraints). */
+ def solve(ne: NormalEquation, lambda: Double): Array[Float]
+ }
+
+ /** Cholesky solver for least square problems. */
+ private[recommendation] class CholeskySolver extends LeastSquaresNESolver {
+
+ /**
+ * Solves a least squares problem with L2 regularization:
+ *
+ * min norm(A x - b)^2^ + lambda * norm(x)^2^
+ *
+ * @param ne a [[NormalEquation]] instance that contains AtA, Atb, and n (number of instances)
+ * @param lambda regularization constant
+ * @return the solution x
+ */
+ override def solve(ne: NormalEquation, lambda: Double): Array[Float] = {
+ val k = ne.k
+ // Add scaled lambda to the diagonals of AtA.
+ var i = 0
+ var j = 2
+ while (i < ne.triK) {
+ ne.ata(i) += lambda
+ i += j
+ j += 1
+ }
+ CholeskyDecomposition.solve(ne.ata, ne.atb)
+ val x = new Array[Float](k)
+ i = 0
+ while (i < k) {
+ x(i) = ne.atb(i).toFloat
+ i += 1
+ }
+ ne.reset()
+ x
+ }
+ }
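+
+  /*
+   * Note on the regularization loop in CholeskySolver.solve: `ata` stores the upper triangle
+   * of AtA in packed column-major order, so the diagonal entries sit at offsets 0, 2, 5, 9, ...
+   * (the gap grows by one per column), which is exactly what the `i += j; j += 1` walk visits.
+   *
+   * {{{
+   *   var i = 0; var j = 2
+   *   Iterator.continually { val v = i; i += j; j += 1; v }.take(4).toList   // List(0, 2, 5, 9)
+   * }}}
+   */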
+
+ /** NNLS solver. */
+ private[recommendation] class NNLSSolver extends LeastSquaresNESolver {
+ private var rank: Int = -1
+ private var workspace: NNLS.Workspace = _
+ private var ata: Array[Double] = _
+ private var initialized: Boolean = false
+
+ private def initialize(rank: Int): Unit = {
+ if (!initialized) {
+ this.rank = rank
+ workspace = NNLS.createWorkspace(rank)
+ ata = new Array[Double](rank * rank)
+ initialized = true
+ } else {
+ require(this.rank == rank)
+ }
+ }
+
+ /**
+ * Solves a nonnegative least squares problem with L2 regularization:
+ *
+ * min_x_ norm(A x - b)^2^ + lambda * n * norm(x)^2^
+ * subject to x >= 0
+ */
+ override def solve(ne: NormalEquation, lambda: Double): Array[Float] = {
+ val rank = ne.k
+ initialize(rank)
+ fillAtA(ne.ata, lambda)
+ val x = NNLS.solve(ata, ne.atb, workspace)
+ ne.reset()
+ x.map(x => x.toFloat)
+ }
+
+ /**
+     * Given the packed upper-triangular representation of AtA (as stored in [[NormalEquation]]),
+     * expands it into the full symmetric square matrix `ata` and adds lambda to the diagonal.
+ */
+ private def fillAtA(triAtA: Array[Double], lambda: Double): Unit = {
+ var i = 0
+ var pos = 0
+ var a = 0.0
+ while (i < rank) {
+ var j = 0
+ while (j <= i) {
+ a = triAtA(pos)
+ ata(i * rank + j) = a
+ ata(j * rank + i) = a
+ pos += 1
+ j += 1
+ }
+ ata(i * rank + i) += lambda
+ i += 1
+ }
+ }
+ }
+
+ /**
+ * Representing a normal equation to solve the following weighted least squares problem:
+ *
+ * minimize \sum,,i,, c,,i,, (a,,i,,^T^ x - d,,i,,)^2^ + lambda * x^T^ x.
+ *
+ * Its normal equation is given by
+ *
+ * \sum,,i,, c,,i,, (a,,i,, a,,i,,^T^ x - d,,i,, a,,i,,) + lambda * x = 0.
+ *
+ * Distributing and letting b,,i,, = c,,i,, * d,,i,,
+ *
+ * \sum,,i,, c,,i,, a,,i,, a,,i,,^T^ x - b,,i,, a,,i,, + lambda * x = 0.
+ */
+ private[recommendation] class NormalEquation(val k: Int) extends Serializable {
+
+ /** Number of entries in the upper triangular part of a k-by-k matrix. */
+ val triK = k * (k + 1) / 2
+ /** A^T^ * A */
+ val ata = new Array[Double](triK)
+ /** A^T^ * b */
+ val atb = new Array[Double](k)
+
+ private val da = new Array[Double](k)
+ private val upper = "U"
+
+ private def copyToDouble(a: Array[Float]): Unit = {
+ var i = 0
+ while (i < k) {
+ da(i) = a(i)
+ i += 1
+ }
+ }
+
+ /** Adds an observation. */
+ def add(a: Array[Float], b: Double, c: Double = 1.0): NormalEquation = {
+ require(c >= 0.0)
+ require(a.length == k)
+ copyToDouble(a)
+ blas.dspr(upper, k, c, da, 1, ata)
+ if (b != 0.0) {
+ blas.daxpy(k, b, da, 1, atb, 1)
+ }
+ this
+ }
+
+ /** Merges another normal equation object. */
+ def merge(other: NormalEquation): NormalEquation = {
+ require(other.k == k)
+ blas.daxpy(ata.length, 1.0, other.ata, 1, ata, 1)
+ blas.daxpy(atb.length, 1.0, other.atb, 1, atb, 1)
+ this
+ }
+
+ /** Resets everything to zero, which should be called after each solve. */
+ def reset(): Unit = {
+ ju.Arrays.fill(ata, 0.0)
+ ju.Arrays.fill(atb, 0.0)
+ }
+ }
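+
+  /*
+   * Minimal sketch (made-up observations) of how the solvers consume a NormalEquation:
+   *
+   * {{{
+   *   val ne = new NormalEquation(2)
+   *   ne.add(Array(1.0f, 0.0f), 1.0)
+   *   ne.add(Array(0.0f, 1.0f), 2.0)
+   *   val x = new CholeskySolver().solve(ne, lambda = 0.1)   // approximately Array(0.91f, 1.82f)
+   * }}}
+   */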
+
+ def train[ID: ClassTag]( // scalastyle:ignore
+ ratings: RDD[Rating[ID]],
+ rank: Int = 10,
+ numUserBlocks: Int = 10,
+ numItemBlocks: Int = 10,
+ maxIter: Int = 10,
+ regParam: Double = 0.1,
+ implicitPrefs: Boolean = false,
+ alpha: Double = 1.0,
+ nonnegative: Boolean = false,
+ intermediateRDDStorageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK,
+ finalRDDStorageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK,
+ checkpointInterval: Int = 10,
+ seed: Long = 0L)(
+ implicit ord: Ordering[ID]): (RDD[(ID, Array[Float])], RDD[(ID, Array[Float])]) = {
+
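+    // OAP MLlib dispatch: when the cluster platform passes the compatibility check, the
+    // implicit-feedback path is delegated to the DAL-backed ALSDALImpl; otherwise (and for
+    // explicit feedback) the vanilla blocked implementation (trainMLlib) below is used.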
+ val isPlatformSupported = DALImplUtils.checkClusterPlatformCompatibility(ratings.sparkContext)
+
+ val (userIdAndFactors, itemIdAndFactors) =
+ if (implicitPrefs && isPlatformSupported) {
+ new ALSDALImpl(ratings, rank, maxIter, regParam, alpha, seed).run()
+ } else {
+ trainMLlib(ratings, rank, numUserBlocks, numItemBlocks, maxIter, regParam, implicitPrefs,
+ alpha, nonnegative, intermediateRDDStorageLevel, finalRDDStorageLevel,
+ checkpointInterval, seed)
+ }
+
+ (userIdAndFactors, itemIdAndFactors)
+ }
+
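+  // Note: trainDAL is currently an unused placeholder that returns null; the DAL path in
+  // train() constructs ALSDALImpl directly.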
+ private def trainDAL[ID: ClassTag](
+ ratings: RDD[Rating[ID]],
+ rank: Int,
+ maxIter: Int,
+ regParam: Double,
+ alpha: Double,
+ seed: Long): (RDD[(ID, Array[Float])], RDD[(ID, Array[Float])]) = {
+ null
+ }
+
+ /**
+ * Implementation of the ALS algorithm.
+ *
+ * This implementation of the ALS factorization algorithm partitions the two sets of factors among
+ * Spark workers so as to reduce network communication by only sending one copy of each factor
+ * vector to each Spark worker on each iteration, and only if needed. This is achieved by
+ * precomputing some information about the ratings matrix to determine which users require which
+ * item factors and vice versa. See the Scaladoc for `InBlock` for a detailed explanation of how
+ * the precomputation is done.
+ *
+ * In addition, since each iteration of calculating the factor matrices depends on the known
+ * ratings, which are spread across Spark partitions, a naive implementation would incur
+ * significant network communication overhead between Spark workers, as the ratings RDD would be
+ * repeatedly shuffled during each iteration. This implementation reduces that overhead by
+ * performing the shuffling operation up front, precomputing each partition's ratings dependencies
+ * and duplicating those values to the appropriate workers before starting iterations to solve for
+ * the factor matrices. See the Scaladoc for `OutBlock` for a detailed explanation of how the
+ * precomputation is done.
+ *
+ * Note that the term "rating block" is a bit of a misnomer, as the ratings are not partitioned by
+ * contiguous blocks from the ratings matrix but by a hash function on the rating's location in
+ * the matrix. If it helps you to visualize the partitions, it is easier to think of the term
+ * "block" as referring to a subset of an RDD containing the ratings rather than a contiguous
+ * submatrix of the ratings matrix.
+ */
+ private def trainMLlib[ID: ClassTag]( // scalastyle:ignore
+ ratings: RDD[Rating[ID]],
+ rank: Int = 10,
+ numUserBlocks: Int = 10,
+ numItemBlocks: Int = 10,
+ maxIter: Int = 10,
+ regParam: Double = 0.1,
+ implicitPrefs: Boolean = false,
+ alpha: Double = 1.0,
+ nonnegative: Boolean = false,
+ intermediateRDDStorageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK,
+ finalRDDStorageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK,
+ checkpointInterval: Int = 10,
+ seed: Long = 0L)(
+ implicit ord: Ordering[ID]): (RDD[(ID, Array[Float])], RDD[(ID, Array[Float])]) = {
+
+ require(!ratings.isEmpty(), s"No ratings available from $ratings")
+ require(intermediateRDDStorageLevel != StorageLevel.NONE,
+ "ALS is not designed to run without persisting intermediate RDDs.")
+
+ val sc = ratings.sparkContext
+
+ // Precompute the rating dependencies of each partition
+ val userPart = new ALSPartitioner(numUserBlocks)
+ val itemPart = new ALSPartitioner(numItemBlocks)
+ val blockRatings = partitionRatings(ratings, userPart, itemPart)
+ .persist(intermediateRDDStorageLevel)
+ val (userInBlocks, userOutBlocks) =
+ makeBlocks("user", blockRatings, userPart, itemPart, intermediateRDDStorageLevel)
+ userOutBlocks.count() // materialize blockRatings and user blocks
+ val swappedBlockRatings = blockRatings.map {
+ case ((userBlockId, itemBlockId), RatingBlock(userIds, itemIds, localRatings)) =>
+ ((itemBlockId, userBlockId), RatingBlock(itemIds, userIds, localRatings))
+ }
+ val (itemInBlocks, itemOutBlocks) =
+ makeBlocks("item", swappedBlockRatings, itemPart, userPart, intermediateRDDStorageLevel)
+ itemOutBlocks.count() // materialize item blocks
+
+ // Encoders for storing each user/item's partition ID and index within its partition using a
+ // single integer; used as an optimization
+ val userLocalIndexEncoder = new LocalIndexEncoder(userPart.numPartitions)
+ val itemLocalIndexEncoder = new LocalIndexEncoder(itemPart.numPartitions)
+
+ // These are the user and item factor matrices that, once trained, are multiplied together to
+ // estimate the rating matrix. The two matrices are stored in RDDs, partitioned by column such
+ // that each factor column resides on the same Spark worker as its corresponding user or item.
+ val seedGen = new XORShiftRandom(seed)
+ var userFactors = initialize(userInBlocks, rank, seedGen.nextLong())
+ var itemFactors = initialize(itemInBlocks, rank, seedGen.nextLong())
+
+ val solver = if (nonnegative) new NNLSSolver else new CholeskySolver
+
+ var previousCheckpointFile: Option[String] = None
+ val shouldCheckpoint: Int => Boolean = (iter) =>
+ sc.checkpointDir.isDefined && checkpointInterval != -1 && (iter % checkpointInterval == 0)
+ val deletePreviousCheckpointFile: () => Unit = () =>
+ previousCheckpointFile.foreach { file =>
+ try {
+ val checkpointFile = new Path(file)
+ checkpointFile.getFileSystem(sc.hadoopConfiguration).delete(checkpointFile, true)
+ } catch {
+ case e: IOException =>
+ logWarning(s"Cannot delete checkpoint file $file:", e)
+ }
+ }
+
+ if (implicitPrefs) {
+ for (iter <- 1 to maxIter) {
+ userFactors.setName(s"userFactors-$iter").persist(intermediateRDDStorageLevel)
+ val previousItemFactors = itemFactors
+ itemFactors = computeFactors(userFactors, userOutBlocks, itemInBlocks, rank, regParam,
+ userLocalIndexEncoder, implicitPrefs, alpha, solver)
+ previousItemFactors.unpersist()
+ itemFactors.setName(s"itemFactors-$iter").persist(intermediateRDDStorageLevel)
+ // TODO: Generalize PeriodicGraphCheckpointer and use it here.
+ val deps = itemFactors.dependencies
+ if (shouldCheckpoint(iter)) {
+ itemFactors.checkpoint() // itemFactors gets materialized in computeFactors
+ }
+ val previousUserFactors = userFactors
+ userFactors = computeFactors(itemFactors, itemOutBlocks, userInBlocks, rank, regParam,
+ itemLocalIndexEncoder, implicitPrefs, alpha, solver)
+ if (shouldCheckpoint(iter)) {
+ ALS.cleanShuffleDependencies(sc, deps)
+ deletePreviousCheckpointFile()
+ previousCheckpointFile = itemFactors.getCheckpointFile
+ }
+ previousUserFactors.unpersist()
+ }
+ } else {
+ var previousCachedItemFactors: Option[RDD[(Int, FactorBlock)]] = None
+ for (iter <- 0 until maxIter) {
+ itemFactors = computeFactors(userFactors, userOutBlocks, itemInBlocks, rank, regParam,
+ userLocalIndexEncoder, solver = solver)
+ if (shouldCheckpoint(iter)) {
+ itemFactors.setName(s"itemFactors-$iter").persist(intermediateRDDStorageLevel)
+ val deps = itemFactors.dependencies
+ itemFactors.checkpoint()
+ itemFactors.count() // checkpoint item factors and cut lineage
+ ALS.cleanShuffleDependencies(sc, deps)
+ deletePreviousCheckpointFile()
+
+ previousCachedItemFactors.foreach(_.unpersist())
+ previousCheckpointFile = itemFactors.getCheckpointFile
+ previousCachedItemFactors = Option(itemFactors)
+ }
+ userFactors = computeFactors(itemFactors, itemOutBlocks, userInBlocks, rank, regParam,
+ itemLocalIndexEncoder, solver = solver)
+ }
+ }
+ val userIdAndFactors = userInBlocks
+ .mapValues(_.srcIds)
+ .join(userFactors)
+ .mapPartitions({ items =>
+ items.flatMap { case (_, (ids, factors)) =>
+ ids.view.zip(factors)
+ }
+ // Preserve the partitioning because IDs are consistent with the partitioners in userInBlocks
+ // and userFactors.
+ }, preservesPartitioning = true)
+ .setName("userFactors")
+ .persist(finalRDDStorageLevel)
+ val itemIdAndFactors = itemInBlocks
+ .mapValues(_.srcIds)
+ .join(itemFactors)
+ .mapPartitions({ items =>
+ items.flatMap { case (_, (ids, factors)) =>
+ ids.view.zip(factors)
+ }
+ }, preservesPartitioning = true)
+ .setName("itemFactors")
+ .persist(finalRDDStorageLevel)
+ if (finalRDDStorageLevel != StorageLevel.NONE) {
+ userIdAndFactors.count()
+ userInBlocks.unpersist()
+ userOutBlocks.unpersist()
+ itemOutBlocks.unpersist()
+ blockRatings.unpersist()
+ itemIdAndFactors.count()
+ itemFactors.unpersist()
+ itemInBlocks.unpersist()
+ }
+ (userIdAndFactors, itemIdAndFactors)
+ }
+
+ /**
+ * Factor block that stores factors (Array[Float]) in an Array.
+ */
+ private type FactorBlock = Array[Array[Float]]
+
+ /**
+ * A mapping of the columns of the items factor matrix that are needed when calculating each row
+ * of the users factor matrix, and vice versa.
+ *
+ * Specifically, when calculating a user factor vector, since only those columns of the items
+ * factor matrix that correspond to the items that that user has rated are needed, we can avoid
+ * having to repeatedly copy the entire items factor matrix to each worker later in the algorithm
+ * by precomputing these dependencies for all users, storing them in an RDD of `OutBlock`s. The
+   * items' dependencies on the columns of the users factor matrix are computed similarly.
+ *
+ * =Example=
+ *
+ * Using the example provided in the `InBlock` Scaladoc, `userOutBlocks` would look like the
+ * following:
+ *
+ * {{{
+ * userOutBlocks.collect() == Seq(
+ * 0 -> Array(Array(0, 1), Array(0, 1)),
+ * 1 -> Array(Array(0), Array(0))
+ * )
+ * }}}
+ *
+ * Each value in this map-like sequence is of type `Array[Array[Int]]`. The values in the
+ * inner array are the ranks of the sorted user IDs in that partition; so in the example above,
+ * `Array(0, 1)` in partition 0 refers to user IDs 0 and 6, since when all unique user IDs in
+ * partition 0 are sorted, 0 is the first ID and 6 is the second. The position of each inner
+ * array in its enclosing outer array denotes the partition number to which item IDs map; in the
+ * example, the first `Array(0, 1)` is in position 0 of its outer array, denoting item IDs that
+ * map to partition 0.
+ *
+ * In summary, the data structure encodes the following information:
+ *
+ * * There are ratings with user IDs 0 and 6 (encoded in `Array(0, 1)`, where 0 and 1 are the
+ * indices of the user IDs 0 and 6 on partition 0) whose item IDs map to partitions 0 and 1
+ * (represented by the fact that `Array(0, 1)` appears in both the 0th and 1st positions).
+ *
+ * * There are ratings with user ID 3 (encoded in `Array(0)`, where 0 is the index of the user
+ * ID 3 on partition 1) whose item IDs map to partitions 0 and 1 (represented by the fact that
+ * `Array(0)` appears in both the 0th and 1st positions).
+ */
+ private type OutBlock = Array[Array[Int]]
+
+ /**
+ * In-link block for computing user and item factor matrices.
+ *
+ * The ALS algorithm partitions the columns of the users factor matrix evenly among Spark workers.
+ * Since each column of the factor matrix is calculated using the known ratings of the correspond-
+ * ing user, and since the ratings don't change across iterations, the ALS algorithm preshuffles
+ * the ratings to the appropriate partitions, storing them in `InBlock` objects.
+ *
+ * The ratings shuffled by item ID are computed similarly and also stored in `InBlock` objects.
+ * Note that this means every rating is stored twice, once as shuffled by user ID and once by item
+ * ID. This is a necessary tradeoff, since in general a rating will not be on the same worker
+ * when partitioned by user as by item.
+ *
+ * =Example=
+ *
+ * Say we have a small collection of eight items to offer the seven users in our application. We
+ * have some known ratings given by the users, as seen in the matrix below:
+ *
+ * {{{
+ * Items
+ * 0 1 2 3 4 5 6 7
+ * +---+---+---+---+---+---+---+---+
+ * 0 | |0.1| | |0.4| | |0.7|
+ * +---+---+---+---+---+---+---+---+
+ * 1 | | | | | | | | |
+ * +---+---+---+---+---+---+---+---+
+ * U 2 | | | | | | | | |
+ * s +---+---+---+---+---+---+---+---+
+ * e 3 | |3.1| | |3.4| | |3.7|
+ * r +---+---+---+---+---+---+---+---+
+ * s 4 | | | | | | | | |
+ * +---+---+---+---+---+---+---+---+
+ * 5 | | | | | | | | |
+ * +---+---+---+---+---+---+---+---+
+ * 6 | |6.1| | |6.4| | |6.7|
+ * +---+---+---+---+---+---+---+---+
+ * }}}
+ *
+ * The ratings are represented as an RDD, passed to the `partitionRatings` method as the `ratings`
+ * parameter:
+ *
+ * {{{
+ * ratings.collect() == Seq(
+ * Rating(0, 1, 0.1f),
+ * Rating(0, 4, 0.4f),
+ * Rating(0, 7, 0.7f),
+ * Rating(3, 1, 3.1f),
+ * Rating(3, 4, 3.4f),
+ * Rating(3, 7, 3.7f),
+ * Rating(6, 1, 6.1f),
+ * Rating(6, 4, 6.4f),
+ * Rating(6, 7, 6.7f)
+ * )
+ * }}}
+ *
+ * Say that we are using two partitions to calculate each factor matrix:
+ *
+ * {{{
+ * val userPart = new ALSPartitioner(2)
+ * val itemPart = new ALSPartitioner(2)
+ * val blockRatings = partitionRatings(ratings, userPart, itemPart)
+ * }}}
+ *
+ * Ratings are mapped to partitions using the user/item IDs modulo the number of partitions. With
+ * two partitions, ratings with even-valued user IDs are shuffled to partition 0 while those with
+ * odd-valued user IDs are shuffled to partition 1:
+ *
+ * {{{
+ * userInBlocks.collect() == Seq(
+ * 0 -> Seq(
+ * // Internally, the class stores the ratings in a more optimized format than
+ * // a sequence of `Rating`s, but for clarity we show it as such here.
+ * Rating(0, 1, 0.1f),
+ * Rating(0, 4, 0.4f),
+ * Rating(0, 7, 0.7f),
+ * Rating(6, 1, 6.1f),
+ * Rating(6, 4, 6.4f),
+ * Rating(6, 7, 6.7f)
+ * ),
+ * 1 -> Seq(
+ * Rating(3, 1, 3.1f),
+ * Rating(3, 4, 3.4f),
+ * Rating(3, 7, 3.7f)
+ * )
+ * )
+ * }}}
+ *
+ * Similarly, ratings with even-valued item IDs are shuffled to partition 0 while those with
+ * odd-valued item IDs are shuffled to partition 1:
+ *
+ * {{{
+ * itemInBlocks.collect() == Seq(
+ * 0 -> Seq(
+ * Rating(0, 4, 0.4f),
+ * Rating(3, 4, 3.4f),
+ * Rating(6, 4, 6.4f)
+ * ),
+ * 1 -> Seq(
+ * Rating(0, 1, 0.1f),
+ * Rating(0, 7, 0.7f),
+ * Rating(3, 1, 3.1f),
+ * Rating(3, 7, 3.7f),
+ * Rating(6, 1, 6.1f),
+ * Rating(6, 7, 6.7f)
+ * )
+ * )
+ * }}}
+ *
+ * @param srcIds src ids (ordered)
+ * @param dstPtrs dst pointers. Elements in range [dstPtrs(i), dstPtrs(i+1)) of dst indices and
+ * ratings are associated with srcIds(i).
+ * @param dstEncodedIndices encoded dst indices
+ * @param ratings ratings
+ * @see [[LocalIndexEncoder]]
+ */
+ private[recommendation] case class InBlock[@specialized(Int, Long) ID: ClassTag](
+ srcIds: Array[ID],
+ dstPtrs: Array[Int],
+ dstEncodedIndices: Array[Int],
+ ratings: Array[Float]) {
+ /** Size of the block. */
+ def size: Int = ratings.length
+ require(dstEncodedIndices.length == size)
+ require(dstPtrs.length == srcIds.length + 1)
+ }
+
+ /**
+ * Initializes factors randomly given the in-link blocks.
+ *
+ * @param inBlocks in-link blocks
+ * @param rank rank
+ * @return initialized factor blocks
+ */
+ private def initialize[ID](
+ inBlocks: RDD[(Int, InBlock[ID])],
+ rank: Int,
+ seed: Long): RDD[(Int, FactorBlock)] = {
+ // Choose a unit vector uniformly at random from the unit sphere, but from the
+ // "first quadrant" where all elements are nonnegative. This can be done by choosing
+ // elements distributed as Normal(0,1) and taking the absolute value, and then normalizing.
+ // This appears to create factorizations that have a slightly better reconstruction
+    // (<1%) compared to picking elements uniformly at random in [0,1].
+ inBlocks.mapPartitions({ iter =>
+ iter.map {
+ case (srcBlockId, inBlock) =>
+ val random = new XORShiftRandom(byteswap64(seed ^ srcBlockId))
+ val factors = Array.fill(inBlock.srcIds.length) {
+ val factor = Array.fill(rank)(random.nextGaussian().toFloat)
+ val nrm = blas.snrm2(rank, factor, 1)
+ blas.sscal(rank, 1.0f / nrm, factor, 1)
+ factor
+ }
+ (srcBlockId, factors)
+ }
+ }, preservesPartitioning = true)
+ }
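+
+  /*
+   * Standalone sketch of the per-row initialization above (toy rank of 4): draw Gaussian
+   * entries and rescale to unit L2 norm.
+   *
+   * {{{
+   *   val random = new XORShiftRandom(42L)
+   *   val factor = Array.fill(4)(random.nextGaussian().toFloat)
+   *   val nrm = blas.snrm2(4, factor, 1)
+   *   blas.sscal(4, 1.0f / nrm, factor, 1)   // factor now has unit norm
+   * }}}
+   */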
+
+ /**
+ * A rating block that contains src IDs, dst IDs, and ratings, stored in primitive arrays.
+ */
+ private[recommendation] case class RatingBlock[@specialized(Int, Long) ID: ClassTag](
+ srcIds: Array[ID],
+ dstIds: Array[ID],
+ ratings: Array[Float]) {
+ /** Size of the block. */
+ def size: Int = srcIds.length
+ require(dstIds.length == srcIds.length)
+ require(ratings.length == srcIds.length)
+ }
+
+ /**
+ * Builder for [[RatingBlock]]. `mutable.ArrayBuilder` is used to avoid boxing/unboxing.
+ */
+ private[recommendation] class RatingBlockBuilder[@specialized(Int, Long) ID: ClassTag]
+ extends Serializable {
+
+ private val srcIds = mutable.ArrayBuilder.make[ID]
+ private val dstIds = mutable.ArrayBuilder.make[ID]
+ private val ratings = mutable.ArrayBuilder.make[Float]
+ var size = 0
+
+ /** Adds a rating. */
+ def add(r: Rating[ID]): this.type = {
+ size += 1
+ srcIds += r.user
+ dstIds += r.item
+ ratings += r.rating
+ this
+ }
+
+ /** Merges another [[RatingBlockBuilder]]. */
+ def merge(other: RatingBlock[ID]): this.type = {
+ size += other.srcIds.length
+ srcIds ++= other.srcIds
+ dstIds ++= other.dstIds
+ ratings ++= other.ratings
+ this
+ }
+
+ /** Builds a [[RatingBlock]]. */
+ def build(): RatingBlock[ID] = {
+ RatingBlock[ID](srcIds.result(), dstIds.result(), ratings.result())
+ }
+ }
+
+ /**
+ * Groups an RDD of [[Rating]]s by the user partition and item partition to which each `Rating`
+ * maps according to the given partitioners. The returned pair RDD holds the ratings, encoded in
+ * a memory-efficient format but otherwise unchanged, keyed by the (user partition ID, item
+ * partition ID) pair.
+ *
+ * Performance note: This is an expensive operation that performs an RDD shuffle.
+ *
+ * Implementation note: This implementation produces the same result as the following but
+ * generates fewer intermediate objects:
+ *
+ * {{{
+ * ratings.map { r =>
+ * ((srcPart.getPartition(r.user), dstPart.getPartition(r.item)), r)
+ * }.aggregateByKey(new RatingBlockBuilder)(
+ * seqOp = (b, r) => b.add(r),
+ * combOp = (b0, b1) => b0.merge(b1.build()))
+ * .mapValues(_.build())
+ * }}}
+ *
+ * @param ratings raw ratings
+ * @param srcPart partitioner for src IDs
+ * @param dstPart partitioner for dst IDs
+ * @return an RDD of rating blocks in the form of ((srcBlockId, dstBlockId), ratingBlock)
+ */
+ private def partitionRatings[ID: ClassTag](
+ ratings: RDD[Rating[ID]],
+ srcPart: Partitioner,
+ dstPart: Partitioner): RDD[((Int, Int), RatingBlock[ID])] = {
+ val numPartitions = srcPart.numPartitions * dstPart.numPartitions
+ ratings.mapPartitions { iter =>
+ val builders = Array.fill(numPartitions)(new RatingBlockBuilder[ID])
+ iter.flatMap { r =>
+ val srcBlockId = srcPart.getPartition(r.user)
+ val dstBlockId = dstPart.getPartition(r.item)
+ val idx = srcBlockId + srcPart.numPartitions * dstBlockId
+ val builder = builders(idx)
+ builder.add(r)
+ if (builder.size >= 2048) { // 2048 * (3 * 4) = 24k
+ builders(idx) = new RatingBlockBuilder
+ Iterator.single(((srcBlockId, dstBlockId), builder.build()))
+ } else {
+ Iterator.empty
+ }
+ } ++ {
+ builders.view.zipWithIndex.filter(_._1.size > 0).map { case (block, idx) =>
+ val srcBlockId = idx % srcPart.numPartitions
+ val dstBlockId = idx / srcPart.numPartitions
+ ((srcBlockId, dstBlockId), block.build())
+ }
+ }
+ }.groupByKey().mapValues { blocks =>
+ val builder = new RatingBlockBuilder[ID]
+ blocks.foreach(builder.merge)
+ builder.build()
+ }.setName("ratingBlocks")
+ }
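+
+  /*
+   * Worked example of the block keying above, assuming the modulo-based ALSPartitioner with two
+   * user partitions and two item partitions: Rating(3, 4, 3.4f) maps to srcBlockId = 3 % 2 = 1
+   * and dstBlockId = 4 % 2 = 0, so it is appended to builders(1 + 2 * 0) = builders(1) and is
+   * finally emitted under the key (1, 0).
+   */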
+
+ /**
+ * Builder for uncompressed in-blocks of (srcId, dstEncodedIndex, rating) tuples.
+ *
+ * @param encoder encoder for dst indices
+ */
+ private[recommendation] class UncompressedInBlockBuilder[@specialized(Int, Long) ID: ClassTag](
+ encoder: LocalIndexEncoder)(
+ implicit ord: Ordering[ID]) {
+
+ private val srcIds = mutable.ArrayBuilder.make[ID]
+ private val dstEncodedIndices = mutable.ArrayBuilder.make[Int]
+ private val ratings = mutable.ArrayBuilder.make[Float]
+
+ /**
+ * Adds a dst block of (srcId, dstLocalIndex, rating) tuples.
+ *
+ * @param dstBlockId dst block ID
+ * @param srcIds original src IDs
+ * @param dstLocalIndices dst local indices
+ * @param ratings ratings
+ */
+ def add(
+ dstBlockId: Int,
+ srcIds: Array[ID],
+ dstLocalIndices: Array[Int],
+ ratings: Array[Float]): this.type = {
+ val sz = srcIds.length
+ require(dstLocalIndices.length == sz)
+ require(ratings.length == sz)
+ this.srcIds ++= srcIds
+ this.ratings ++= ratings
+ var j = 0
+ while (j < sz) {
+ this.dstEncodedIndices += encoder.encode(dstBlockId, dstLocalIndices(j))
+ j += 1
+ }
+ this
+ }
+
+ /** Builds a [[UncompressedInBlock]]. */
+ def build(): UncompressedInBlock[ID] = {
+ new UncompressedInBlock(srcIds.result(), dstEncodedIndices.result(), ratings.result())
+ }
+ }
+
+ /**
+ * A block of (srcId, dstEncodedIndex, rating) tuples stored in primitive arrays.
+ */
+ private[recommendation] class UncompressedInBlock[@specialized(Int, Long) ID: ClassTag](
+ val srcIds: Array[ID],
+ val dstEncodedIndices: Array[Int],
+ val ratings: Array[Float])(
+ implicit ord: Ordering[ID]) {
+
+    /** Size of the block. */
+ def length: Int = srcIds.length
+
+ /**
+ * Compresses the block into an `InBlock`. The algorithm is the same as converting a sparse
+ * matrix from coordinate list (COO) format into compressed sparse column (CSC) format.
+ * Sorting is done using Spark's built-in Timsort to avoid generating too many objects.
+ */
+ def compress(): InBlock[ID] = {
+ val sz = length
+ assert(sz > 0, "Empty in-link block should not exist.")
+ sort()
+ val uniqueSrcIdsBuilder = mutable.ArrayBuilder.make[ID]
+ val dstCountsBuilder = mutable.ArrayBuilder.make[Int]
+ var preSrcId = srcIds(0)
+ uniqueSrcIdsBuilder += preSrcId
+ var curCount = 1
+ var i = 1
+ while (i < sz) {
+ val srcId = srcIds(i)
+ if (srcId != preSrcId) {
+ uniqueSrcIdsBuilder += srcId
+ dstCountsBuilder += curCount
+ preSrcId = srcId
+ curCount = 0
+ }
+ curCount += 1
+ i += 1
+ }
+ dstCountsBuilder += curCount
+ val uniqueSrcIds = uniqueSrcIdsBuilder.result()
+      val numUniqueSrcIds = uniqueSrcIds.length
+      val dstCounts = dstCountsBuilder.result()
+      val dstPtrs = new Array[Int](numUniqueSrcIds + 1)
+      var sum = 0
+      i = 0
+      while (i < numUniqueSrcIds) {
+ sum += dstCounts(i)
+ i += 1
+ dstPtrs(i) = sum
+ }
+ InBlock(uniqueSrcIds, dstPtrs, dstEncodedIndices, ratings)
+ }
+
+ private def sort(): Unit = {
+ val sz = length
+ // Since there might be interleaved log messages, we insert a unique id for easy pairing.
+ val sortId = Utils.random.nextInt()
+ logDebug(s"Start sorting an uncompressed in-block of size $sz. (sortId = $sortId)")
+ val start = System.nanoTime()
+ val sorter = new Sorter(new UncompressedInBlockSort[ID])
+ sorter.sort(this, 0, length, Ordering[KeyWrapper[ID]])
+ val duration = (System.nanoTime() - start) / 1e9
+ logDebug(s"Sorting took $duration seconds. (sortId = $sortId)")
+ }
+ }
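+
+  /*
+   * Worked example of compress() above, reusing the block-0 users from the InBlock Scaladoc:
+   * a sorted block with srcIds = [0, 0, 6] yields uniqueSrcIds = [0, 6] and dstPtrs = [0, 2, 3],
+   * i.e. rows 0..1 belong to src id 0 and row 2 to src id 6 -- the usual CSC-style pointer array.
+   */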
+
+ /**
+ * A wrapper that holds a primitive key.
+ *
+ * @see [[UncompressedInBlockSort]]
+ */
+ private class KeyWrapper[@specialized(Int, Long) ID: ClassTag](
+ implicit ord: Ordering[ID]) extends Ordered[KeyWrapper[ID]] {
+
+ var key: ID = _
+
+ override def compare(that: KeyWrapper[ID]): Int = {
+ ord.compare(key, that.key)
+ }
+
+ def setKey(key: ID): this.type = {
+ this.key = key
+ this
+ }
+ }
+
+ /**
+ * [[SortDataFormat]] of [[UncompressedInBlock]] used by [[Sorter]].
+ */
+ private class UncompressedInBlockSort[@specialized(Int, Long) ID: ClassTag](
+ implicit ord: Ordering[ID])
+ extends SortDataFormat[KeyWrapper[ID], UncompressedInBlock[ID]] {
+
+ override def newKey(): KeyWrapper[ID] = new KeyWrapper()
+
+ override def getKey(
+ data: UncompressedInBlock[ID],
+ pos: Int,
+ reuse: KeyWrapper[ID]): KeyWrapper[ID] = {
+ if (reuse == null) {
+ new KeyWrapper().setKey(data.srcIds(pos))
+ } else {
+ reuse.setKey(data.srcIds(pos))
+ }
+ }
+
+ override def getKey(
+ data: UncompressedInBlock[ID],
+ pos: Int): KeyWrapper[ID] = {
+ getKey(data, pos, null)
+ }
+
+ private def swapElements[@specialized(Int, Float) T](
+ data: Array[T],
+ pos0: Int,
+ pos1: Int): Unit = {
+ val tmp = data(pos0)
+ data(pos0) = data(pos1)
+ data(pos1) = tmp
+ }
+
+ override def swap(data: UncompressedInBlock[ID], pos0: Int, pos1: Int): Unit = {
+ swapElements(data.srcIds, pos0, pos1)
+ swapElements(data.dstEncodedIndices, pos0, pos1)
+ swapElements(data.ratings, pos0, pos1)
+ }
+
+ override def copyRange(
+ src: UncompressedInBlock[ID],
+ srcPos: Int,
+ dst: UncompressedInBlock[ID],
+ dstPos: Int,
+ length: Int): Unit = {
+ System.arraycopy(src.srcIds, srcPos, dst.srcIds, dstPos, length)
+ System.arraycopy(src.dstEncodedIndices, srcPos, dst.dstEncodedIndices, dstPos, length)
+ System.arraycopy(src.ratings, srcPos, dst.ratings, dstPos, length)
+ }
+
+ override def allocate(length: Int): UncompressedInBlock[ID] = {
+ new UncompressedInBlock(
+ new Array[ID](length), new Array[Int](length), new Array[Float](length))
+ }
+
+ override def copyElement(
+ src: UncompressedInBlock[ID],
+ srcPos: Int,
+ dst: UncompressedInBlock[ID],
+ dstPos: Int): Unit = {
+ dst.srcIds(dstPos) = src.srcIds(srcPos)
+ dst.dstEncodedIndices(dstPos) = src.dstEncodedIndices(srcPos)
+ dst.ratings(dstPos) = src.ratings(srcPos)
+ }
+ }
+
+ /**
+ * Creates in-blocks and out-blocks from rating blocks.
+ *
+ * @param prefix prefix for in/out-block names
+ * @param ratingBlocks rating blocks
+ * @param srcPart partitioner for src IDs
+ * @param dstPart partitioner for dst IDs
+ * @return (in-blocks, out-blocks)
+ */
+ private def makeBlocks[ID: ClassTag](
+ prefix: String,
+ ratingBlocks: RDD[((Int, Int), RatingBlock[ID])],
+ srcPart: Partitioner,
+ dstPart: Partitioner,
+ storageLevel: StorageLevel)(
+ implicit srcOrd: Ordering[ID]): (RDD[(Int, InBlock[ID])], RDD[(Int, OutBlock)]) = {
+ val inBlocks = ratingBlocks.map {
+ case ((srcBlockId, dstBlockId), RatingBlock(srcIds, dstIds, ratings)) =>
+ // The implementation is a faster version of
+ // val dstIdToLocalIndex = dstIds.toSet.toSeq.sorted.zipWithIndex.toMap
+ val start = System.nanoTime()
+ val dstIdSet = new OpenHashSet[ID](1 << 20)
+ dstIds.foreach(dstIdSet.add)
+ val sortedDstIds = new Array[ID](dstIdSet.size)
+ var i = 0
+ var pos = dstIdSet.nextPos(0)
+ while (pos != -1) {
+ sortedDstIds(i) = dstIdSet.getValue(pos)
+ pos = dstIdSet.nextPos(pos + 1)
+ i += 1
+ }
+ assert(i == dstIdSet.size)
+ Sorting.quickSort(sortedDstIds)
+ val dstIdToLocalIndex = new OpenHashMap[ID, Int](sortedDstIds.length)
+ i = 0
+ while (i < sortedDstIds.length) {
+ dstIdToLocalIndex.update(sortedDstIds(i), i)
+ i += 1
+ }
+ logDebug(
+ "Converting to local indices took " + (System.nanoTime() - start) / 1e9 + " seconds.")
+ val dstLocalIndices = dstIds.map(dstIdToLocalIndex.apply)
+ (srcBlockId, (dstBlockId, srcIds, dstLocalIndices, ratings))
+ }.groupByKey(new ALSPartitioner(srcPart.numPartitions))
+ .mapValues { iter =>
+ val builder =
+ new UncompressedInBlockBuilder[ID](new LocalIndexEncoder(dstPart.numPartitions))
+ iter.foreach { case (dstBlockId, srcIds, dstLocalIndices, ratings) =>
+ builder.add(dstBlockId, srcIds, dstLocalIndices, ratings)
+ }
+ builder.build().compress()
+ }.setName(prefix + "InBlocks")
+ .persist(storageLevel)
+ val outBlocks = inBlocks.mapValues { case InBlock(srcIds, dstPtrs, dstEncodedIndices, _) =>
+ val encoder = new LocalIndexEncoder(dstPart.numPartitions)
+ val activeIds = Array.fill(dstPart.numPartitions)(mutable.ArrayBuilder.make[Int])
+ var i = 0
+ val seen = new Array[Boolean](dstPart.numPartitions)
+ while (i < srcIds.length) {
+ var j = dstPtrs(i)
+ ju.Arrays.fill(seen, false)
+ while (j < dstPtrs(i + 1)) {
+ val dstBlockId = encoder.blockId(dstEncodedIndices(j))
+ if (!seen(dstBlockId)) {
+ activeIds(dstBlockId) += i // add the local index in this out-block
+ seen(dstBlockId) = true
+ }
+ j += 1
+ }
+ i += 1
+ }
+ activeIds.map { x =>
+ x.result()
+ }
+ }.setName(prefix + "OutBlocks")
+ .persist(storageLevel)
+ (inBlocks, outBlocks)
+ }
+
+ /**
+ * Compute dst factors by constructing and solving least square problems.
+ *
+ * @param srcFactorBlocks src factors
+ * @param srcOutBlocks src out-blocks
+ * @param dstInBlocks dst in-blocks
+ * @param rank rank
+ * @param regParam regularization constant
+ * @param srcEncoder encoder for src local indices
+ * @param implicitPrefs whether to use implicit preference
+ * @param alpha the alpha constant in the implicit preference formulation
+ * @param solver solver for least squares problems
+ * @return dst factors
+ */
+ private def computeFactors[ID](
+ srcFactorBlocks: RDD[(Int, FactorBlock)],
+ srcOutBlocks: RDD[(Int, OutBlock)],
+ dstInBlocks: RDD[(Int, InBlock[ID])],
+ rank: Int,
+ regParam: Double,
+ srcEncoder: LocalIndexEncoder,
+ implicitPrefs: Boolean = false,
+ alpha: Double = 1.0,
+ solver: LeastSquaresNESolver): RDD[(Int, FactorBlock)] = {
+ val numSrcBlocks = srcFactorBlocks.partitions.length
+ val YtY = if (implicitPrefs) Some(computeYtY(srcFactorBlocks, rank)) else None
+ val srcOut = srcOutBlocks.join(srcFactorBlocks).flatMap {
+ case (srcBlockId, (srcOutBlock, srcFactors)) =>
+ srcOutBlock.view.zipWithIndex.map { case (activeIndices, dstBlockId) =>
+ (dstBlockId, (srcBlockId, activeIndices.map(idx => srcFactors(idx))))
+ }
+ }
+ val merged = srcOut.groupByKey(new ALSPartitioner(dstInBlocks.partitions.length))
+
+    // SPARK-28927: Nondeterministic RDDs cause inconsistent in/out blocks in case of a rerun,
+    // which can lead to runtime errors when matching in/out user/item blocks.
+ val isBlockRDDNondeterministic =
+ dstInBlocks.outputDeterministicLevel == DeterministicLevel.INDETERMINATE ||
+ srcOutBlocks.outputDeterministicLevel == DeterministicLevel.INDETERMINATE
+
+ dstInBlocks.join(merged).mapValues {
+ case (InBlock(dstIds, srcPtrs, srcEncodedIndices, ratings), srcFactors) =>
+ val sortedSrcFactors = new Array[FactorBlock](numSrcBlocks)
+ srcFactors.foreach { case (srcBlockId, factors) =>
+ sortedSrcFactors(srcBlockId) = factors
+ }
+ val dstFactors = new Array[Array[Float]](dstIds.length)
+ var j = 0
+ val ls = new NormalEquation(rank)
+ while (j < dstIds.length) {
+ ls.reset()
+ if (implicitPrefs) {
+ ls.merge(YtY.get)
+ }
+ var i = srcPtrs(j)
+ var numExplicits = 0
+ while (i < srcPtrs(j + 1)) {
+ val encoded = srcEncodedIndices(i)
+ val blockId = srcEncoder.blockId(encoded)
+ val localIndex = srcEncoder.localIndex(encoded)
+ var srcFactor: Array[Float] = null
+ try {
+ srcFactor = sortedSrcFactors(blockId)(localIndex)
+ } catch {
+ case a: ArrayIndexOutOfBoundsException if isBlockRDDNondeterministic =>
+ val errMsg = "A failure detected when matching In/Out blocks of users/items. " +
+ "Because at least one In/Out block RDD is found to be nondeterministic now, " +
+ "the issue is probably caused by nondeterministic input data. You can try to " +
+ "checkpoint training data to make it deterministic. If you do `repartition` + " +
+ "`sample` or `randomSplit`, you can also try to sort it before `sample` or " +
+ "`randomSplit` to make it deterministic."
+ throw new SparkException(errMsg, a)
+ }
+ val rating = ratings(i)
+ if (implicitPrefs) {
+ // Extension to the original paper to handle rating < 0. confidence is a function
+ // of |rating| instead so that it is never negative. c1 is confidence - 1.
+ val c1 = alpha * math.abs(rating)
+ // For rating <= 0, the corresponding preference is 0. So the second argument of add
+ // is only there for rating > 0.
+ if (rating > 0.0) {
+ numExplicits += 1
+ }
+ ls.add(srcFactor, if (rating > 0.0) 1.0 + c1 else 0.0, c1)
+ } else {
+ ls.add(srcFactor, rating)
+ numExplicits += 1
+ }
+ i += 1
+ }
+ // Weight lambda by the number of explicit ratings based on the ALS-WR paper.
+ dstFactors(j) = solver.solve(ls, numExplicits * regParam)
+ j += 1
+ }
+ dstFactors
+ }
+ }
+
+ /**
+ * Computes the Gramian matrix of user or item factors, which is only used in implicit preference.
+ * Caching of the input factors is handled in [[ALS#train]].
+ */
+ private def computeYtY(factorBlocks: RDD[(Int, FactorBlock)], rank: Int): NormalEquation = {
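+    // The zero right-hand side means only ata accumulates, so the aggregate yields Y^T Y
+    // packed into NormalEquation.ata.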
+ factorBlocks.values.aggregate(new NormalEquation(rank))(
+ seqOp = (ne, factors) => {
+ factors.foreach(ne.add(_, 0.0))
+ ne
+ },
+ combOp = (ne1, ne2) => ne1.merge(ne2))
+ }
+
+ /**
+ * Encoder for storing (blockId, localIndex) into a single integer.
+ *
+ * We use the leading bits (including the sign bit) to store the block id and the rest to store
+ * the local index. This is based on the assumption that users/items are approximately evenly
+ * partitioned. With this assumption, we should be able to encode two billion distinct values.
+ *
+ * @param numBlocks number of blocks
+ */
+ private[recommendation] class LocalIndexEncoder(numBlocks: Int) extends Serializable {
+
+ require(numBlocks > 0, s"numBlocks must be positive but found $numBlocks.")
+
+ private[this] final val numLocalIndexBits =
+ math.min(java.lang.Integer.numberOfLeadingZeros(numBlocks - 1), 31)
+ private[this] final val localIndexMask = (1 << numLocalIndexBits) - 1
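+    // Example: numBlocks = 10 gives numLocalIndexBits = 28 and localIndexMask = 0x0FFFFFFF,
+    // so encode(3, 5) == (3 << 28) | 5, and blockId/localIndex recover 3 and 5 respectively.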
+
+ /** Encodes a (blockId, localIndex) into a single integer. */
+ def encode(blockId: Int, localIndex: Int): Int = {
+ require(blockId < numBlocks)
+ require((localIndex & ~localIndexMask) == 0)
+ (blockId << numLocalIndexBits) | localIndex
+ }
+
+ /** Gets the block id from an encoded index. */
+ @inline
+ def blockId(encoded: Int): Int = {
+ encoded >>> numLocalIndexBits
+ }
+
+ /** Gets the local index from an encoded index. */
+ @inline
+ def localIndex(encoded: Int): Int = {
+ encoded & localIndexMask
+ }
+ }
+
+ /**
+ * Partitioner used by ALS. We require that getPartition is a projection. That is, for any key k,
+ * we have getPartition(getPartition(k)) = getPartition(k). Since the default HashPartitioner
+ * satisfies this requirement, we simply use a type alias here.
+ */
+ private[recommendation] type ALSPartitioner = org.apache.spark.HashPartitioner
+
+ /**
+ * Private function to clean up all of the shuffles files from the dependencies and their parents.
+ */
+ private[spark] def cleanShuffleDependencies[T](
+ sc: SparkContext,
+ deps: Seq[Dependency[_]],
+ blocking: Boolean = false): Unit = {
+ // If there is no reference tracking we skip clean up.
+ sc.cleaner.foreach { cleaner =>
+ /**
+       * Clean the shuffles & all of their parents.
+ */
+ def cleanEagerly(dep: Dependency[_]): Unit = {
+ if (dep.isInstanceOf[ShuffleDependency[_, _, _]]) {
+ val shuffleId = dep.asInstanceOf[ShuffleDependency[_, _, _]].shuffleId
+ cleaner.doCleanupShuffle(shuffleId, blocking)
+ }
+ val rdd = dep.rdd
+ val rddDeps = rdd.dependencies
+ if (rdd.getStorageLevel == StorageLevel.NONE && rddDeps != null) {
+ rddDeps.foreach(cleanEagerly)
+ }
+ }
+ deps.foreach(cleanEagerly)
+ }
+ }
+}
diff --git a/mllib-dal/src/main/scala/org/apache/spark/ml/recommendation/ALSDALImpl.scala b/mllib-dal/src/main/scala/org/apache/spark/ml/recommendation/ALSDALImpl.scala
new file mode 100644
index 000000000..bcb95ca1f
--- /dev/null
+++ b/mllib-dal/src/main/scala/org/apache/spark/ml/recommendation/ALSDALImpl.scala
@@ -0,0 +1,387 @@
+package org.apache.spark.ml.recommendation
+
+import com.intel.daal.data_management.data.CSRNumericTable.Indexing
+import org.apache.spark.rdd.{ExecutorInProcessCoalescePartitioner, RDD}
+
+import scala.reflect.ClassTag
+import com.intel.daal.data_management.data.{CSRNumericTable, HomogenNumericTable, RowMergedNumericTable, Matrix => DALMatrix}
+import com.intel.daal.services.DaalContext
+import org.apache.spark.Partitioner
+import org.apache.spark.internal.Logging
+import org.apache.spark.ml.recommendation.ALS.Rating
+import org.apache.spark.ml.util._
+
+import java.nio.{ByteBuffer, ByteOrder}
+import scala.collection.mutable.ArrayBuffer
+//import java.nio.DoubleBuffer
+import java.nio.FloatBuffer
+
+class ALSDataPartitioner(blocks: Int, itemsInBlock: Long)
+ extends Partitioner {
+ def numPartitions: Int = blocks
+ def getPartition(key: Any): Int = {
+ val k = key.asInstanceOf[Long]
+    // itemsInBlock = numItems / partitions
+    // remaining records will belong to the last partition
+    // 21 => 5, 5, 5, 6
+    // 46 => 11, 11, 11, 13
+    math.min((k / itemsInBlock).toInt, blocks - 1)
+ }
+}
+
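+// Implicit ALS backed by oneDAL and oneCCL: ratings are transposed, shuffled across ranks
+// (cShuffleData) and converted to per-rank CSR tables, the native cDALImplictALS kernel is
+// invoked on every rank, and the resulting user/item factor tables are exposed as
+// RDD[(ID, Array[Float])].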
+class ALSDALImpl[@specialized(Int, Long) ID: ClassTag](
+ data: RDD[Rating[ID]],
+ nFactors: Int,
+ maxIter: Int,
+ regParam: Double,
+ alpha: Double,
+    seed: Long
+) extends Serializable with Logging {
+
+ // Rating struct size is size of Long+Long+Float
+ val RATING_SIZE = 8 + 8 + 4
+
+ // Return Map partitionId -> (ratingsNum, csrRowNum, rowOffset)
+ private def getRatingsPartitionInfo(data: RDD[Rating[ID]]): Map[Int, (Int, Int, Int)] = {
+    val collected = data.mapPartitionsWithIndex { case (index: Int, it: Iterator[Rating[ID]]) =>
+ var ratingsNum = 0
+ var s = Set[ID]()
+ it.foreach { v =>
+ s += v.user
+ ratingsNum += 1
+ }
+      Iterator((index, (ratingsNum, s.size)))
+ }.collect
+
+ var ret = Map[Int, (Int, Int, Int)]()
+ var rowOffset = 0
+    collected.foreach { v =>
+ val partitionId = v._1
+ val ratingsNum = v._2._1
+ val csrRowNum = v._2._2
+      ret += (partitionId -> (ratingsNum, csrRowNum, rowOffset))
+ rowOffset = rowOffset + csrRowNum
+ }
+
+ ret
+ }
+
+ private def ratingsToCSRNumericTables(ratings: RDD[Rating[ID]],
+ nVectors: Long, nFeatures: Long, nBlocks: Long): RDD[CSRNumericTable] = {
+
+// val rowSortedRatings = ratings.sortBy(_.user.toString.toLong)
+
+// val itemsInBlock = (nFeatures + nBlocks - 1) / nBlocks
+ val itemsInBlock = nFeatures / nBlocks
+// val rowSortedGrouped = rowSortedRatings.groupBy(value => value.user.toString.toLong / itemsInBlock).flatMap(_._2)
+ val rowSortedGrouped = ratings
+ // Transpose the dataset
+ .map { p =>
+ Rating(p.item, p.user, p.rating)
+ }
+ .groupBy(value => value.user.toString.toLong)
+ .partitionBy(new ALSDataPartitioner(nBlocks.toInt, itemsInBlock))
+ .flatMap(_._2).mapPartitions { p =>
+ p.toArray.sortBy(_.user.toString.toLong).toIterator
+ }
+
+ println("rowSortedGrouped partition number: ", rowSortedGrouped.getNumPartitions)
+
+ // rowSortedGrouped.mapPartitionsWithIndex { case (partitionId, partition) =>
+// println("partitionId", partitionId)
+// partition.foreach { p =>
+// println(p.user, p.item, p.rating) }
+// Iterator(partitionId)
+// }.collect()
+
+ val ratingsPartitionInfo = getRatingsPartitionInfo(rowSortedGrouped)
+ println("ratingsPartitionInfo:", ratingsPartitionInfo)
+
+ rowSortedGrouped.mapPartitionsWithIndex { case (partitionId, partition) =>
+ val ratingsNum = ratingsPartitionInfo(partitionId)._1
+ val csrRowNum = ratingsPartitionInfo(partitionId)._2
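+      // Build the three CSR arrays expected by oneDAL, using one-based indexing:
+      // values(k) is the k-th rating, columnIndices(k) its 1-based column, and rowOffsets(r)
+      // the 1-based position of the first entry of row r, with a final entry of nnz + 1.
+      // E.g. two rows with one rating each give rowOffsets = [1, 2, 3].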
+ val values = Array.fill(ratingsNum) { 0.0f }
+ val columnIndices = Array.fill(ratingsNum) { 0L }
+ val rowOffsets = ArrayBuffer[Long](1L)
+
+
+ var index = 0
+ var curRow = 0L
+ // Each partition converted to one CSRNumericTable
+ partition.foreach { p =>
+ // Modify row index for each partition (start from 0)
+ val row = p.user.toString.toLong - ratingsPartitionInfo(partitionId)._3
+ val column = p.item.toString.toLong
+ val rating = p.rating
+
+ values(index) = rating
+ // one-based index
+ columnIndices(index) = column + 1
+
+ if (row > curRow) {
+ curRow = row
+ // one-based index
+ rowOffsets += index + 1
+ }
+
+ index = index + 1
+ }
+ // one-based row index
+ rowOffsets += index+1
+
+ println("PartitionId:", partitionId)
+ println("csrRowNum", csrRowNum)
+// println("rowOffsets", rowOffsets.mkString(","))
+// println("columnIndices", columnIndices.mkString(","))
+// println("values", values.mkString(","))
+
+ val contextLocal = new DaalContext()
+
+ println("ALSDALImpl: Loading native libraries ..." )
+ LibLoader.loadLibraries()
+
+ val cTable = OneDAL.cNewCSRNumericTable(values, columnIndices, rowOffsets.toArray, nVectors, csrRowNum)
+ val table = new CSRNumericTable(contextLocal, cTable)
+// table.pack()
+
+ println("Input dimensions:", table.getNumberOfRows, table.getNumberOfColumns)
+
+ // There is a bug https://github.com/oneapi-src/oneDAL/pull/1288,
+ // printNumericTable can't print correct result for CSRNumericTable, use C++ printNumericTable
+ // Service.printNumericTable("Input: ", table)
+
+ Iterator(table)
+ }.cache()
+ }
+
+// def factorsToRDD(cUsersFactorsNumTab: Long, cItemsFactorsNumTab: Long)
+// :(RDD[(ID, Array[Float])], RDD[(ID, Array[Float])]) = {
+// val usersFactorsNumTab = OneDAL.makeNumericTable(cUsersFactorsNumTab)
+// val itemsFactorsNumTab = OneDAL.makeNumericTable(cItemsFactorsNumTab)
+//
+// Service.printNumericTable("usersFactorsNumTab", usersFactorsNumTab)
+// Service.printNumericTable("itemsFactorsNumTab", itemsFactorsNumTab)
+//
+// null
+// }
+
+ def ratingsToByteBuffer(ratings: Array[Rating[ID]]): ByteBuffer = {
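+    // Each rating is packed little-endian as [user: Long][item: Long][rating: Float], i.e.
+    // RATING_SIZE (20) bytes per record, the same layout decoded in bufferToCSRNumericTable.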
+// println("ratings len", ratings.length)
+
+    val buffer = ByteBuffer.allocateDirect(ratings.length * RATING_SIZE)
+ // Use little endian
+ buffer.order(ByteOrder.LITTLE_ENDIAN)
+ ratings.foreach { rating =>
+ buffer.putLong(rating.user.toString.toLong)
+ buffer.putLong(rating.item.toString.toLong)
+ buffer.putFloat(rating.rating)
+ }
+ buffer
+ }
+
+ def run(): (RDD[(ID, Array[Float])], RDD[(ID, Array[Float])]) = {
+ val executorNum = Utils.sparkExecutorNum(data.sparkContext)
+ val executorCores = Utils.sparkExecutorCores()
+
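+    // nFeatures / nVectors are derived from the largest item / user ids seen (max id + 1),
+    // treating the ids as zero-based indices.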
+ val nFeatures = data.max()(new Ordering[Rating[ID]]() {
+ override def compare(x: Rating[ID], y: Rating[ID]): Int =
+ Ordering[Long].compare(x.item.toString.toLong, y.item.toString.toLong)
+ }).item.toString.toLong + 1
+
+ val nVectors = data.max()(new Ordering[Rating[ID]]() {
+ override def compare(x: Rating[ID], y: Rating[ID]): Int =
+ Ordering[Long].compare(x.user.toString.toLong, y.user.toString.toLong)
+ }).user.toString.toLong + 1
+
+// val largestItems = data.sortBy(_.item.toString.toLong, ascending = false).take(1)
+// val nFeatures = largestItems(0).item.toString.toLong + 1
+
+// val largestUsers = data.sortBy(_.user.toString.toLong, ascending = false).take(1)
+// val nVectors = largestUsers(0).user.toString.toLong + 1
+
+ val nBlocks = executorNum
+
+// val nRatings = data.count()
+
+ logInfo(s"ALSDAL fit using $executorNum Executors for $nVectors vectors and $nFeatures features")
+
+ val numericTables = data.repartition(executorNum).setName("Repartitioned for conversion").cache()
+
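+    // Resolve the oneCCL key-value store endpoint: default to the first executor's IP and a
+    // detected free port, overridable via spark.oap.mllib.oneccl.kvs.ip / .port.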
+ val executorIPAddress = Utils.sparkFirstExecutorIP(numericTables.sparkContext)
+ val kvsIP = numericTables.sparkContext.conf.get("spark.oap.mllib.oneccl.kvs.ip", executorIPAddress)
+
+ val kvsPortDetected = Utils.checkExecutorAvailPort(numericTables, kvsIP)
+ val kvsPort = numericTables.sparkContext.conf.getInt("spark.oap.mllib.oneccl.kvs.port", kvsPortDetected)
+
+ val kvsIPPort = kvsIP+"_"+kvsPort
+
+ val results = numericTables
+ // Transpose the dataset
+ .map { p =>
+ Rating(p.item, p.user, p.rating) }
+ .mapPartitionsWithIndex { (rank, iter) =>
+ val context = new DaalContext()
+ println("ALSDALImpl: Loading libMLlibDAL.so" )
+ LibLoader.loadLibraries()
+
+ OneCCL.init(executorNum, rank, kvsIPPort)
+ val rankId = OneCCL.rankID()
+
+ println("rankId", rankId, "nUsers", nVectors, "nItems", nFeatures)
+
+ val buffer = ratingsToByteBuffer(iter.toArray)
+ val bufferInfo = new ALSPartitionInfo
+ val shuffledBuffer = cShuffleData(buffer, nFeatures.toInt, nBlocks, bufferInfo)
+
+ val table = bufferToCSRNumericTable(shuffledBuffer, bufferInfo, nVectors.toInt, nFeatures.toInt, nBlocks, rankId)
+
+ val result = new ALSResult()
+ cDALImplictALS(
+ table.getCNumericTable, nUsers = nVectors,
+ nFactors, maxIter, regParam, alpha,
+ executorNum,
+ executorCores,
+ rankId,
+ result
+ )
+ Iterator(result)
+ }.cache()
+
+// results.foreach { p =>
+//// val usersFactorsNumTab = OneDAL.makeNumericTable(p.cUsersFactorsNumTab)
+//// println("foreach", p.cUsersFactorsNumTab, p.cItemsFactorsNumTab)
+// println("result", p.rankId, p.cUserOffset, p.cItemOffset);
+// }
+
+// val usersFactorsRDD = results.mapPartitionsWithIndex { (index: Int, partiton: Iterator[ALSResult]) =>
+// partiton.foreach { p =>
+// val usersFactorsNumTab = OneDAL.makeNumericTable(p.cUsersFactorsNumTab)
+// Service.printNumericTable("usersFactorsNumTab", usersFactorsNumTab)
+// }
+// Iterator()
+// }.collect()
+
+    val usersFactorsRDD = results.mapPartitionsWithIndex { (index: Int, partition: Iterator[ALSResult]) =>
+      val ret = partition.flatMap { p =>
+ val userOffset = p.cUserOffset.toInt
+ val usersFactorsNumTab = OneDAL.makeNumericTable(p.cUsersFactorsNumTab)
+ val nRows = usersFactorsNumTab.getNumberOfRows.toInt
+ val nCols = usersFactorsNumTab.getNumberOfColumns.toInt
+ var buffer = FloatBuffer.allocate(nCols * nRows)
+ // should use returned buffer
+ buffer = usersFactorsNumTab.getBlockOfRows(0, nRows, buffer)
+ (0 until nRows).map { index =>
+ val array = Array.fill(nCols){0.0f}
+ buffer.get(array, 0, nCols)
+ ((index+userOffset).asInstanceOf[ID], array)
+ }.toIterator
+ }
+ ret
+ }.setName("userFactors").cache()
+
+    val itemsFactorsRDD = results.mapPartitionsWithIndex { (index: Int, partition: Iterator[ALSResult]) =>
+      val ret = partition.flatMap { p =>
+ val itemOffset = p.cItemOffset.toInt
+ val itemsFactorsNumTab = OneDAL.makeNumericTable(p.cItemsFactorsNumTab)
+ val nRows = itemsFactorsNumTab.getNumberOfRows.toInt
+ val nCols = itemsFactorsNumTab.getNumberOfColumns.toInt
+ var buffer = FloatBuffer.allocate(nCols * nRows)
+ // should use returned buffer
+ buffer = itemsFactorsNumTab.getBlockOfRows(0, nRows, buffer)
+ (0 until nRows).map { index =>
+ val array = Array.fill(nCols){0.0f}
+ buffer.get(array, 0, nCols)
+ ((index+itemOffset).asInstanceOf[ID], array)
+ }.toIterator
+ }
+ ret
+ }.setName("itemFactors").cache()
+
+ usersFactorsRDD.count()
+ itemsFactorsRDD.count()
+
+// usersFactorsRDD.foreach { case (id, array) =>
+// println("usersFactorsRDD", id, array.mkString(", "))
+// }
+//
+// itemsFactorsRDD.foreach { case (id, array) =>
+// println("itemsFactorsRDD", id, array.mkString(", "))
+// }
+
+ (usersFactorsRDD, itemsFactorsRDD)
+ }
+
+ private def getPartitionOffset(partitionId: Int, nRatings: Int, nBlocks: Int): Int = {
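+    // Row offset of this rank's block under the even split used by ALSDataPartitioner: each
+    // block holds nRatings / nBlocks rows, with the remainder folded into the last block.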
+    require(partitionId >= 0 && partitionId < nBlocks)
+    val itemsInBlock = nRatings / nBlocks
+    partitionId * itemsInBlock
+ }
+
+ private def bufferToCSRNumericTable(buffer: ByteBuffer, info: ALSPartitionInfo,
+ nVectors: Int, nFeatures: Int, nBlocks: Int, rankId: Int): CSRNumericTable = {
+ // Use little endian
+ buffer.order(ByteOrder.LITTLE_ENDIAN)
+
+ val ratingsNum = info.ratingsNum
+ val csrRowNum = info.csrRowNum
+ val values = Array.fill(ratingsNum) { 0.0f }
+ val columnIndices = Array.fill(ratingsNum) { 0L }
+ val rowOffsets = ArrayBuffer[Long](1L)
+
+ var index = 0
+ var curRow = 0L
+ // Each partition converted to one CSRNumericTable
+ for (i <- 0 until ratingsNum) {
+ // Modify row index for each partition (start from 0)
+ val row = buffer.getLong(i*RATING_SIZE) - getPartitionOffset(rankId, nFeatures, nBlocks)
+ val column = buffer.getLong(i*RATING_SIZE+8)
+ val rating = buffer.getFloat(i*RATING_SIZE+16)
+
+ values(index) = rating
+ // one-based index
+ columnIndices(index) = column + 1
+
+ if (row > curRow) {
+ curRow = row
+ // one-based index
+ rowOffsets += index + 1
+ }
+
+ index = index + 1
+ }
+ // one-based row index
+ rowOffsets += index+1
+
+// println("rankId:", rankId)
+// println("csrRowNum", csrRowNum)
+
+// println(rowOffsets.mkString(" "))
+// println(columnIndices.mkString(" "))
+// println(values.mkString(" "))
+
+ val contextLocal = new DaalContext()
+ val cTable = OneDAL.cNewCSRNumericTable(values, columnIndices, rowOffsets.toArray, nVectors, csrRowNum)
+ val table = new CSRNumericTable(contextLocal, cTable)
+
+ println("Input dimensions:", table.getNumberOfRows, table.getNumberOfColumns)
+// Service.printNumericTable("Input NumericTable", table)
+
+ table
+ }
+
+  // Single entry point to call the implicit ALS DAL backend
+ @native private def cDALImplictALS(data: Long,
+ nUsers: Long,
+ nFactors: Int,
+ maxIter: Int,
+ regParam: Double,
+ alpha: Double,
+ executor_num: Int,
+ executor_cores: Int,
+ rankId: Int,
+ result: ALSResult): Long
+ @native private def cShuffleData(data: ByteBuffer,
+ nTotalKeys: Int,
+ nBlocks: Int,
+ info: ALSPartitionInfo): ByteBuffer
+}
diff --git a/mllib-dal/src/main/scala/org/apache/spark/ml/util/OneCCL.scala b/mllib-dal/src/main/scala/org/apache/spark/ml/util/OneCCL.scala
index 4c38a9bdc..7581a1003 100644
--- a/mllib-dal/src/main/scala/org/apache/spark/ml/util/OneCCL.scala
+++ b/mllib-dal/src/main/scala/org/apache/spark/ml/util/OneCCL.scala
@@ -17,60 +17,32 @@
package org.apache.spark.ml.util
-import org.apache.spark.SparkConf
+import org.apache.spark.internal.Logging
-object OneCCL {
+object OneCCL extends Logging {
var cclParam = new CCLParam()
- var kvsIPPort = sys.env.getOrElse("CCL_KVS_IP_PORT", "")
- var worldSize = sys.env.getOrElse("CCL_WORLD_SIZE", "1").toInt
-
- var KVS_PORT = 51234
-
- private def checkEnv() {
- val altTransport = sys.env.getOrElse("CCL_ATL_TRANSPORT", "")
- val pmType = sys.env.getOrElse("CCL_PM_TYPE", "")
- val ipExchange = sys.env.getOrElse("CCL_KVS_IP_EXCHANGE", "")
-
- assert(altTransport == "ofi")
- assert(pmType == "resizable")
- assert(ipExchange == "env")
- assert(kvsIPPort != "")
-
- }
-
// Run on Executor
- def setExecutorEnv(executor_num: Int, ip: String, port: Int): Unit = {
- // Work around ccl by passings in a spark.executorEnv.CCL_KVS_IP_PORT.
- val ccl_kvs_ip_port = sys.env.getOrElse("CCL_KVS_IP_PORT", s"${ip}_${port}")
-
- println(s"oneCCL: Initializing with CCL_KVS_IP_PORT: $ccl_kvs_ip_port")
-
- setEnv("CCL_PM_TYPE", "resizable")
+ def setExecutorEnv(): Unit = {
setEnv("CCL_ATL_TRANSPORT","ofi")
- setEnv("CCL_ATL_TRANSPORT_PATH", LibLoader.getTempSubDir())
- setEnv("CCL_KVS_IP_EXCHANGE","env")
- setEnv("CCL_KVS_IP_PORT", ccl_kvs_ip_port)
- setEnv("CCL_WORLD_SIZE", s"${executor_num}")
// Uncomment this if you whant to debug oneCCL
// setEnv("CCL_LOG_LEVEL", "2")
}
- def init(executor_num: Int, ip: String, port: Int) = {
+ def init(executor_num: Int, rank: Int, ip_port: String) = {
+
+ setExecutorEnv()
- setExecutorEnv(executor_num, ip, port)
+ logInfo(s"Initializing with IP_PORT: ${ip_port}")
// cclParam is output from native code
- c_init(cclParam)
+ c_init(executor_num, rank, ip_port, cclParam)
// executor number should equal to oneCCL world size
assert(executor_num == cclParam.commSize, "executor number should equal to oneCCL world size")
- println(s"oneCCL: Initialized with executorNum: $executor_num, commSize, ${cclParam.commSize}, rankId: ${cclParam.rankId}")
-
- KVS_PORT = KVS_PORT + 1
-
+ logInfo(s"Initialized with executorNum: $executor_num, commSize, ${cclParam.commSize}, rankId: ${cclParam.rankId}")
}
// Run on Executor
@@ -78,11 +50,16 @@ object OneCCL {
c_cleanup()
}
- @native private def c_init(param: CCLParam) : Int
+ def getAvailPort(localIP: String): Int = synchronized {
+ c_getAvailPort(localIP)
+ }
+
+ @native private def c_init(size: Int, rank: Int, ip_port: String, param: CCLParam) : Int
@native private def c_cleanup() : Unit
@native def isRoot() : Boolean
@native def rankID() : Int
@native def setEnv(key: String, value: String, overwrite: Boolean = true): Int
-}
+ @native def c_getAvailPort(localIP: String): Int
+}
\ No newline at end of file
diff --git a/mllib-dal/src/main/scala/org/apache/spark/ml/util/OneDAL.scala b/mllib-dal/src/main/scala/org/apache/spark/ml/util/OneDAL.scala
index 2f6c83775..9b6c0f6c7 100644
--- a/mllib-dal/src/main/scala/org/apache/spark/ml/util/OneDAL.scala
+++ b/mllib-dal/src/main/scala/org/apache/spark/ml/util/OneDAL.scala
@@ -149,4 +149,7 @@ object OneDAL {
@native def cFreeDataMemory(numTableAddr: Long)
@native def cCheckPlatformCompatibility() : Boolean
+
+ @native def cNewCSRNumericTable(data: Array[Float], colIndices: Array[Long], rowOffsets: Array[Long], nFeatures: Long,
+ nVectors: Long) : Long
}
diff --git a/mllib-dal/src/main/scala/org/apache/spark/ml/util/Utils.scala b/mllib-dal/src/main/scala/org/apache/spark/ml/util/Utils.scala
index 40a1c6823..aa8eb8979 100644
--- a/mllib-dal/src/main/scala/org/apache/spark/ml/util/Utils.scala
+++ b/mllib-dal/src/main/scala/org/apache/spark/ml/util/Utils.scala
@@ -71,8 +71,22 @@ object Utils {
ip
}
+ def checkExecutorAvailPort(data: RDD[_], localIP: String) : Int = {
+ val sc = data.sparkContext
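+    // Each executor probes for a free port via oneCCL; the first reported port is used for the
+    // key-value store endpoint.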
+ val result = data.mapPartitions { p =>
+ LibLoader.loadLibraries()
+ val port = OneCCL.getAvailPort(localIP)
+ if (port != -1)
+ Iterator(port)
+ else
+ Iterator()
+ }.collect()
+
+    result(0)
+ }
+
def checkClusterPlatformCompatibility(sc: SparkContext) : Boolean = {
- LibLoader.loadLibMLlibDAL()
+ LibLoader.loadLibraries()
// check driver platform compatibility
if (!OneDAL.cCheckPlatformCompatibility())
@@ -82,7 +96,7 @@ object Utils {
val executor_num = Utils.sparkExecutorNum(sc)
val data = sc.parallelize(1 to executor_num, executor_num)
val result = data.map { p =>
- LibLoader.loadLibMLlibDAL()
+ LibLoader.loadLibraries()
OneDAL.cCheckPlatformCompatibility()
}.collect()
diff --git a/mllib-dal/src/test/scala/org/apache/spark/ml/recommendation/IntelALSSuite.scala b/mllib-dal/src/test/scala/org/apache/spark/ml/recommendation/IntelALSSuite.scala
new file mode 100644
index 000000000..339644888
--- /dev/null
+++ b/mllib-dal/src/test/scala/org/apache/spark/ml/recommendation/IntelALSSuite.scala
@@ -0,0 +1,1239 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.recommendation
+
+import java.io.File
+import java.util.Random
+
+import scala.collection.JavaConverters._
+import scala.collection.mutable
+import scala.collection.mutable.{ArrayBuffer, WrappedArray}
+
+import com.github.fommil.netlib.BLAS.{getInstance => blas}
+import org.apache.commons.io.FileUtils
+import org.apache.commons.io.filefilter.TrueFileFilter
+import org.scalatest.BeforeAndAfterEach
+
+import org.apache.spark._
+import org.apache.spark.internal.Logging
+import org.apache.spark.ml.linalg.Vectors
+import org.apache.spark.ml.recommendation.ALS._
+import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest, MLTestingUtils}
+import org.apache.spark.ml.util.TestingUtils._
+import org.apache.spark.mllib.util.MLlibTestSparkContext
+import org.apache.spark.rdd.RDD
+import org.apache.spark.scheduler.{SparkListener, SparkListenerStageCompleted}
+import org.apache.spark.sql.{DataFrame, Encoder, Row, SparkSession}
+import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
+import org.apache.spark.sql.functions.{col, lit}
+import org.apache.spark.sql.streaming.StreamingQueryException
+import org.apache.spark.sql.types._
+import org.apache.spark.storage.StorageLevel
+import org.apache.spark.util.Utils
+
+class IntelALSSuite extends MLTest with DefaultReadWriteTest with Logging {
+
+ override def beforeAll(): Unit = {
+ super.beforeAll()
+ sc.setCheckpointDir(tempDir.getAbsolutePath)
+ }
+
+ override def afterAll(): Unit = {
+ super.afterAll()
+ }
+
+ test("LocalIndexEncoder") {
+ val random = new Random
+ for (numBlocks <- Seq(1, 2, 5, 10, 20, 50, 100)) {
+ val encoder = new LocalIndexEncoder(numBlocks)
+ val maxLocalIndex = Int.MaxValue / numBlocks
+ val tests = Seq.fill(5)((random.nextInt(numBlocks), random.nextInt(maxLocalIndex))) ++
+ Seq((0, 0), (numBlocks - 1, maxLocalIndex))
+ tests.foreach { case (blockId, localIndex) =>
+ val err = s"Failed with numBlocks=$numBlocks, blockId=$blockId, and localIndex=$localIndex."
+ val encoded = encoder.encode(blockId, localIndex)
+ assert(encoder.blockId(encoded) === blockId, err)
+ assert(encoder.localIndex(encoded) === localIndex, err)
+ }
+ }
+ }
+
+ test("normal equation construction") {
+ val k = 2
+ val ne0 = new NormalEquation(k)
+ .add(Array(1.0f, 2.0f), 3.0)
+ .add(Array(4.0f, 5.0f), 12.0, 2.0) // weighted
+ assert(ne0.k === k)
+ assert(ne0.triK === k * (k + 1) / 2)
+ // NumPy code that computes the expected values:
+ // A = np.matrix("1 2; 4 5")
+ // b = np.matrix("3; 6")
+ // C = np.matrix(np.diag([1, 2]))
+ // ata = A.transpose() * C * A
+ // atb = A.transpose() * C * b
+ assert(Vectors.dense(ne0.ata) ~== Vectors.dense(33.0, 42.0, 54.0) relTol 1e-8)
+ assert(Vectors.dense(ne0.atb) ~== Vectors.dense(51.0, 66.0) relTol 1e-8)
+
+ val ne1 = new NormalEquation(2)
+ .add(Array(7.0f, 8.0f), 9.0)
+ ne0.merge(ne1)
+ // NumPy code that computes the expected values:
+ // A = np.matrix("1 2; 4 5; 7 8")
+ // b = np.matrix("3; 6; 9")
+ // C = np.matrix(np.diag([1, 2, 1]))
+ // ata = A.transpose() * C * A
+ // atb = A.transpose() * C * b
+ assert(Vectors.dense(ne0.ata) ~== Vectors.dense(82.0, 98.0, 118.0) relTol 1e-8)
+ assert(Vectors.dense(ne0.atb) ~== Vectors.dense(114.0, 138.0) relTol 1e-8)
+
+ intercept[IllegalArgumentException] {
+ ne0.add(Array(1.0f), 2.0)
+ }
+ intercept[IllegalArgumentException] {
+ ne0.add(Array(1.0f, 2.0f, 3.0f), 4.0)
+ }
+ intercept[IllegalArgumentException] {
+ ne0.add(Array(1.0f, 2.0f), 0.0, -1.0)
+ }
+ intercept[IllegalArgumentException] {
+ val ne2 = new NormalEquation(3)
+ ne0.merge(ne2)
+ }
+
+ ne0.reset()
+ assert(ne0.ata.forall(_ == 0.0))
+ assert(ne0.atb.forall(_ == 0.0))
+ }
+
+ test("CholeskySolver") {
+ val k = 2
+ val ne0 = new NormalEquation(k)
+ .add(Array(1.0f, 2.0f), 4.0)
+ .add(Array(1.0f, 3.0f), 9.0)
+ .add(Array(1.0f, 4.0f), 16.0)
+ val ne1 = new NormalEquation(k)
+ .merge(ne0)
+
+ val chol = new CholeskySolver
+ val x0 = chol.solve(ne0, 0.0).map(_.toDouble)
+ // NumPy code that computes the expected solution:
+ // A = np.matrix("1 2; 1 3; 1 4")
+    // b = np.matrix("4; 9; 16")
+ // x0 = np.linalg.lstsq(A, b)[0]
+ assert(Vectors.dense(x0) ~== Vectors.dense(-8.333333, 6.0) relTol 1e-6)
+
+ assert(ne0.ata.forall(_ == 0.0))
+ assert(ne0.atb.forall(_ == 0.0))
+
+ val x1 = chol.solve(ne1, 1.5).map(_.toDouble)
+ // NumPy code that computes the expected solution, where lambda is scaled by n:
+ // x0 = np.linalg.solve(A.transpose() * A + 1.5 * np.eye(2), A.transpose() * b)
+ assert(Vectors.dense(x1) ~== Vectors.dense(-0.1155556, 3.28) relTol 1e-6)
+ }
+
+ test("RatingBlockBuilder") {
+ val emptyBuilder = new RatingBlockBuilder[Int]()
+ assert(emptyBuilder.size === 0)
+ val emptyBlock = emptyBuilder.build()
+ assert(emptyBlock.srcIds.isEmpty)
+ assert(emptyBlock.dstIds.isEmpty)
+ assert(emptyBlock.ratings.isEmpty)
+
+ val builder0 = new RatingBlockBuilder()
+ .add(Rating(0, 1, 2.0f))
+ .add(Rating(3, 4, 5.0f))
+ assert(builder0.size === 2)
+ val builder1 = new RatingBlockBuilder()
+ .add(Rating(6, 7, 8.0f))
+ .merge(builder0.build())
+ assert(builder1.size === 3)
+ val block = builder1.build()
+ val ratings = Seq.tabulate(block.size) { i =>
+ (block.srcIds(i), block.dstIds(i), block.ratings(i))
+ }.toSet
+ assert(ratings === Set((0, 1, 2.0f), (3, 4, 5.0f), (6, 7, 8.0f)))
+ }
+
+ test("UncompressedInBlock") {
+ val encoder = new LocalIndexEncoder(10)
+ val uncompressed = new UncompressedInBlockBuilder[Int](encoder)
+ .add(0, Array(1, 0, 2), Array(0, 1, 4), Array(1.0f, 2.0f, 3.0f))
+ .add(1, Array(3, 0), Array(2, 5), Array(4.0f, 5.0f))
+ .build()
+ assert(uncompressed.length === 5)
+ val records = Seq.tabulate(uncompressed.length) { i =>
+ val dstEncodedIndex = uncompressed.dstEncodedIndices(i)
+ val dstBlockId = encoder.blockId(dstEncodedIndex)
+ val dstLocalIndex = encoder.localIndex(dstEncodedIndex)
+ (uncompressed.srcIds(i), dstBlockId, dstLocalIndex, uncompressed.ratings(i))
+ }.toSet
+ val expected =
+ Set((1, 0, 0, 1.0f), (0, 0, 1, 2.0f), (2, 0, 4, 3.0f), (3, 1, 2, 4.0f), (0, 1, 5, 5.0f))
+ assert(records === expected)
+
+ val compressed = uncompressed.compress()
+ assert(compressed.size === 5)
+ assert(compressed.srcIds.toSeq === Seq(0, 1, 2, 3))
+ assert(compressed.dstPtrs.toSeq === Seq(0, 2, 3, 4, 5))
+ var decompressed = ArrayBuffer.empty[(Int, Int, Int, Float)]
+ var i = 0
+ while (i < compressed.srcIds.length) {
+ var j = compressed.dstPtrs(i)
+ while (j < compressed.dstPtrs(i + 1)) {
+ val dstEncodedIndex = compressed.dstEncodedIndices(j)
+ val dstBlockId = encoder.blockId(dstEncodedIndex)
+ val dstLocalIndex = encoder.localIndex(dstEncodedIndex)
+ decompressed += ((compressed.srcIds(i), dstBlockId, dstLocalIndex, compressed.ratings(j)))
+ j += 1
+ }
+ i += 1
+ }
+ assert(decompressed.toSet === expected)
+ }
+
+ test("CheckedCast") {
+ val checkedCast = new ALS().checkedCast
+ val df = spark.range(1)
+
+ withClue("Valid Integer Ids") {
+ df.select(checkedCast(lit(123))).collect()
+ }
+
+ withClue("Valid Long Ids") {
+ df.select(checkedCast(lit(1231L))).collect()
+ }
+
+ withClue("Valid Decimal Ids") {
+ df.select(checkedCast(lit(123).cast(DecimalType(15, 2)))).collect()
+ }
+
+ withClue("Valid Double Ids") {
+ df.select(checkedCast(lit(123.0))).collect()
+ }
+
+ val msg = "either out of Integer range or contained a fractional part"
+ withClue("Invalid Long: out of range") {
+ val e: SparkException = intercept[SparkException] {
+ df.select(checkedCast(lit(1231000000000L))).collect()
+ }
+ assert(e.getMessage.contains(msg))
+ }
+
+ withClue("Invalid Decimal: out of range") {
+ val e: SparkException = intercept[SparkException] {
+ df.select(checkedCast(lit(1231000000000.0).cast(DecimalType(15, 2)))).collect()
+ }
+ assert(e.getMessage.contains(msg))
+ }
+
+ withClue("Invalid Decimal: fractional part") {
+ val e: SparkException = intercept[SparkException] {
+ df.select(checkedCast(lit(123.1).cast(DecimalType(15, 2)))).collect()
+ }
+ assert(e.getMessage.contains(msg))
+ }
+
+ withClue("Invalid Double: out of range") {
+ val e: SparkException = intercept[SparkException] {
+ df.select(checkedCast(lit(1231000000000.0))).collect()
+ }
+ assert(e.getMessage.contains(msg))
+ }
+
+ withClue("Invalid Double: fractional part") {
+ val e: SparkException = intercept[SparkException] {
+ df.select(checkedCast(lit(123.1))).collect()
+ }
+ assert(e.getMessage.contains(msg))
+ }
+
+ withClue("Invalid Type") {
+ val e: SparkException = intercept[SparkException] {
+ df.select(checkedCast(lit("123.1"))).collect()
+ }
+ assert(e.getMessage.contains("was not numeric"))
+ }
+ }
+
+ /**
+ * Generates an explicit feedback dataset for testing ALS.
+ * @param numUsers number of users
+ * @param numItems number of items
+ * @param rank rank
+ * @param noiseStd the standard deviation of additive Gaussian noise on training data
+ * @param seed random seed
+ * @return (training, test)
+ */
+ def genExplicitTestData(
+ numUsers: Int,
+ numItems: Int,
+ rank: Int,
+ noiseStd: Double = 0.0,
+ seed: Long = 11L): (RDD[Rating[Int]], RDD[Rating[Int]]) = {
+ val trainingFraction = 0.6
+ val testFraction = 0.3
+ val totalFraction = trainingFraction + testFraction
+ val random = new Random(seed)
+ val userFactors = genFactors(numUsers, rank, random)
+ val itemFactors = genFactors(numItems, rank, random)
+ val training = ArrayBuffer.empty[Rating[Int]]
+ val test = ArrayBuffer.empty[Rating[Int]]
+ for ((userId, userFactor) <- userFactors; (itemId, itemFactor) <- itemFactors) {
+ val x = random.nextDouble()
+ if (x < totalFraction) {
+ val rating = blas.sdot(rank, userFactor, 1, itemFactor, 1)
+ if (x < trainingFraction) {
+ val noise = noiseStd * random.nextGaussian()
+ training += Rating(userId, itemId, rating + noise.toFloat)
+ } else {
+ test += Rating(userId, itemId, rating)
+ }
+ }
+ }
+ logInfo(s"Generated an explicit feedback dataset with ${training.size} ratings for training " +
+ s"and ${test.size} for test.")
+ (sc.parallelize(training, 2), sc.parallelize(test, 2))
+ }
+
+ /**
+ * Generates an implicit feedback dataset for testing ALS.
+ * @param numUsers number of users
+ * @param numItems number of items
+ * @param rank rank
+ * @param noiseStd the standard deviation of additive Gaussian noise on training data
+ * @param seed random seed
+ * @return (training, test)
+ */
+ def genImplicitTestData(
+ numUsers: Int,
+ numItems: Int,
+ rank: Int,
+ noiseStd: Double = 0.0,
+ seed: Long = 11L): (RDD[Rating[Int]], RDD[Rating[Int]]) = {
+ ALSSuite.genImplicitTestData(sc, numUsers, numItems, rank, noiseStd, seed)
+ }
+
+ /**
+ * Generates random user/item factors, with i.i.d. values drawn from U(a, b).
+ * @param size number of users/items
+ * @param rank number of features
+ * @param random random number generator
+ * @param a min value of the support (default: -1)
+ * @param b max value of the support (default: 1)
+ * @return a sequence of (ID, factors) pairs
+ */
+ private def genFactors(
+ size: Int,
+ rank: Int,
+ random: Random,
+ a: Float = -1.0f,
+ b: Float = 1.0f): Seq[(Int, Array[Float])] = {
+ IntelALSSuite.genFactors(size, rank, random, a, b)
+ }
+
+ /**
+ * Train ALS using the given training set and parameters
+ * @param training training dataset
+ * @param rank rank of the matrix factorization
+ * @param maxIter max number of iterations
+ * @param regParam regularization constant
+ * @param implicitPrefs whether to use implicit preference
+ * @param numUserBlocks number of user blocks
+ * @param numItemBlocks number of item blocks
+ * @return a trained ALSModel
+ */
+ def trainALS(
+ training: RDD[Rating[Int]],
+ rank: Int,
+ maxIter: Int,
+ regParam: Double,
+ implicitPrefs: Boolean = false,
+ numUserBlocks: Int = 2,
+ numItemBlocks: Int = 3): ALSModel = {
+ val spark = this.spark
+ import spark.implicits._
+ val als = new ALS()
+ .setRank(rank)
+ .setRegParam(regParam)
+ .setImplicitPrefs(implicitPrefs)
+ .setNumUserBlocks(numUserBlocks)
+ .setNumItemBlocks(numItemBlocks)
+ .setSeed(0)
+ als.fit(training.toDF())
+ }
+
+ /**
+ * Test ALS using the given training/test splits and parameters.
+ * @param training training dataset
+ * @param test test dataset
+ * @param rank rank of the matrix factorization
+ * @param maxIter max number of iterations
+ * @param regParam regularization constant
+ * @param implicitPrefs whether to use implicit preference
+ * @param numUserBlocks number of user blocks
+ * @param numItemBlocks number of item blocks
+ * @param targetRMSE target test RMSE
+ */
+ def testALS(
+ training: RDD[Rating[Int]],
+ test: RDD[Rating[Int]],
+ rank: Int,
+ maxIter: Int,
+ regParam: Double,
+ implicitPrefs: Boolean = false,
+ numUserBlocks: Int = 2,
+ numItemBlocks: Int = 3,
+ targetRMSE: Double = 0.05): Unit = {
+ val spark = this.spark
+ import spark.implicits._
+ val als = new ALS()
+ .setRank(rank)
+ .setRegParam(regParam)
+ .setImplicitPrefs(implicitPrefs)
+ .setNumUserBlocks(numUserBlocks)
+ .setNumItemBlocks(numItemBlocks)
+ .setSeed(0)
+ val alpha = als.getAlpha
+ val model = als.fit(training.toDF())
+ testTransformerByGlobalCheckFunc[Rating[Int]](test.toDF(), model, "rating", "prediction") {
+ case rows: Seq[Row] =>
+ val predictions = rows.map(row => (row.getFloat(0).toDouble, row.getFloat(1).toDouble))
+
+ val rmse =
+ if (implicitPrefs) {
+ // TODO: Use a better (rank-based?) evaluation metric for implicit feedback.
+ // We limit the ratings and the predictions to interval [0, 1] and compute the
+ // weighted RMSE with the confidence scores as weights.
+ val (totalWeight, weightedSumSq) = predictions.map { case (rating, prediction) =>
+ val confidence = 1.0 + alpha * math.abs(rating)
+ val rating01 = math.max(math.min(rating, 1.0), 0.0)
+ val prediction01 = math.max(math.min(prediction, 1.0), 0.0)
+ val err = prediction01 - rating01
+ (confidence, confidence * err * err)
+ }.reduce[(Double, Double)] { case ((c0, e0), (c1, e1)) =>
+ (c0 + c1, e0 + e1)
+ }
+ math.sqrt(weightedSumSq / totalWeight)
+ } else {
+ val errorSquares = predictions.map { case (rating, prediction) =>
+ val err = rating - prediction
+ err * err
+ }
+ val mse = errorSquares.sum / errorSquares.length
+ math.sqrt(mse)
+ }
+ logInfo(s"Test RMSE is $rmse.")
+ assert(rmse < targetRMSE)
+ }
+
+ MLTestingUtils.checkCopyAndUids(als, model)
+ }
+
+ test("exact rank-1 matrix") {
+ val (training, test) = genExplicitTestData(numUsers = 20, numItems = 40, rank = 1)
+ testALS(training, test, maxIter = 1, rank = 1, regParam = 1e-5, targetRMSE = 0.001)
+ testALS(training, test, maxIter = 1, rank = 2, regParam = 1e-5, targetRMSE = 0.001)
+ }
+
+ test("approximate rank-1 matrix") {
+ val (training, test) =
+ genExplicitTestData(numUsers = 20, numItems = 40, rank = 1, noiseStd = 0.01)
+ testALS(training, test, maxIter = 2, rank = 1, regParam = 0.01, targetRMSE = 0.02)
+ testALS(training, test, maxIter = 2, rank = 2, regParam = 0.01, targetRMSE = 0.02)
+ }
+
+ test("approximate rank-2 matrix") {
+ val (training, test) =
+ genExplicitTestData(numUsers = 20, numItems = 40, rank = 2, noiseStd = 0.01)
+ testALS(training, test, maxIter = 4, rank = 2, regParam = 0.01, targetRMSE = 0.03)
+ testALS(training, test, maxIter = 4, rank = 3, regParam = 0.01, targetRMSE = 0.03)
+ }
+
+ test("different block settings") {
+ val (training, test) =
+ genExplicitTestData(numUsers = 20, numItems = 40, rank = 2, noiseStd = 0.01)
+ for ((numUserBlocks, numItemBlocks) <- Seq((1, 1), (1, 2), (2, 1), (2, 2))) {
+ testALS(training, test, maxIter = 4, rank = 3, regParam = 0.01, targetRMSE = 0.03,
+ numUserBlocks = numUserBlocks, numItemBlocks = numItemBlocks)
+ }
+ }
+
+ test("more blocks than ratings") {
+ val (training, test) =
+ genExplicitTestData(numUsers = 4, numItems = 4, rank = 1)
+ testALS(training, test, maxIter = 2, rank = 1, regParam = 1e-4, targetRMSE = 0.002,
+ numItemBlocks = 5, numUserBlocks = 5)
+ }
+
+ test("implicit feedback") {
+ val (training, test) =
+ genImplicitTestData(numUsers = 20, numItems = 40, rank = 2, noiseStd = 0.01)
+ testALS(training, test, maxIter = 4, rank = 2, regParam = 0.01, implicitPrefs = true,
+ targetRMSE = 0.3)
+ }
+
+ test("implicit feedback regression") {
+ val trainingWithNeg = sc.parallelize(Seq(Rating(0, 0, 1), Rating(1, 1, 1), Rating(0, 1, -3)))
+ val trainingWithZero = sc.parallelize(Seq(Rating(0, 0, 1), Rating(1, 1, 1), Rating(0, 1, 0)))
+ val modelWithNeg =
+ trainALS(trainingWithNeg, rank = 1, maxIter = 5, regParam = 0.01, implicitPrefs = true)
+ val modelWithZero =
+ trainALS(trainingWithZero, rank = 1, maxIter = 5, regParam = 0.01, implicitPrefs = true)
+ val userFactorsNeg = modelWithNeg.userFactors
+ val itemFactorsNeg = modelWithNeg.itemFactors
+ val userFactorsZero = modelWithZero.userFactors
+ val itemFactorsZero = modelWithZero.itemFactors
+ assert(userFactorsNeg.intersect(userFactorsZero).count() == 0)
+ assert(itemFactorsNeg.intersect(itemFactorsZero).count() == 0)
+ }
+ test("using generic ID types") {
+ val (ratings, _) = genImplicitTestData(numUsers = 20, numItems = 40, rank = 2, noiseStd = 0.01)
+
+ val longRatings = ratings.map(r => Rating(r.user.toLong, r.item.toLong, r.rating))
+ val (longUserFactors, _) = ALS.train(longRatings, rank = 2, maxIter = 4, seed = 0)
+ assert(longUserFactors.first()._1.getClass === classOf[Long])
+
+ val strRatings = ratings.map(r => Rating(r.user.toString, r.item.toString, r.rating))
+ val (strUserFactors, _) = ALS.train(strRatings, rank = 2, maxIter = 4, seed = 0)
+ assert(strUserFactors.first()._1.getClass === classOf[String])
+ }
+
+ test("nonnegative constraint") {
+ val (ratings, _) = genImplicitTestData(numUsers = 20, numItems = 40, rank = 2, noiseStd = 0.01)
+ val (userFactors, itemFactors) =
+ ALS.train(ratings, rank = 2, maxIter = 4, nonnegative = true, seed = 0)
+ def isNonnegative(factors: RDD[(Int, Array[Float])]): Boolean = {
+ factors.values.map { _.forall(_ >= 0.0) }.reduce(_ && _)
+ }
+ assert(isNonnegative(userFactors))
+ assert(isNonnegative(itemFactors))
+ // TODO: Validate the solution.
+ }
+
+ test("als partitioner is a projection") {
+ for (p <- Seq(1, 10, 100, 1000)) {
+ val part = new ALSPartitioner(p)
+ var k = 0
+ while (k < p) {
+ assert(k === part.getPartition(k))
+ assert(k === part.getPartition(k.toLong))
+ k += 1
+ }
+ }
+ }
+
+ test("partitioner in returned factors") {
+ val (ratings, _) = genImplicitTestData(numUsers = 20, numItems = 40, rank = 2, noiseStd = 0.01)
+ val (userFactors, itemFactors) = ALS.train(
+ ratings, rank = 2, maxIter = 4, numUserBlocks = 3, numItemBlocks = 4, seed = 0)
+ for ((tpe, factors) <- Seq(("User", userFactors), ("Item", itemFactors))) {
+      assert(factors.partitioner.isDefined, s"$tpe factors should have partitioner.")
+      val part = factors.partitioner.get
+      factors.mapPartitionsWithIndex { (idx, items) =>
+ items.foreach { case (id, _) =>
+ if (part.getPartition(id) != idx) {
+ throw new SparkException(s"$tpe with ID $id should not be in partition $idx.")
+ }
+ }
+ Iterator.empty
+ }.count()
+ }
+ }
+
+ test("als with large number of iterations") {
+ val (ratings, _) = genExplicitTestData(numUsers = 4, numItems = 4, rank = 1)
+ ALS.train(ratings, rank = 1, maxIter = 50, numUserBlocks = 2, numItemBlocks = 2, seed = 0)
+ ALS.train(ratings, rank = 1, maxIter = 50, numUserBlocks = 2, numItemBlocks = 2,
+ implicitPrefs = true, seed = 0)
+ }
+
+ test("read/write") {
+ val spark = this.spark
+ import ALSSuite._
+ import spark.implicits._
+ val (ratings, _) = genExplicitTestData(numUsers = 4, numItems = 4, rank = 1)
+
+ def getFactors(df: DataFrame): Set[(Int, Array[Float])] = {
+ df.select("id", "features").collect().map { case r =>
+ (r.getInt(0), r.getAs[Array[Float]](1))
+ }.toSet
+ }
+
+ def checkModelData(model: ALSModel, model2: ALSModel): Unit = {
+ assert(model.rank === model2.rank)
+ assert(getFactors(model.userFactors) === getFactors(model2.userFactors))
+ assert(getFactors(model.itemFactors) === getFactors(model2.itemFactors))
+ }
+
+ val als = new ALS()
+ testEstimatorAndModelReadWrite(als, ratings.toDF(), allEstimatorParamSettings,
+ allModelParamSettings, checkModelData)
+ }
+
+ private def checkNumericTypesALS(
+ estimator: ALS,
+ spark: SparkSession,
+ column: String,
+ baseType: NumericType)
+ (check: (ALSModel, ALSModel) => Unit)
+ (check2: (ALSModel, ALSModel, DataFrame, Encoder[_]) => Unit): Unit = {
+ val dfs = genRatingsDFWithNumericCols(spark, column)
+ val maybeDf = dfs.find { case (numericTypeWithEncoder, _) =>
+ numericTypeWithEncoder.numericType == baseType
+ }
+ assert(maybeDf.isDefined)
+ val df = maybeDf.get._2
+
+ val expected = estimator.fit(df)
+ val actuals = dfs.map(t => (t, estimator.fit(t._2)))
+ actuals.foreach { case (_, actual) => check(expected, actual) }
+ actuals.foreach { case (t, actual) => check2(expected, actual, t._2, t._1.encoder) }
+
+ val baseDF = dfs.find(_._1.numericType == baseType).get._2
+ val others = baseDF.columns.toSeq.diff(Seq(column)).map(col)
+ val cols = Seq(col(column).cast(StringType)) ++ others
+ val strDF = baseDF.select(cols: _*)
+ val thrown = intercept[IllegalArgumentException] {
+ estimator.fit(strDF)
+ }
+ assert(thrown.getMessage.contains(
+ s"$column must be of type numeric but was actually of type string"))
+ }
+
+ private class NumericTypeWithEncoder[A](val numericType: NumericType)
+ (implicit val encoder: Encoder[(A, Int, Double)])
+
+ private def genRatingsDFWithNumericCols(
+ spark: SparkSession,
+ column: String) = {
+
+ import testImplicits._
+
+ val df = spark.createDataFrame(Seq(
+ (0, 10, 1.0),
+ (1, 20, 2.0),
+ (2, 30, 3.0),
+ (3, 40, 4.0),
+ (4, 50, 5.0)
+ )).toDF("user", "item", "rating")
+
+ val others = df.columns.toSeq.diff(Seq(column)).map(col)
+ val types =
+ Seq(new NumericTypeWithEncoder[Short](ShortType),
+ new NumericTypeWithEncoder[Long](LongType),
+ new NumericTypeWithEncoder[Int](IntegerType),
+ new NumericTypeWithEncoder[Float](FloatType),
+ new NumericTypeWithEncoder[Byte](ByteType),
+ new NumericTypeWithEncoder[Double](DoubleType),
+ new NumericTypeWithEncoder[Decimal](DecimalType(10, 0))(ExpressionEncoder())
+ )
+ types.map { t =>
+ val cols = Seq(col(column).cast(t.numericType)) ++ others
+ t -> df.select(cols: _*)
+ }
+ }
+
+ test("input type validation") {
+ val spark = this.spark
+ import spark.implicits._
+
+ // check that ALS can handle all numeric types for rating column
+ // and user/item columns (when the user/item ids are within Int range)
+ val als = new ALS().setMaxIter(1).setRank(1)
+ Seq(("user", IntegerType), ("item", IntegerType), ("rating", FloatType)).foreach {
+ case (colName, sqlType) =>
+ checkNumericTypesALS(als, spark, colName, sqlType) {
+ (ex, act) =>
+ ex.userFactors.first().getSeq[Float](1) === act.userFactors.first().getSeq[Float](1)
+ } { (ex, act, df, enc) =>
+ // With AQE on/off, the order of result may be different. Here sortby the result.
+ val expected = ex.transform(df).selectExpr("prediction")
+ .sort("prediction").first().getFloat(0)
+ testTransformerByGlobalCheckFunc(df, act, "prediction") {
+ case rows: Seq[Row] =>
+ expected ~== rows.sortBy(_.getFloat(0)).head.getFloat(0) absTol 1e-6
+ }(enc)
+ }
+ }
+ // check user/item ids falling outside of Int range
+ val big = Int.MaxValue.toLong + 1
+ val small = Int.MinValue.toDouble - 1
+ val df = Seq(
+ (0, 0L, 0d, 1, 1L, 1d, 3.0),
+ (0, big, small, 0, big, small, 2.0),
+ (1, 1L, 1d, 0, 0L, 0d, 5.0)
+ ).toDF("user", "user_big", "user_small", "item", "item_big", "item_small", "rating")
+ val msg = "either out of Integer range or contained a fractional part"
+ withClue("fit should fail when ids exceed integer range. ") {
+ assert(intercept[SparkException] {
+ als.fit(df.select(df("user_big").as("user"), df("item"), df("rating")))
+ }.getCause.getMessage.contains(msg))
+ assert(intercept[SparkException] {
+ als.fit(df.select(df("user_small").as("user"), df("item"), df("rating")))
+ }.getCause.getMessage.contains(msg))
+ assert(intercept[SparkException] {
+ als.fit(df.select(df("item_big").as("item"), df("user"), df("rating")))
+ }.getCause.getMessage.contains(msg))
+ assert(intercept[SparkException] {
+ als.fit(df.select(df("item_small").as("item"), df("user"), df("rating")))
+ }.getCause.getMessage.contains(msg))
+ }
+ withClue("transform should fail when ids exceed integer range. ") {
+ val model = als.fit(df)
+ def testTransformIdExceedsIntRange[A : Encoder](dataFrame: DataFrame): Unit = {
+ val e1 = intercept[SparkException] {
+ model.transform(dataFrame).collect()
+ }
+ TestUtils.assertExceptionMsg(e1, msg)
+ val e2 = intercept[StreamingQueryException] {
+ testTransformer[A](dataFrame, model, "prediction") { _ => }
+ }
+ TestUtils.assertExceptionMsg(e2, msg)
+ }
+ testTransformIdExceedsIntRange[(Long, Int)](df.select(df("user_big").as("user"),
+ df("item")))
+ testTransformIdExceedsIntRange[(Double, Int)](df.select(df("user_small").as("user"),
+ df("item")))
+ testTransformIdExceedsIntRange[(Long, Int)](df.select(df("item_big").as("item"),
+ df("user")))
+ testTransformIdExceedsIntRange[(Double, Int)](df.select(df("item_small").as("item"),
+ df("user")))
+ }
+ }
+
+ test("SPARK-18268: ALS with empty RDD should fail with better message") {
+ val ratings = sc.parallelize(Array.empty[Rating[Int]])
+ intercept[IllegalArgumentException] {
+ ALS.train(ratings)
+ }
+ }
+
+ test("ALS cold start user/item prediction strategy") {
+ val spark = this.spark
+ import org.apache.spark.sql.functions._
+ import spark.implicits._
+
+ val (ratings, _) = genExplicitTestData(numUsers = 4, numItems = 4, rank = 1)
+ val data = ratings.toDF
+ val knownUser = data.select(max("user")).as[Int].first()
+ val unknownUser = knownUser + 10
+ val knownItem = data.select(max("item")).as[Int].first()
+ val unknownItem = knownItem + 20
+ val test = Seq(
+ (unknownUser, unknownItem, true),
+ (knownUser, unknownItem, true),
+ (unknownUser, knownItem, true),
+ (knownUser, knownItem, false)
+ ).toDF("user", "item", "expectedIsNaN")
+
+ val als = new ALS().setMaxIter(1).setRank(1)
+ // default is 'nan'
+ val defaultModel = als.fit(data)
+ testTransformer[(Int, Int, Boolean)](test, defaultModel, "expectedIsNaN", "prediction") {
+ case Row(expectedIsNaN: Boolean, prediction: Float) =>
+ assert(prediction.isNaN === expectedIsNaN)
+ }
+
+ // check 'drop' strategy should filter out rows with unknown users/items
+ val defaultPrediction = defaultModel.transform(test).select("prediction")
+ .as[Float].filter(!_.isNaN).first()
+ testTransformerByGlobalCheckFunc[(Int, Int, Boolean)](test,
+ defaultModel.setColdStartStrategy("drop"), "prediction") {
+ case rows: Seq[Row] =>
+ val dropPredictions = rows.map(_.getFloat(0))
+ assert(dropPredictions.length == 1)
+ assert(!dropPredictions.head.isNaN)
+ assert(dropPredictions.head ~== defaultPrediction relTol 1e-14)
+ }
+ }
+
+ test("case insensitive cold start param value") {
+ val spark = this.spark
+ import spark.implicits._
+ val (ratings, _) = genExplicitTestData(numUsers = 2, numItems = 2, rank = 1)
+ val data = ratings.toDF
+ val model = new ALS().fit(data)
+ Seq("nan", "NaN", "Nan", "drop", "DROP", "Drop").foreach { s =>
+ testTransformer[Rating[Int]](data, model.setColdStartStrategy(s), "prediction") { _ => }
+ }
+ }
+
+ private def getALSModel = {
+ val spark = this.spark
+ import spark.implicits._
+
+ val userFactors = Seq(
+ (0, Array(6.0f, 4.0f)),
+ (1, Array(3.0f, 4.0f)),
+ (2, Array(3.0f, 6.0f))
+ ).toDF("id", "features")
+ val itemFactors = Seq(
+ (3, Array(5.0f, 6.0f)),
+ (4, Array(6.0f, 2.0f)),
+ (5, Array(3.0f, 6.0f)),
+ (6, Array(4.0f, 1.0f))
+ ).toDF("id", "features")
+ val als = new ALS().setRank(2)
+ new ALSModel(als.uid, als.getRank, userFactors, itemFactors)
+ .setUserCol("user")
+ .setItemCol("item")
+ }
+
+ test("recommendForAllUsers with k <, = and > num_items") {
+ val model = getALSModel
+ val numUsers = model.userFactors.count
+ val numItems = model.itemFactors.count
+ val expected = Map(
+ 0 -> Seq((3, 54f), (4, 44f), (5, 42f), (6, 28f)),
+ 1 -> Seq((3, 39f), (5, 33f), (4, 26f), (6, 16f)),
+ 2 -> Seq((3, 51f), (5, 45f), (4, 30f), (6, 18f))
+ )
+
+ Seq(2, 4, 6).foreach { k =>
+ val n = math.min(k, numItems).toInt
+ val expectedUpToN = expected.mapValues(_.slice(0, n))
+ val topItems = model.recommendForAllUsers(k)
+ assert(topItems.count() == numUsers)
+ assert(topItems.columns.contains("user"))
+ checkRecommendations(topItems, expectedUpToN, "item")
+ }
+ }
+
+ test("recommendForAllItems with k <, = and > num_users") {
+ val model = getALSModel
+ val numUsers = model.userFactors.count
+ val numItems = model.itemFactors.count
+ val expected = Map(
+ 3 -> Seq((0, 54f), (2, 51f), (1, 39f)),
+ 4 -> Seq((0, 44f), (2, 30f), (1, 26f)),
+ 5 -> Seq((2, 45f), (0, 42f), (1, 33f)),
+ 6 -> Seq((0, 28f), (2, 18f), (1, 16f))
+ )
+
+ Seq(2, 3, 4).foreach { k =>
+ val n = math.min(k, numUsers).toInt
+ val expectedUpToN = expected.mapValues(_.slice(0, n))
+ val topUsers = getALSModel.recommendForAllItems(k)
+ assert(topUsers.count() == numItems)
+ assert(topUsers.columns.contains("item"))
+ checkRecommendations(topUsers, expectedUpToN, "user")
+ }
+ }
+
+ test("recommendForUserSubset with k <, = and > num_items") {
+ val spark = this.spark
+ import spark.implicits._
+ val model = getALSModel
+ val numItems = model.itemFactors.count
+ val expected = Map(
+ 0 -> Seq((3, 54f), (4, 44f), (5, 42f), (6, 28f)),
+ 2 -> Seq((3, 51f), (5, 45f), (4, 30f), (6, 18f))
+ )
+ val userSubset = expected.keys.toSeq.toDF("user")
+ val numUsersSubset = userSubset.count
+
+ Seq(2, 4, 6).foreach { k =>
+ val n = math.min(k, numItems).toInt
+ val expectedUpToN = expected.mapValues(_.slice(0, n))
+ val topItems = model.recommendForUserSubset(userSubset, k)
+ assert(topItems.count() == numUsersSubset)
+ assert(topItems.columns.contains("user"))
+ checkRecommendations(topItems, expectedUpToN, "item")
+ }
+ }
+
+ test("recommendForItemSubset with k <, = and > num_users") {
+ val spark = this.spark
+ import spark.implicits._
+ val model = getALSModel
+ val numUsers = model.userFactors.count
+ val expected = Map(
+ 3 -> Seq((0, 54f), (2, 51f), (1, 39f)),
+ 6 -> Seq((0, 28f), (2, 18f), (1, 16f))
+ )
+ val itemSubset = expected.keys.toSeq.toDF("item")
+ val numItemsSubset = itemSubset.count
+
+ Seq(2, 3, 4).foreach { k =>
+ val n = math.min(k, numUsers).toInt
+ val expectedUpToN = expected.mapValues(_.slice(0, n))
+ val topUsers = model.recommendForItemSubset(itemSubset, k)
+ assert(topUsers.count() == numItemsSubset)
+ assert(topUsers.columns.contains("item"))
+ checkRecommendations(topUsers, expectedUpToN, "user")
+ }
+ }
+
+ test("subset recommendations eliminate duplicate ids, returns same results as unique ids") {
+ val spark = this.spark
+ import spark.implicits._
+ val model = getALSModel
+ val k = 2
+
+ val users = Seq(0, 1).toDF("user")
+ val dupUsers = Seq(0, 1, 0, 1).toDF("user")
+ val singleUserRecs = model.recommendForUserSubset(users, k)
+ val dupUserRecs = model.recommendForUserSubset(dupUsers, k)
+ .as[(Int, Seq[(Int, Float)])].collect().toMap
+ assert(singleUserRecs.count == dupUserRecs.size)
+ checkRecommendations(singleUserRecs, dupUserRecs, "item")
+
+ val items = Seq(3, 4, 5).toDF("item")
+ val dupItems = Seq(3, 4, 5, 4, 5).toDF("item")
+ val singleItemRecs = model.recommendForItemSubset(items, k)
+ val dupItemRecs = model.recommendForItemSubset(dupItems, k)
+ .as[(Int, Seq[(Int, Float)])].collect().toMap
+ assert(singleItemRecs.count == dupItemRecs.size)
+ checkRecommendations(singleItemRecs, dupItemRecs, "user")
+ }
+
+ test("subset recommendations on full input dataset equivalent to recommendForAll") {
+ val spark = this.spark
+ import spark.implicits._
+ val model = getALSModel
+ val k = 2
+
+ val userSubset = model.userFactors.withColumnRenamed("id", "user").drop("features")
+ val userSubsetRecs = model.recommendForUserSubset(userSubset, k)
+ val allUserRecs = model.recommendForAllUsers(k).as[(Int, Seq[(Int, Float)])].collect().toMap
+ checkRecommendations(userSubsetRecs, allUserRecs, "item")
+
+ val itemSubset = model.itemFactors.withColumnRenamed("id", "item").drop("features")
+ val itemSubsetRecs = model.recommendForItemSubset(itemSubset, k)
+ val allItemRecs = model.recommendForAllItems(k).as[(Int, Seq[(Int, Float)])].collect().toMap
+ checkRecommendations(itemSubsetRecs, allItemRecs, "user")
+ }
+
+ test("ALS should not introduce unnecessary shuffle") {
+ def getShuffledDependencies(rdd: RDD[_]): Seq[ShuffleDependency[_, _, _]] = {
+ rdd.dependencies.flatMap {
+ case s: ShuffleDependency[_, _, _] =>
+ Seq(s) ++ getShuffledDependencies(s.rdd)
+ case o =>
+ Seq.empty ++ getShuffledDependencies(o.rdd)
+ }
+ }
+
+ val spark = this.spark
+ import spark.implicits._
+ val (ratings, _) = genExplicitTestData(numUsers = 2, numItems = 2, rank = 1)
+ val data = ratings.toDF
+ val model = new ALS()
+ .setMaxIter(2)
+ .setImplicitPrefs(true)
+ .setCheckpointInterval(-1)
+ .fit(data)
+
+ val userFactors = model.userFactors
+ val itemFactors = model.itemFactors
+ val shuffledUserFactors = getShuffledDependencies(userFactors.rdd).filter { dep =>
+ dep.rdd.name != null && dep.rdd.name.contains("userFactors")
+ }
+ val shuffledItemFactors = getShuffledDependencies(itemFactors.rdd).filter { dep =>
+ dep.rdd.name != null && dep.rdd.name.contains("itemFactors")
+ }
+ assert(shuffledUserFactors.size == 0)
+ assert(shuffledItemFactors.size == 0)
+ }
+
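+  /**
+   * Verifies that `topK` has a "recommendations" column, that each row's recommendations equal
+   * the expected (id, rating) pairs for that id, and that every recommendation struct lists
+   * `dstColName` before "rating".
+   */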
+ private def checkRecommendations(
+ topK: DataFrame,
+ expected: Map[Int, Seq[(Int, Float)]],
+ dstColName: String): Unit = {
+ val spark = this.spark
+ import spark.implicits._
+
+ assert(topK.columns.contains("recommendations"))
+ topK.as[(Int, Seq[(Int, Float)])].collect().foreach { case (id: Int, recs: Seq[(Int, Float)]) =>
+ assert(recs === expected(id))
+ }
+ topK.collect().foreach { row =>
+ val recs = row.getAs[WrappedArray[Row]]("recommendations")
+ assert(recs(0).fieldIndex(dstColName) == 0)
+ assert(recs(0).fieldIndex("rating") == 1)
+ }
+ }
+}
+
+class ALSCleanerSuite extends SparkFunSuite with BeforeAndAfterEach {
+ override def beforeEach(): Unit = {
+ super.beforeEach()
+ // Once `Utils.getOrCreateLocalRootDirs` is called, it is cached in `Utils.localRootDirs`.
+    // Unless this is manually cleared before and after a test, it keeps returning the directory
+    // that was set up before, even if 'spark.local.dir' is configured afterwards.
+ Utils.clearLocalRootDirs()
+ }
+
+ override def afterEach(): Unit = {
+ Utils.clearLocalRootDirs()
+ super.afterEach()
+ }
+
+ test("ALS shuffle cleanup standalone") {
+ val conf = new SparkConf()
+ val localDir = Utils.createTempDir()
+ val checkpointDir = Utils.createTempDir()
+ def getAllFiles: Set[File] =
+ FileUtils.listFiles(localDir, TrueFileFilter.INSTANCE, TrueFileFilter.INSTANCE).asScala.toSet
+ try {
+ conf.set("spark.local.dir", localDir.getAbsolutePath)
+ val sc = new SparkContext("local[2]", "test", conf)
+ try {
+ sc.setCheckpointDir(checkpointDir.getAbsolutePath)
+ // Test checkpoint and clean parents
+ val input = sc.parallelize(1 to 1000)
+ val keyed = input.map(x => (x % 20, 1))
+ val shuffled = keyed.reduceByKey(_ + _)
+ val keysOnly = shuffled.keys
+ val deps = keysOnly.dependencies
+ keysOnly.count()
+ ALS.cleanShuffleDependencies(sc, deps, true)
+ val resultingFiles = getAllFiles
+ assert(resultingFiles === Set())
+ // Ensure running count again works fine even if we kill the shuffle files.
+ keysOnly.count()
+ } finally {
+ sc.stop()
+ }
+ } finally {
+ Utils.deleteRecursively(localDir)
+ Utils.deleteRecursively(checkpointDir)
+ }
+ }
+
+ test("ALS shuffle cleanup in algorithm") {
+ val conf = new SparkConf()
+ val localDir = Utils.createTempDir()
+ val checkpointDir = Utils.createTempDir()
+ def getAllFiles: Set[File] =
+ FileUtils.listFiles(localDir, TrueFileFilter.INSTANCE, TrueFileFilter.INSTANCE).asScala.toSet
+ try {
+ conf.set("spark.local.dir", localDir.getAbsolutePath)
+ val sc = new SparkContext("local[2]", "ALSCleanerSuite", conf)
+ try {
+ sc.setCheckpointDir(checkpointDir.getAbsolutePath)
+ // Generate test data
+        val (training, _) = IntelALSSuite.genImplicitTestData(sc, 20, 5, 1, 0.2, 0)
+ // Implicitly test the cleaning of parents during ALS training
+ val spark = SparkSession.builder
+ .sparkContext(sc)
+ .getOrCreate()
+ import spark.implicits._
+ val als = new ALS()
+ .setRank(1)
+ .setRegParam(1e-5)
+ .setSeed(0)
+ .setCheckpointInterval(1)
+ .setMaxIter(7)
+ val model = als.fit(training.toDF())
+ val resultingFiles = getAllFiles
+      // We expect the last shuffle's files, block ratings, user factors, and item factors to be
+      // around, but no more.
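+      // Shuffle data files are named like "shuffle_<shuffleId>_<mapId>_<reduceId>.data"; the
+      // distinct captured shuffle ids below count how many shuffles' files survived the cleanup.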
+ val pattern = "shuffle_(\\d+)_.+\\.data".r
+ val rddIds = resultingFiles.flatMap { f =>
+ pattern.findAllIn(f.getName()).matchData.map { _.group(1) } }
+ assert(rddIds.size === 4)
+ } finally {
+ sc.stop()
+ }
+ } finally {
+ Utils.deleteRecursively(localDir)
+ Utils.deleteRecursively(checkpointDir)
+ }
+ }
+}
+
+class ALSStorageSuite
+ extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest with Logging {
+
+ test("invalid storage params") {
+ intercept[IllegalArgumentException] {
+ new ALS().setIntermediateStorageLevel("foo")
+ }
+ intercept[IllegalArgumentException] {
+ new ALS().setIntermediateStorageLevel("NONE")
+ }
+ intercept[IllegalArgumentException] {
+ new ALS().setFinalStorageLevel("foo")
+ }
+ }
+
+ test("default and non-default storage params set correct RDD StorageLevels") {
+ val spark = this.spark
+ import spark.implicits._
+ val data = Seq(
+ (0, 0, 1.0),
+ (0, 1, 2.0),
+ (1, 2, 3.0),
+ (1, 0, 2.0)
+ ).toDF("user", "item", "rating")
+ val als = new ALS().setMaxIter(1).setRank(1)
+ // add listener to check intermediate RDD default storage levels
+ val defaultListener = new IntermediateRDDStorageListener
+ sc.addSparkListener(defaultListener)
+ val model = als.fit(data)
+ // check final factor RDD default storage levels
+ val defaultFactorRDDs = sc.getPersistentRDDs.collect {
+ case (id, rdd) if rdd.name == "userFactors" || rdd.name == "itemFactors" =>
+ rdd.name -> ((id, rdd.getStorageLevel))
+ }.toMap
+ defaultFactorRDDs.foreach { case (_, (id, level)) =>
+ assert(level == StorageLevel.MEMORY_AND_DISK)
+ }
+ defaultListener.storageLevels.foreach(level => assert(level == StorageLevel.MEMORY_AND_DISK))
+
+ // add listener to check intermediate RDD non-default storage levels
+ val nonDefaultListener = new IntermediateRDDStorageListener
+ sc.addSparkListener(nonDefaultListener)
+ val nonDefaultModel = als
+ .setFinalStorageLevel("MEMORY_ONLY")
+ .setIntermediateStorageLevel("DISK_ONLY")
+ .fit(data)
+ // check final factor RDD non-default storage levels
+ val levels = sc.getPersistentRDDs.collect {
+ case (id, rdd) if rdd.name == "userFactors" && rdd.id != defaultFactorRDDs("userFactors")._1
+ || rdd.name == "itemFactors" && rdd.id != defaultFactorRDDs("itemFactors")._1 =>
+ rdd.getStorageLevel
+ }
+ levels.foreach(level => assert(level == StorageLevel.MEMORY_ONLY))
+ nonDefaultListener.storageLevels.foreach(level => assert(level == StorageLevel.DISK_ONLY))
+ }
+}
+
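+/**
+ * Spark listener that records the storage levels of the intermediate block/factor RDDs
+ * (names containing "Blocks" or "Factors-") observed when each stage completes.
+ */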
+private class IntermediateRDDStorageListener extends SparkListener {
+
+ val storageLevels: mutable.ArrayBuffer[StorageLevel] = mutable.ArrayBuffer()
+
+ override def onStageCompleted(stageCompleted: SparkListenerStageCompleted): Unit = {
+ val stageLevels = stageCompleted.stageInfo.rddInfos.collect {
+ case info if info.name.contains("Blocks") || info.name.contains("Factors-") =>
+ info.storageLevel
+ }
+ storageLevels ++= stageLevels
+ }
+
+}
+
+object IntelALSSuite extends Logging {
+
+ /**
+ * Mapping from all Params to valid settings which differ from the defaults.
+ * This is useful for tests which need to exercise all Params, such as save/load.
+ * This excludes input columns to simplify some tests.
+ */
+ val allModelParamSettings: Map[String, Any] = Map(
+ "predictionCol" -> "myPredictionCol"
+ )
+
+ /**
+ * Mapping from all Params to valid settings which differ from the defaults.
+ * This is useful for tests which need to exercise all Params, such as save/load.
+ * This excludes input columns to simplify some tests.
+ */
+ val allEstimatorParamSettings: Map[String, Any] = allModelParamSettings ++ Map(
+ "maxIter" -> 1,
+ "rank" -> 1,
+ "regParam" -> 0.01,
+ "numUserBlocks" -> 2,
+ "numItemBlocks" -> 2,
+ "implicitPrefs" -> true,
+ "alpha" -> 0.9,
+ "nonnegative" -> true,
+ "checkpointInterval" -> 20,
+ "intermediateStorageLevel" -> "MEMORY_ONLY",
+ "finalStorageLevel" -> "MEMORY_AND_DISK_SER"
+ )
+
+ // Helper functions to generate test data we share between ALS test suites
+
+ /**
+ * Generates random user/item factors, with i.i.d. values drawn from U(a, b).
+ * @param size number of users/items
+ * @param rank number of features
+ * @param random random number generator
+ * @param a min value of the support (default: -1)
+ * @param b max value of the support (default: 1)
+ * @return a sequence of (ID, factors) pairs
+ */
+ private def genFactors(
+ size: Int,
+ rank: Int,
+ random: Random,
+ a: Float = -1.0f,
+ b: Float = 1.0f): Seq[(Int, Array[Float])] = {
+ require(size > 0 && size < Int.MaxValue / 3)
+ require(b > a)
+ val ids = mutable.Set.empty[Int]
+ while (ids.size < size) {
+ ids += random.nextInt()
+ }
+ val width = b - a
+ ids.toSeq.sorted.map(id => (id, Array.fill(rank)(a + random.nextFloat() * width)))
+ }
+
+ /**
+ * Generates an implicit feedback dataset for testing ALS.
+ *
+ * @param sc SparkContext
+ * @param numUsers number of users
+ * @param numItems number of items
+ * @param rank rank
+ * @param noiseStd the standard deviation of additive Gaussian noise on training data
+ * @param seed random seed
+ * @return (training, test)
+ */
+ def genImplicitTestData(
+ sc: SparkContext,
+ numUsers: Int,
+ numItems: Int,
+ rank: Int,
+ noiseStd: Double = 0.0,
+ seed: Long = 11L): (RDD[Rating[Int]], RDD[Rating[Int]]) = {
+ // The assumption of the implicit feedback model is that unobserved ratings are more likely to
+ // be negatives.
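+    // Ratings with a positive score are observed with probability 0.8, non-positive ones with
+    // probability 0.2; of the observed ratings, 60% go to training (with noise added), 30% go
+    // to test, and the remainder is dropped.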
+ val positiveFraction = 0.8
+ val negativeFraction = 1.0 - positiveFraction
+ val trainingFraction = 0.6
+ val testFraction = 0.3
+ val totalFraction = trainingFraction + testFraction
+ val random = new Random(seed)
+ val userFactors = genFactors(numUsers, rank, random)
+ val itemFactors = genFactors(numItems, rank, random)
+ val training = ArrayBuffer.empty[Rating[Int]]
+ val test = ArrayBuffer.empty[Rating[Int]]
+ for ((userId, userFactor) <- userFactors; (itemId, itemFactor) <- itemFactors) {
+ val rating = blas.sdot(rank, userFactor, 1, itemFactor, 1)
+ val threshold = if (rating > 0) positiveFraction else negativeFraction
+ val observed = random.nextDouble() < threshold
+ if (observed) {
+ val x = random.nextDouble()
+ if (x < totalFraction) {
+ if (x < trainingFraction) {
+ val noise = noiseStd * random.nextGaussian()
+ training += Rating(userId, itemId, rating + noise.toFloat)
+ } else {
+ test += Rating(userId, itemId, rating)
+ }
+ }
+ }
+ }
+ logInfo(s"Generated an implicit feedback dataset with ${training.size} ratings for training " +
+ s"and ${test.size} for test.")
+ (sc.parallelize(training, 2), sc.parallelize(test, 2))
+ }
+}
diff --git a/mllib-dal/test-cluster.sh b/mllib-dal/test-cluster.sh
new file mode 100755
index 000000000..4f5a6132a
--- /dev/null
+++ b/mllib-dal/test-cluster.sh
@@ -0,0 +1,5 @@
+#!/usr/bin/env bash
+
+cd ../dev/test-cluster/workloads
+
+./run-kmeans-pyspark.sh
diff --git a/mllib-dal/test.sh b/mllib-dal/test.sh
index f7e73ca1f..0157c22a4 100755
--- a/mllib-dal/test.sh
+++ b/mllib-dal/test.sh
@@ -35,8 +35,9 @@ export LD_PRELOAD=$JAVA_HOME/jre/lib/amd64/libjsig.so
# -Dtest=none to turn off the Java tests
# Test all
-mvn -Dtest=none -Dmaven.test.skip=false test
+# mvn -Dtest=none -Dmaven.test.skip=false test
# Individual test
-# mvn -Dtest=none -DwildcardSuites=org.apache.spark.ml.clustering.IntelKMeansSuite test
-# mvn -Dtest=none -DwildcardSuites=org.apache.spark.ml.feature.IntelPCASuite test
+mvn -Dtest=none -DwildcardSuites=org.apache.spark.ml.clustering.IntelKMeansSuite test
+mvn -Dtest=none -DwildcardSuites=org.apache.spark.ml.feature.IntelPCASuite test
+# mvn -Dtest=none -DwildcardSuites=org.apache.spark.ml.recommendation.IntelALSSuite test