From 437953eb6d2b9cdffde06fc2d7aa5dfffb728849 Mon Sep 17 00:00:00 2001
From: "Wu, Xiaochang"
Date: Fri, 19 Mar 2021 12:18:31 +0800
Subject: [PATCH] Revert "[ML-12] Improve CI and add pseudo cluster testing
 (#20)"

This reverts commit 6fe5d3e93f673e5d9f04c24f54bb6c93cad6f7f5.

Signed-off-by: Wu, Xiaochang
---
 dev/test-cluster/config-ssh.sh               |  6 --
 dev/test-cluster/core-site.xml               | 24 -----
 dev/test-cluster/envs.sh                     | 22 -----
 dev/test-cluster/hadoop-env.sh               | 99 --------------------
 dev/test-cluster/hdfs-site.xml               | 32 ------
 dev/test-cluster/setup-cluster.sh            | 42 --------
 dev/test-cluster/setup-python3-env.sh        | 12 ---
 dev/test-cluster/spark-defaults.conf         | 34 -------
 dev/test-cluster/workloads/kmeans-pyspark.py | 70 --------------
 .../workloads/run-kmeans-pyspark.sh          | 48 ----------
 dev/test-cluster/yarn-site.xml               | 67 --------------
 mllib-dal/test-cluster.sh                    |  5 -
 12 files changed, 461 deletions(-)
 delete mode 100755 dev/test-cluster/config-ssh.sh
 delete mode 100644 dev/test-cluster/core-site.xml
 delete mode 100644 dev/test-cluster/envs.sh
 delete mode 100755 dev/test-cluster/hadoop-env.sh
 delete mode 100644 dev/test-cluster/hdfs-site.xml
 delete mode 100755 dev/test-cluster/setup-cluster.sh
 delete mode 100755 dev/test-cluster/setup-python3-env.sh
 delete mode 100644 dev/test-cluster/spark-defaults.conf
 delete mode 100644 dev/test-cluster/workloads/kmeans-pyspark.py
 delete mode 100755 dev/test-cluster/workloads/run-kmeans-pyspark.sh
 delete mode 100644 dev/test-cluster/yarn-site.xml
 delete mode 100755 mllib-dal/test-cluster.sh
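
Note: the text between the "---" marker above and the first "diff --git"
line below is the free-form notes area of a git-format-patch email, and
`git am` drops it when the commit is created. As a sketch of how this patch
would be applied (the .patch file name is illustrative, following the usual
git-format-patch naming; it is not given in the source):

  $ git am 0001-Revert-ML-12-Improve-CI-and-add-pseudo-cluster-testi.patch

The same commit could equally be recreated from the reverted commit itself,
without this patch file:

  $ git revert 6fe5d3e93f673e5d9f04c24f54bb6c93cad6f7f5
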
diff --git a/dev/test-cluster/config-ssh.sh b/dev/test-cluster/config-ssh.sh
deleted file mode 100755
index d093fa17a..000000000
--- a/dev/test-cluster/config-ssh.sh
+++ /dev/null
@@ -1,6 +0,0 @@
-#!/usr/bin/env bash
-
-ssh-keygen -q -N "" -t rsa -f ~/.ssh/id_rsa
-cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
-echo " StrictHostKeyChecking no " | sudo tee -a /etc/ssh/ssh_config
-sudo service ssh restart
diff --git a/dev/test-cluster/core-site.xml b/dev/test-cluster/core-site.xml
deleted file mode 100644
index 7016e477e..000000000
--- a/dev/test-cluster/core-site.xml
+++ /dev/null
@@ -1,24 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
-<!--
-  Licensed under the Apache License, Version 2.0 (the "License");
-  you may not use this file except in compliance with the License.
-  You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-  Unless required by applicable law or agreed to in writing, software
-  distributed under the License is distributed on an "AS IS" BASIS,
-  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-  See the License for the specific language governing permissions and
-  limitations under the License. See accompanying LICENSE file.
--->
-
-<configuration>
-    <property>
-        <name>fs.default.name</name>
-        <value>hdfs://localhost:8020</value>
-    </property>
-</configuration>
diff --git a/dev/test-cluster/envs.sh b/dev/test-cluster/envs.sh
deleted file mode 100644
index 71e8506e6..000000000
--- a/dev/test-cluster/envs.sh
+++ /dev/null
@@ -1,22 +0,0 @@
-# Set user Spark and Hadoop home directory
-export HADOOP_HOME=~/opt/hadoop-2.7.7
-export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
-export SPARK_HOME=~/opt/spark-3.0.0-bin-hadoop2.7
-
-export PYTHONPATH=$SPARK_HOME/python:$PYTHONPATH
-export PYSPARK_PYTHON=python3
-
-# Set user HDFS Root
-export HDFS_ROOT=hdfs://localhost:8020
-export OAP_MLLIB_DATA_ROOT=OAPMLlib/Data
-# Set user Intel MLlib Root directory
-export OAP_MLLIB_ROOT=${GITHUB_WORKSPACE}
-
-# Target jar built
-OAP_MLLIB_JAR_NAME=oap-mllib-1.1.0.jar
-OAP_MLLIB_JAR=$OAP_MLLIB_ROOT/mllib-dal/target/$OAP_MLLIB_JAR_NAME
-
-# Use absolute path
-SPARK_DRIVER_CLASSPATH=$OAP_MLLIB_JAR
-# Use relative path
-SPARK_EXECUTOR_CLASSPATH=./$OAP_MLLIB_JAR_NAME
diff --git a/dev/test-cluster/hadoop-env.sh b/dev/test-cluster/hadoop-env.sh
deleted file mode 100755
index bee6c1f69..000000000
--- a/dev/test-cluster/hadoop-env.sh
+++ /dev/null
@@ -1,99 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Set Hadoop-specific environment variables here.
-
-# The only required environment variable is JAVA_HOME.  All others are
-# optional.  When running a distributed configuration it is best to
-# set JAVA_HOME in this file, so that it is correctly defined on
-# remote nodes.
-
-# The java implementation to use.
-# export JAVA_HOME=${JAVA_HOME}
-export JAVA_HOME=/usr/local/lib/jvm/openjdk8
-
-# The jsvc implementation to use. Jsvc is required to run secure datanodes
-# that bind to privileged ports to provide authentication of data transfer
-# protocol.  Jsvc is not required if SASL is configured for authentication of
-# data transfer protocol using non-privileged ports.
-#export JSVC_HOME=${JSVC_HOME}
-
-export HADOOP_CONF_DIR=${HADOOP_CONF_DIR:-"/etc/hadoop"}
-
-# Extra Java CLASSPATH elements.  Automatically insert capacity-scheduler.
-for f in $HADOOP_HOME/contrib/capacity-scheduler/*.jar; do
-  if [ "$HADOOP_CLASSPATH" ]; then
-    export HADOOP_CLASSPATH=$HADOOP_CLASSPATH:$f
-  else
-    export HADOOP_CLASSPATH=$f
-  fi
-done
-
-# The maximum amount of heap to use, in MB. Default is 1000.
-#export HADOOP_HEAPSIZE=
-#export HADOOP_NAMENODE_INIT_HEAPSIZE=""
-
-# Extra Java runtime options.  Empty by default.
-export HADOOP_OPTS="$HADOOP_OPTS -Djava.net.preferIPv4Stack=true"
-
-# Command specific options appended to HADOOP_OPTS when specified
-export HADOOP_NAMENODE_OPTS="-Dhadoop.security.logger=${HADOOP_SECURITY_LOGGER:-INFO,RFAS} -Dhdfs.audit.logger=${HDFS_AUDIT_LOGGER:-INFO,NullAppender} $HADOOP_NAMENODE_OPTS"
-export HADOOP_DATANODE_OPTS="-Dhadoop.security.logger=ERROR,RFAS $HADOOP_DATANODE_OPTS"
-
-export HADOOP_SECONDARYNAMENODE_OPTS="-Dhadoop.security.logger=${HADOOP_SECURITY_LOGGER:-INFO,RFAS} -Dhdfs.audit.logger=${HDFS_AUDIT_LOGGER:-INFO,NullAppender} $HADOOP_SECONDARYNAMENODE_OPTS"
-
-export HADOOP_NFS3_OPTS="$HADOOP_NFS3_OPTS"
-export HADOOP_PORTMAP_OPTS="-Xmx512m $HADOOP_PORTMAP_OPTS"
-
-# The following applies to multiple commands (fs, dfs, fsck, distcp etc)
-export HADOOP_CLIENT_OPTS="-Xmx512m $HADOOP_CLIENT_OPTS"
-#HADOOP_JAVA_PLATFORM_OPTS="-XX:-UsePerfData $HADOOP_JAVA_PLATFORM_OPTS"
-
-# On secure datanodes, user to run the datanode as after dropping privileges.
-# This **MUST** be uncommented to enable secure HDFS if using privileged ports
-# to provide authentication of data transfer protocol.  This **MUST NOT** be
-# defined if SASL is configured for authentication of data transfer protocol
-# using non-privileged ports.
-export HADOOP_SECURE_DN_USER=${HADOOP_SECURE_DN_USER}
-
-# Where log files are stored.  $HADOOP_HOME/logs by default.
-#export HADOOP_LOG_DIR=${HADOOP_LOG_DIR}/$USER
-
-# Where log files are stored in the secure data environment.
-export HADOOP_SECURE_DN_LOG_DIR=${HADOOP_LOG_DIR}/${HADOOP_HDFS_USER}
-
-###
-# HDFS Mover specific parameters
-###
-# Specify the JVM options to be used when starting the HDFS Mover.
-# These options will be appended to the options specified as HADOOP_OPTS
-# and therefore may override any similar flags set in HADOOP_OPTS
-#
-# export HADOOP_MOVER_OPTS=""
-
-###
-# Advanced Users Only!
-###
-
-# The directory where pid files are stored. /tmp by default.
-# NOTE: this should be set to a directory that can only be written to by
-#       the user that will run the hadoop daemons.  Otherwise there is the
-#       potential for a symlink attack.
-export HADOOP_PID_DIR=${HADOOP_PID_DIR}
-export HADOOP_SECURE_DN_PID_DIR=${HADOOP_PID_DIR}
-
-# A string representing this instance of hadoop. $USER by default.
-export HADOOP_IDENT_STRING=$USER
diff --git a/dev/test-cluster/hdfs-site.xml b/dev/test-cluster/hdfs-site.xml
deleted file mode 100644
index 40fcbb5d6..000000000
--- a/dev/test-cluster/hdfs-site.xml
+++ /dev/null
@@ -1,32 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
-<!--
-  Licensed under the Apache License, Version 2.0 (the "License");
-  you may not use this file except in compliance with the License.
-  You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-  Unless required by applicable law or agreed to in writing, software
-  distributed under the License is distributed on an "AS IS" BASIS,
-  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-  See the License for the specific language governing permissions and
-  limitations under the License. See accompanying LICENSE file.
--->
-
-<configuration>
-    <property>
-        <name>dfs.replication</name>
-        <value>1</value>
-    </property>
-    <property>
-        <name>dfs.namenode.name.dir</name>
-        <value>/tmp/run/hdfs/namenode</value>
-    </property>
-    <property>
-        <name>dfs.datanode.data.dir</name>
-        <value>/tmp/run/hdfs/datanode</value>
-    </property>
-</configuration>
diff --git a/dev/test-cluster/setup-cluster.sh b/dev/test-cluster/setup-cluster.sh
deleted file mode 100755
index eea058f80..000000000
--- a/dev/test-cluster/setup-cluster.sh
+++ /dev/null
@@ -1,42 +0,0 @@
-#!/usr/bin/env bash
-
-WORK_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
-
-cd $WORK_DIR
-
-echo JAVA_HOME is $JAVA_HOME
-
-mkdir ~/opt
-cd ~/opt
-wget https://archive.apache.org/dist/spark/spark-3.0.0/spark-3.0.0-bin-hadoop2.7.tgz
-tar -xzf spark-3.0.0-bin-hadoop2.7.tgz
-wget https://archive.apache.org/dist/hadoop/core/hadoop-2.7.7/hadoop-2.7.7.tar.gz
-tar -xzf hadoop-2.7.7.tar.gz
-
-cd $WORK_DIR
-
-cp ./core-site.xml ~/opt/hadoop-2.7.7/etc/hadoop/
-cp ./hdfs-site.xml ~/opt/hadoop-2.7.7/etc/hadoop/
-cp ./yarn-site.xml ~/opt/hadoop-2.7.7/etc/hadoop/
-cp ./hadoop-env.sh ~/opt/hadoop-2.7.7/etc/hadoop/
-cp ./spark-defaults.conf ~/opt/spark-3.0.0-bin-hadoop2.7/conf
-
-# create directories
-mkdir -p /tmp/run/hdfs/namenode
-mkdir -p /tmp/run/hdfs/datanode
-
-# hdfs format
-~/opt/hadoop-2.7.7/bin/hdfs namenode -format
-
-export HADOOP_HOME=~/opt/hadoop-2.7.7
-export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
-export SPARK_HOME=~/opt/spark-3.0.0-bin-hadoop2.7
-
-export PATH=$HADOOP_HOME/bin:$SPARK_HOME/bin:$PATH
-
-# start hdfs and yarn
-$HADOOP_HOME/sbin/start-dfs.sh
-$HADOOP_HOME/sbin/start-yarn.sh
-
-hadoop fs -ls /
-yarn node -list
diff --git a/dev/test-cluster/setup-python3-env.sh b/dev/test-cluster/setup-python3-env.sh
deleted file mode 100755
index 29208dc5e..000000000
--- a/dev/test-cluster/setup-python3-env.sh
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/usr/bin/env bash
-
-sudo apt-get update
-sudo apt-get install python3-pip python3-setuptools python3-wheel
-
-pip3 install --user numpy
-
-echo python is in $(which python)
-python --version
-
-echo python3 is in $(which python3)
-python3 --version
diff --git a/dev/test-cluster/spark-defaults.conf b/dev/test-cluster/spark-defaults.conf
deleted file mode 100644
index 1c25bb2ec..000000000
--- a/dev/test-cluster/spark-defaults.conf
+++ /dev/null
@@ -1,34 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-# Default system properties included when running spark-submit.
-# This is useful for setting default environmental settings.
-
-# Example:
-# spark.master                     spark://master:7077
-# spark.eventLog.enabled           true
-# spark.eventLog.dir               hdfs://namenode:8021/directory
-# spark.serializer                 org.apache.spark.serializer.KryoSerializer
-# spark.driver.memory              5g
-# spark.executor.extraJavaOptions  -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three"
-
-spark.master                     yarn
-spark.serializer                 org.apache.spark.serializer.KryoSerializer
-spark.driver.memory              3g
-spark.executor.num               2
-spark.executor.cores             1
-spark.executor.memory            4g
diff --git a/dev/test-cluster/workloads/kmeans-pyspark.py b/dev/test-cluster/workloads/kmeans-pyspark.py
deleted file mode 100644
index cf93e6034..000000000
--- a/dev/test-cluster/workloads/kmeans-pyspark.py
+++ /dev/null
@@ -1,70 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-"""
-An example demonstrating k-means clustering.
-Run with:
-  bin/spark-submit examples/src/main/python/ml/kmeans_example.py
-
-This example requires NumPy (http://www.numpy.org/).
-"""
-from __future__ import print_function
-import sys
-
-# $example on$
-from pyspark.ml.clustering import KMeans
-from pyspark.ml.evaluation import ClusteringEvaluator
-# $example off$
-
-from pyspark.sql import SparkSession
-
-if __name__ == "__main__":
-    spark = SparkSession\
-        .builder\
-        .appName("KMeansExample")\
-        .getOrCreate()
-
-    if len(sys.argv) != 2:
-        print("Require data file path as input parameter")
-        sys.exit(1)
-
-    # $example on$
-    # Loads data.
-    dataset = spark.read.format("libsvm").load(sys.argv[1])
-
-    # Trains a k-means model.
-    kmeans = KMeans().setK(2).setSeed(1)
-    model = kmeans.fit(dataset)
-
-    # Make predictions
-    predictions = model.transform(dataset)
-
-    # Evaluate clustering by computing Silhouette score
-    evaluator = ClusteringEvaluator()
-
-    silhouette = evaluator.evaluate(predictions)
-    print("Silhouette with squared euclidean distance = " + str(silhouette))
-
-    # Shows the result.
-    centers = model.clusterCenters()
-    print("Cluster Centers: ")
-    for center in centers:
-        print(center)
-    # $example off$
-
-    spark.stop()
diff --git a/dev/test-cluster/workloads/run-kmeans-pyspark.sh b/dev/test-cluster/workloads/run-kmeans-pyspark.sh
deleted file mode 100755
index e07f3f7b6..000000000
--- a/dev/test-cluster/workloads/run-kmeans-pyspark.sh
+++ /dev/null
@@ -1,48 +0,0 @@
-#!/usr/bin/env bash
-
-source ../envs.sh
-
-# Data file is from Spark Examples (data/mllib/sample_kmeans_data.txt); it should be copied to HDFS first
-$HADOOP_HOME/bin/hadoop fs -mkdir -p $OAP_MLLIB_DATA_ROOT
-$HADOOP_HOME/bin/hadoop fs -copyFromLocal $SPARK_HOME/data/mllib/sample_kmeans_data.txt $OAP_MLLIB_DATA_ROOT
-
-# Users should check that the requested resources are actually allocated by the cluster manager, otherwise Intel MLlib will behave incorrectly
-SPARK_MASTER=yarn
-SPARK_DRIVER_MEMORY=1G
-SPARK_NUM_EXECUTORS=2
-SPARK_EXECUTOR_CORES=1
-SPARK_EXECUTOR_MEMORY=1G
-
-SPARK_DEFAULT_PARALLELISM=$(expr $SPARK_NUM_EXECUTORS '*' $SPARK_EXECUTOR_CORES '*' 2)
-
-# ======================================================= #
-
-# Check env
-if [[ -z $SPARK_HOME ]]; then
-    echo SPARK_HOME not defined!
-    exit 1
-fi
-
-if [[ -z $HADOOP_HOME ]]; then
-    echo HADOOP_HOME not defined!
-    exit 1
-fi
-
-APP_PY="$OAP_MLLIB_ROOT/dev/test-cluster/workloads/kmeans-pyspark.py"
-DATA_FILE=$OAP_MLLIB_DATA_ROOT/sample_kmeans_data.txt
-
-$SPARK_HOME/bin/spark-submit --master $SPARK_MASTER -v \
-    --num-executors $SPARK_NUM_EXECUTORS \
-    --driver-memory $SPARK_DRIVER_MEMORY \
-    --executor-cores $SPARK_EXECUTOR_CORES \
-    --executor-memory $SPARK_EXECUTOR_MEMORY \
-    --conf "spark.serializer=org.apache.spark.serializer.KryoSerializer" \
-    --conf "spark.default.parallelism=$SPARK_DEFAULT_PARALLELISM" \
-    --conf "spark.sql.shuffle.partitions=$SPARK_DEFAULT_PARALLELISM" \
-    --conf "spark.driver.extraClassPath=$SPARK_DRIVER_CLASSPATH" \
-    --conf "spark.executor.extraClassPath=$SPARK_EXECUTOR_CLASSPATH" \
-    --conf "spark.shuffle.reduceLocality.enabled=false" \
-    --conf "spark.network.timeout=1200s" \
-    --conf "spark.task.maxFailures=1" \
-    --jars $OAP_MLLIB_JAR \
-    $APP_PY $DATA_FILE
diff --git a/dev/test-cluster/yarn-site.xml b/dev/test-cluster/yarn-site.xml
deleted file mode 100644
index ff74d23a7..000000000
--- a/dev/test-cluster/yarn-site.xml
+++ /dev/null
@@ -1,67 +0,0 @@
-<?xml version="1.0"?>
-<!--
-  Licensed under the Apache License, Version 2.0 (the "License");
-  you may not use this file except in compliance with the License.
-  You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-  Unless required by applicable law or agreed to in writing, software
-  distributed under the License is distributed on an "AS IS" BASIS,
-  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-  See the License for the specific language governing permissions and
-  limitations under the License. See accompanying LICENSE file.
--->
-<configuration>
-    <property>
-        <name>yarn.nodemanager.aux-services</name>
-        <value>mapreduce_shuffle</value>
-    </property>
-    <property>
-        <name>yarn.nodemanager.aux-services.mapreduce.shuffle.class</name>
-        <value>org.apache.hadoop.mapred.ShuffleHandler</value>
-    </property>
-    <property>
-        <name>yarn.resourcemanager.hostname</name>
-        <value>localhost</value>
-    </property>
-    <property>
-        <name>yarn.resourcemanager.address</name>
-        <value>localhost:8032</value>
-    </property>
-
-    <property>
-        <name>yarn.nodemanager.resource.memory-mb</name>
-        <value>7168</value>
-    </property>
-    <property>
-        <name>yarn.nodemanager.resource.cpu-vcores</name>
-        <value>2</value>
-    </property>
-    <property>
-        <name>yarn.nodemanager.vmem-check-enabled</name>
-        <value>false</value>
-    </property>
-    <property>
-        <name>yarn.nodemanager.vmem-pmem-ratio</name>
-        <value>2</value>
-    </property>
-    <property>
-        <name>yarn.scheduler.minimum-allocation-mb</name>
-        <value>1024</value>
-    </property>
-    <property>
-        <name>yarn.scheduler.maximum-allocation-mb</name>
-        <value>7168</value>
-    </property>
-    <property>
-        <name>yarn.scheduler.minimum-allocation-vcores</name>
-        <value>1</value>
-    </property>
-    <property>
-        <name>yarn.scheduler.maximum-allocation-vcores</name>
-        <value>2</value>
-    </property>
-</configuration>
diff --git a/mllib-dal/test-cluster.sh b/mllib-dal/test-cluster.sh
deleted file mode 100755
index 4f5a6132a..000000000
--- a/mllib-dal/test-cluster.sh
+++ /dev/null
@@ -1,5 +0,0 @@
-#!/usr/bin/env bash
-
-cd ../dev/test-cluster/workloads
-
-./run-kmeans-pyspark.sh
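
If the pseudo-cluster testing setup ever needs to come back, reverting this
revert restores every deleted file in one step; a sketch, using this commit's
own hash from the "From" line above:

  $ git revert 437953eb6d2b9cdffde06fc2d7aa5dfffb728849

Alternatively, the files can be checked out selectively from the original
commit without creating a new revert commit:

  $ git checkout 6fe5d3e93f673e5d9f04c24f54bb6c93cad6f7f5 -- dev/test-cluster mllib-dal/test-cluster.sh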