Skip to content

Commit

Permalink
Support Spark 2.3.3, 2.3.4, 2.4.5
Browse files Browse the repository at this point in the history
  • Loading branch information
chu11 committed Apr 30, 2020
1 parent d0ecdfe commit eac964b
Show file tree
Hide file tree
Showing 43 changed files with 4,416 additions and 31 deletions.
4 changes: 3 additions & 1 deletion NEWS
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@ Add support for Hbase 1.4.10, 1.4.13
- Update default Hbase to 1.4.10, 1.4.13
Add support for Zeppelin 0.8.2.
- Update default Zeppelin to 0.8.2.
Support Spark 2.4.4-bin-hadoop2.6, 2.4.4-bin-hadoop2.7
Support Spark 2.3.3-bin-hadoop2.6, 2.3.3-bin-hadoop2.7,
2.3.4-bin-hadoop2.6, 2.3.4-bin-hadoop2.7, 2.4.5-bin-hadoop2.6,
2.4.5-bin-hadoop2.7
- Update Spark default to 2.4.4-bin-hadoop2.7

Magpie 2.3
Expand Down
5 changes: 4 additions & 1 deletion doc/README
Original file line number Diff line number Diff line change
Expand Up @@ -187,11 +187,14 @@ Spark - 0.9.1-bin-hadoop2+, 0.9.2-bin-hadoop2+,
2.3.0-bin-hadoop2.6+!, 2.3.0-bin-hadoop2.7+!,
2.3.1-bin-hadoop2.6+!, 2.3.1-bin-hadoop2.7+!,
2.3.2-bin-hadoop2.6+!, 2.3.2-bin-hadoop2.7+!,
2.3.3-bin-hadoop2.6+!, 2.3.3-bin-hadoop2.7+!,
2.3.4-bin-hadoop2.6+!, 2.3.4-bin-hadoop2.7+!,
2.4.0-bin-hadoop2.6+!, 2.4.0-bin-hadoop2.7+!,
2.4.1-bin-hadoop2.6+!, 2.4.1-bin-hadoop2.7+!,
2.4.2-bin-hadoop2.6+!, 2.4.2-bin-hadoop2.7+!,
2.4.3-bin-hadoop2.6+!, 2.4.3-bin-hadoop2.7+!,
2.4.4-bin-hadoop2.6+!, 2.4.4-bin-hadoop2.7+!
2.4.4-bin-hadoop2.6+!, 2.4.4-bin-hadoop2.7+!,
2.4.5-bin-hadoop2.6+!, 2.4.5-bin-hadoop2.7+!

TensorFlow - 1.9, 1.12

Expand Down
2 changes: 1 addition & 1 deletion misc/magpie-apache-download-and-setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ HIVE_PACKAGE="hive/2.3.0/apache-hive-2.3.0.tar.gz"
PIG_PACKAGE="pig/pig-0.17.0/pig-0.17.0.tar.gz"
MAHOUT_PACKAGE="mahout/0.13.0/apache-mahout-distribution-0.13.0.tar.gz"
ZOOKEEPER_PACKAGE="zookeeper/zookeeper-3.4.14/zookeeper-3.4.14.tar.gz"
SPARK_PACKAGE="spark/spark-2.4.4/spark-2.4.4-bin-hadoop2.7.tgz"
SPARK_PACKAGE="spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz"
SPARK_HADOOP_PACKAGE="hadoop/common/hadoop-2.7.3/hadoop-2.7.3.tar.gz"
STORM_PACKAGE="storm/apache-storm-1.2.3/apache-storm-1.2.3.tar.gz"
PHOENIX_PACKAGE="phoenix/apache-phoenix-4.14.0-HBase-1.4/bin/apache-phoenix-4.14.0-HBase-1.4-bin.tar.gz"
Expand Down
168 changes: 168 additions & 0 deletions patches/spark/spark-2.3.3-bin-hadoop2.6-alternate.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
diff -pruN spark-2.3.3-bin-hadoop2.6-orig/sbin/slaves.sh spark-2.3.3-bin-hadoop2.6-alternate/sbin/slaves.sh
--- spark-2.3.3-bin-hadoop2.6-orig/sbin/slaves.sh 2019-02-04 10:11:59.000000000 -0800
+++ spark-2.3.3-bin-hadoop2.6-alternate/sbin/slaves.sh 2020-04-28 22:40:49.075333000 -0700
@@ -25,6 +25,8 @@
# Default is ${SPARK_CONF_DIR}/slaves.
# SPARK_CONF_DIR Alternate conf dir. Default is ${SPARK_HOME}/conf.
# SPARK_SLAVE_SLEEP Seconds to sleep between spawning remote commands.
+# SPARK_SSH_CMD Specify an alternate remote shell command.
+# Defaults to ssh if not specified.
# SPARK_SSH_OPTS Options passed to ssh when running remote commands.
##

@@ -80,19 +82,19 @@ if [ "$HOSTLIST" = "" ]; then
fi
fi

-
+RSH_CMD=${SPARK_SSH_CMD:-ssh}

# By default disable strict host key checking
-if [ "$SPARK_SSH_OPTS" = "" ]; then
+if [ "$RSH_CMD" == "ssh" ] && [ "$SPARK_SSH_OPTS" = "" ]; then
SPARK_SSH_OPTS="-o StrictHostKeyChecking=no"
fi

for slave in `echo "$HOSTLIST"|sed "s/#.*$//;/^$/d"`; do
if [ -n "${SPARK_SSH_FOREGROUND}" ]; then
- ssh $SPARK_SSH_OPTS "$slave" $"${@// /\\ }" \
+ $RSH_CMD $SPARK_SSH_OPTS "$slave" $"${@// /\\ }" \
2>&1 | sed "s/^/$slave: /"
else
- ssh $SPARK_SSH_OPTS "$slave" $"${@// /\\ }" \
+ $RSH_CMD $SPARK_SSH_OPTS "$slave" $"${@// /\\ }" \
2>&1 | sed "s/^/$slave: /" &
fi
if [ "$SPARK_SLAVE_SLEEP" != "" ]; then
diff -pruN spark-2.3.3-bin-hadoop2.6-orig/sbin/spark-daemon.sh spark-2.3.3-bin-hadoop2.6-alternate/sbin/spark-daemon.sh
--- spark-2.3.3-bin-hadoop2.6-orig/sbin/spark-daemon.sh 2019-02-04 10:11:59.000000000 -0800
+++ spark-2.3.3-bin-hadoop2.6-alternate/sbin/spark-daemon.sh 2020-04-28 22:40:49.089333000 -0700
@@ -167,7 +167,8 @@ run_command() {

if [ "$SPARK_MASTER" != "" ]; then
echo rsync from "$SPARK_MASTER"
- rsync -a -e ssh --delete --exclude=.svn --exclude='logs/*' --exclude='contrib/hod/logs/*' "$SPARK_MASTER/" "${SPARK_HOME}"
+ RSH_CMD=${SPARK_SSH_CMD:-ssh}
+ rsync -a -e $RSH_CMD --delete --exclude=.svn --exclude='logs/*' --exclude='contrib/hod/logs/*' "$SPARK_MASTER/" "${SPARK_HOME}"
fi

spark_rotate_log "$log"
diff -pruN spark-2.3.3-bin-hadoop2.6-orig/sbin/spark-daemons.sh spark-2.3.3-bin-hadoop2.6-alternate/sbin/spark-daemons.sh
--- spark-2.3.3-bin-hadoop2.6-orig/sbin/spark-daemons.sh 2019-02-04 10:11:59.000000000 -0800
+++ spark-2.3.3-bin-hadoop2.6-alternate/sbin/spark-daemons.sh 2020-04-28 22:40:49.094335000 -0700
@@ -31,6 +31,24 @@ if [ -z "${SPARK_HOME}" ]; then
export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)"
fi

+# Check if --config is passed as an argument. It is an optional parameter.
+# Exit if the argument is not a directory.
+
+if [ "$1" == "--config" ]
+then
+ shift
+ conf_dir=$1
+ if [ ! -d "$conf_dir" ]
+ then
+ echo "ERROR : $conf_dir is not a directory"
+ echo $usage
+ exit 1
+ else
+ export SPARK_CONF_DIR=$conf_dir
+ fi
+ shift
+fi
+
. "${SPARK_HOME}/sbin/spark-config.sh"

-exec "${SPARK_HOME}/sbin/slaves.sh" cd "${SPARK_HOME}" \; "${SPARK_HOME}/sbin/spark-daemon.sh" "$@"
+exec "${SPARK_HOME}/sbin/slaves.sh" --config $SPARK_CONF_DIR cd "${SPARK_HOME}" \; "${SPARK_HOME}/sbin/spark-daemon.sh" "$@"
diff -pruN spark-2.3.3-bin-hadoop2.6-orig/sbin/start-slave.sh spark-2.3.3-bin-hadoop2.6-alternate/sbin/start-slave.sh
--- spark-2.3.3-bin-hadoop2.6-orig/sbin/start-slave.sh 2019-02-04 10:12:00.000000000 -0800
+++ spark-2.3.3-bin-hadoop2.6-alternate/sbin/start-slave.sh 2020-04-28 22:40:49.103331000 -0700
@@ -49,6 +49,24 @@ if [[ $# -lt 1 ]] || [[ "$@" = *--help ]
exit 1
fi

+# Check if --config is passed as an argument. It is an optional parameter.
+# Exit if the argument is not a directory.
+
+if [ "$1" == "--config" ]
+then
+ shift
+ conf_dir=$1
+ if [ ! -d "$conf_dir" ]
+ then
+ echo "ERROR : $conf_dir is not a directory"
+ echo $usage
+ exit 1
+ else
+ export SPARK_CONF_DIR=$conf_dir
+ fi
+ shift
+fi
+
. "${SPARK_HOME}/sbin/spark-config.sh"

. "${SPARK_HOME}/bin/load-spark-env.sh"
@@ -78,8 +96,14 @@ function start_instance {
fi
WEBUI_PORT=$(( $SPARK_WORKER_WEBUI_PORT + $WORKER_NUM - 1 ))

- "${SPARK_HOME}/sbin"/spark-daemon.sh start $CLASS $WORKER_NUM \
- --webui-port "$WEBUI_PORT" $PORT_FLAG $PORT_NUM $MASTER "$@"
+ if [ "${SPARK_CONF_DIR}X" != "X" ]
+ then
+ "${SPARK_HOME}/sbin"/spark-daemon.sh --config $SPARK_CONF_DIR start $CLASS $WORKER_NUM \
+ --webui-port "$WEBUI_PORT" $PORT_FLAG $PORT_NUM $MASTER "$@"
+ else
+ "${SPARK_HOME}/sbin"/spark-daemon.sh start $CLASS $WORKER_NUM \
+ --webui-port "$WEBUI_PORT" $PORT_FLAG $PORT_NUM $MASTER "$@"
+ fi
}

if [ "$SPARK_WORKER_INSTANCES" = "" ]; then
diff -pruN spark-2.3.3-bin-hadoop2.6-orig/sbin/start-slaves.sh spark-2.3.3-bin-hadoop2.6-alternate/sbin/start-slaves.sh
--- spark-2.3.3-bin-hadoop2.6-orig/sbin/start-slaves.sh 2019-02-04 10:12:00.000000000 -0800
+++ spark-2.3.3-bin-hadoop2.6-alternate/sbin/start-slaves.sh 2020-04-28 22:40:49.119340000 -0700
@@ -43,4 +43,4 @@ if [ "$SPARK_MASTER_HOST" = "" ]; then
fi

# Launch the slaves
-"${SPARK_HOME}/sbin/slaves.sh" cd "${SPARK_HOME}" \; "${SPARK_HOME}/sbin/start-slave.sh" "spark://$SPARK_MASTER_HOST:$SPARK_MASTER_PORT"
+"${SPARK_HOME}/sbin/slaves.sh" --config $SPARK_CONF_DIR cd "${SPARK_HOME}" \; "${SPARK_HOME}/sbin/start-slave.sh" --config $SPARK_CONF_DIR "spark://$SPARK_MASTER_HOST:$SPARK_MASTER_PORT"
diff -pruN spark-2.3.3-bin-hadoop2.6-orig/sbin/stop-slave.sh spark-2.3.3-bin-hadoop2.6-alternate/sbin/stop-slave.sh
--- spark-2.3.3-bin-hadoop2.6-orig/sbin/stop-slave.sh 2019-02-04 10:12:00.000000000 -0800
+++ spark-2.3.3-bin-hadoop2.6-alternate/sbin/stop-slave.sh 2020-04-28 22:40:49.124346000 -0700
@@ -31,6 +31,23 @@ if [ -z "${SPARK_HOME}" ]; then
export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)"
fi

+# Check if --config is passed as an argument. It is an optional parameter.
+# Exit if the argument is not a directory.
+if [ "$1" == "--config" ]
+then
+ shift
+ conf_dir="$1"
+ if [ ! -d "$conf_dir" ]
+ then
+ echo "ERROR : $conf_dir is not a directory"
+ echo $usage
+ exit 1
+ else
+ export SPARK_CONF_DIR="$conf_dir"
+ fi
+ shift
+fi
+
. "${SPARK_HOME}/sbin/spark-config.sh"

. "${SPARK_HOME}/bin/load-spark-env.sh"
diff -pruN spark-2.3.3-bin-hadoop2.6-orig/sbin/stop-slaves.sh spark-2.3.3-bin-hadoop2.6-alternate/sbin/stop-slaves.sh
--- spark-2.3.3-bin-hadoop2.6-orig/sbin/stop-slaves.sh 2019-02-04 10:12:00.000000000 -0800
+++ spark-2.3.3-bin-hadoop2.6-alternate/sbin/stop-slaves.sh 2020-04-28 22:40:49.134341000 -0700
@@ -25,4 +25,4 @@ fi

. "${SPARK_HOME}/bin/load-spark-env.sh"

-"${SPARK_HOME}/sbin/slaves.sh" cd "${SPARK_HOME}" \; "${SPARK_HOME}/sbin"/stop-slave.sh
+"${SPARK_HOME}/sbin/slaves.sh" --config $SPARK_CONF_DIR cd "${SPARK_HOME}" \; "${SPARK_HOME}/sbin"/stop-slave.sh --config $SPARK_CONF_DIR
Loading

0 comments on commit eac964b

Please sign in to comment.