Skip to content

Commit

Permalink
Support Spark 3.3.0, 3.3.1, 3.3.2
Browse files Browse the repository at this point in the history
  • Loading branch information
chu11 committed Apr 5, 2023
1 parent 5648d68 commit 8a2676b
Show file tree
Hide file tree
Showing 51 changed files with 4,452 additions and 56 deletions.
4 changes: 2 additions & 2 deletions NEWS
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@ Magpie 3.1
MAGPIE_HOSTNAME_CMD_MAP.
- Support Hadoop 3.2.4, 3.3.4, 3.3.5
- Update Hadoop default to 3.3.5
- Support Spark 3.2.2, 3.2.3
- Update Spark default to 3.2.3-bin-hadoop3.2
- Support Spark 3.2.2, 3.2.3, 3.3.0, 3.3.1, 3.3.2
- Update Spark default to 3.3.2-bin-hadoop3

Magpie 3.0
----------
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ Hadoop - 2.2.0, 2.3.0, 2.4.X, 2.5.X, 2.6.X, 2.7.X, 2.8.X, 2.9.X,
3.0.X, 3.1.X, 3.2.X, 3.3.X

Spark - 1.1.X, 1.2.X, 1.3.X, 1.4.X, 1.5.X, 1.6.X, 2.0.X, 2.1.X, 2.2.X,
2.3.X, 2.4.X, 3.0.X, 3.1.X, 3.2.X
2.3.X, 2.4.X, 3.0.X, 3.1.X, 3.2.X, 3.3.X

Hbase - 1.0.X, 1.1.X, 1.2.X, 1.3.X, 1.4.X, 1.5.X, 1.6.X

Expand Down
5 changes: 4 additions & 1 deletion doc/README
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,10 @@ Spark - 1.1.0-bin-hadoop2.3+, 1.1.0-bin-hadoop2.4+,
3.2.0-bin-hadoop2.7+!, 3.2.0-bin-hadoop3.2+!,
3.2.1-bin-hadoop2.7+!, 3.2.1-bin-hadoop3.2+!,
3.2.2-bin-hadoop2.7+!, 3.2.2-bin-hadoop3.2+!,
3.2.3-bin-hadoop2.7+!, 3.2.3-bin-hadoop3.2+!
3.2.3-bin-hadoop2.7+!, 3.2.3-bin-hadoop3.2+!,
3.3.0-bin-hadoop2+!, 3.3.0-bin-hadoop3+!,
3.3.1-bin-hadoop2+!, 3.3.1-bin-hadoop3+!,
3.3.2-bin-hadoop2+!, 3.3.2-bin-hadoop3+!

TensorFlow - 1.9, 1.12

Expand Down
2 changes: 1 addition & 1 deletion misc/magpie-download-and-setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ HBASE_PACKAGE="hbase/1.6.0/hbase-1.6.0-bin.tar.gz"
HIVE_PACKAGE="hive/2.3.0/apache-hive-2.3.0.tar.gz"
PIG_PACKAGE="pig/pig-0.17.0/pig-0.17.0.tar.gz"
ZOOKEEPER_PACKAGE="zookeeper/zookeeper-3.4.14/zookeeper-3.4.14.tar.gz"
SPARK_PACKAGE="spark/spark-3.2.3/spark-3.2.3-bin-hadoop3.2.tgz"
SPARK_PACKAGE="spark/spark-3.3.2/spark-3.3.2-bin-hadoop3.tgz"
SPARK_HADOOP_PACKAGE="hadoop/common/hadoop-3.2.1/hadoop-3.2.1.tar.gz"
STORM_PACKAGE="storm/apache-storm-1.2.3/apache-storm-1.2.3.tar.gz"
PHOENIX_PACKAGE="phoenix/apache-phoenix-4.14.0-HBase-1.4/bin/apache-phoenix-4.14.0-HBase-1.4-bin.tar.gz"
Expand Down
168 changes: 168 additions & 0 deletions patches/spark/spark-3.3.0-bin-hadoop2-alternate.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
diff -pruN spark-3.3.0-bin-hadoop2-orig/sbin/spark-daemon.sh spark-3.3.0-bin-hadoop2-alternate/sbin/spark-daemon.sh
--- spark-3.3.0-bin-hadoop2-orig/sbin/spark-daemon.sh 2022-06-09 12:56:55.000000000 -0700
+++ spark-3.3.0-bin-hadoop2-alternate/sbin/spark-daemon.sh 2023-04-04 10:08:27.978437000 -0700
@@ -174,7 +174,8 @@ run_command() {

if [ "$SPARK_MASTER" != "" ]; then
echo rsync from "$SPARK_MASTER"
- rsync -a -e ssh --delete --exclude=.svn --exclude='logs/*' --exclude='contrib/hod/logs/*' "$SPARK_MASTER/" "${SPARK_HOME}"
+ RSH_CMD=${SPARK_SSH_CMD:-ssh}
+ rsync -a -e $RSH_CMD --delete --exclude=.svn --exclude='logs/*' --exclude='contrib/hod/logs/*' "$SPARK_MASTER/" "${SPARK_HOME}"
fi

spark_rotate_log "$log"
diff -pruN spark-3.3.0-bin-hadoop2-orig/sbin/spark-daemons.sh spark-3.3.0-bin-hadoop2-alternate/sbin/spark-daemons.sh
--- spark-3.3.0-bin-hadoop2-orig/sbin/spark-daemons.sh 2022-06-09 12:56:55.000000000 -0700
+++ spark-3.3.0-bin-hadoop2-alternate/sbin/spark-daemons.sh 2023-04-04 10:08:27.982446000 -0700
@@ -31,6 +31,24 @@ if [ -z "${SPARK_HOME}" ]; then
export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)"
fi

+# Check if --config is passed as an argument. It is an optional parameter.
+# Exit if the argument is not a directory.
+
+if [ "$1" == "--config" ]
+then
+ shift
+ conf_dir=$1
+ if [ ! -d "$conf_dir" ]
+ then
+ echo "ERROR : $conf_dir is not a directory"
+ echo $usage
+ exit 1
+ else
+ export SPARK_CONF_DIR=$conf_dir
+ fi
+ shift
+fi
+
. "${SPARK_HOME}/sbin/spark-config.sh"

-exec "${SPARK_HOME}/sbin/workers.sh" cd "${SPARK_HOME}" \; "${SPARK_HOME}/sbin/spark-daemon.sh" "$@"
+exec "${SPARK_HOME}/sbin/workers.sh" --config $SPARK_CONF_DIR cd "${SPARK_HOME}" \; "${SPARK_HOME}/sbin/spark-daemon.sh" "$@"
diff -pruN spark-3.3.0-bin-hadoop2-orig/sbin/start-worker.sh spark-3.3.0-bin-hadoop2-alternate/sbin/start-worker.sh
--- spark-3.3.0-bin-hadoop2-orig/sbin/start-worker.sh 2022-06-09 12:56:55.000000000 -0700
+++ spark-3.3.0-bin-hadoop2-alternate/sbin/start-worker.sh 2023-04-04 10:08:27.985454000 -0700
@@ -50,6 +50,24 @@ if [[ $# -lt 1 ]] || [[ "$@" = *--help ]
exit 1
fi

+# Check if --config is passed as an argument. It is an optional parameter.
+# Exit if the argument is not a directory.
+
+if [ "$1" == "--config" ]
+then
+ shift
+ conf_dir=$1
+ if [ ! -d "$conf_dir" ]
+ then
+ echo "ERROR : $conf_dir is not a directory"
+ echo $usage
+ exit 1
+ else
+ export SPARK_CONF_DIR=$conf_dir
+ fi
+ shift
+fi
+
. "${SPARK_HOME}/sbin/spark-config.sh"

. "${SPARK_HOME}/bin/load-spark-env.sh"
@@ -79,8 +97,14 @@ function start_instance {
fi
WEBUI_PORT=$(( $SPARK_WORKER_WEBUI_PORT + $WORKER_NUM - 1 ))

- "${SPARK_HOME}/sbin"/spark-daemon.sh start $CLASS $WORKER_NUM \
- --webui-port "$WEBUI_PORT" $PORT_FLAG $PORT_NUM $MASTER "$@"
+ if [ "${SPARK_CONF_DIR}X" != "X" ]
+ then
+ "${SPARK_HOME}/sbin"/spark-daemon.sh --config $SPARK_CONF_DIR start $CLASS $WORKER_NUM \
+ --webui-port "$WEBUI_PORT" $PORT_FLAG $PORT_NUM $MASTER "$@"
+ else
+ "${SPARK_HOME}/sbin"/spark-daemon.sh start $CLASS $WORKER_NUM \
+ --webui-port "$WEBUI_PORT" $PORT_FLAG $PORT_NUM $MASTER "$@"
+ fi
}

if [ "$SPARK_WORKER_INSTANCES" = "" ]; then
diff -pruN spark-3.3.0-bin-hadoop2-orig/sbin/start-workers.sh spark-3.3.0-bin-hadoop2-alternate/sbin/start-workers.sh
--- spark-3.3.0-bin-hadoop2-orig/sbin/start-workers.sh 2022-06-09 12:56:55.000000000 -0700
+++ spark-3.3.0-bin-hadoop2-alternate/sbin/start-workers.sh 2023-04-04 10:08:27.988451000 -0700
@@ -43,4 +43,4 @@ if [ "$SPARK_MASTER_HOST" = "" ]; then
fi

# Launch the workers
-"${SPARK_HOME}/sbin/workers.sh" cd "${SPARK_HOME}" \; "${SPARK_HOME}/sbin/start-worker.sh" "spark://$SPARK_MASTER_HOST:$SPARK_MASTER_PORT"
+"${SPARK_HOME}/sbin/workers.sh" --config $SPARK_CONF_DIR cd "${SPARK_HOME}" \; "${SPARK_HOME}/sbin/start-worker.sh" --config $SPARK_CONF_DIR "spark://$SPARK_MASTER_HOST:$SPARK_MASTER_PORT"
diff -pruN spark-3.3.0-bin-hadoop2-orig/sbin/stop-worker.sh spark-3.3.0-bin-hadoop2-alternate/sbin/stop-worker.sh
--- spark-3.3.0-bin-hadoop2-orig/sbin/stop-worker.sh 2022-06-09 12:56:55.000000000 -0700
+++ spark-3.3.0-bin-hadoop2-alternate/sbin/stop-worker.sh 2023-04-04 10:08:27.991448000 -0700
@@ -31,6 +31,23 @@ if [ -z "${SPARK_HOME}" ]; then
export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)"
fi

+# Check if --config is passed as an argument. It is an optional parameter.
+# Exit if the argument is not a directory.
+if [ "$1" == "--config" ]
+then
+ shift
+ conf_dir="$1"
+ if [ ! -d "$conf_dir" ]
+ then
+ echo "ERROR : $conf_dir is not a directory"
+ echo $usage
+ exit 1
+ else
+ export SPARK_CONF_DIR="$conf_dir"
+ fi
+ shift
+fi
+
. "${SPARK_HOME}/sbin/spark-config.sh"

. "${SPARK_HOME}/bin/load-spark-env.sh"
diff -pruN spark-3.3.0-bin-hadoop2-orig/sbin/stop-workers.sh spark-3.3.0-bin-hadoop2-alternate/sbin/stop-workers.sh
--- spark-3.3.0-bin-hadoop2-orig/sbin/stop-workers.sh 2022-06-09 12:56:55.000000000 -0700
+++ spark-3.3.0-bin-hadoop2-alternate/sbin/stop-workers.sh 2023-04-04 10:08:27.995433000 -0700
@@ -25,4 +25,4 @@ fi

. "${SPARK_HOME}/bin/load-spark-env.sh"

-"${SPARK_HOME}/sbin/workers.sh" cd "${SPARK_HOME}" \; "${SPARK_HOME}/sbin"/stop-worker.sh
+"${SPARK_HOME}/sbin/workers.sh" --config $SPARK_CONF_DIR cd "${SPARK_HOME}" \; "${SPARK_HOME}/sbin"/stop-worker.sh --config $SPARK_CONF_DIR
diff -pruN spark-3.3.0-bin-hadoop2-orig/sbin/workers.sh spark-3.3.0-bin-hadoop2-alternate/sbin/workers.sh
--- spark-3.3.0-bin-hadoop2-orig/sbin/workers.sh 2022-06-09 12:56:55.000000000 -0700
+++ spark-3.3.0-bin-hadoop2-alternate/sbin/workers.sh 2023-04-04 10:08:27.998437000 -0700
@@ -25,6 +25,8 @@
# Default is ${SPARK_CONF_DIR}/workers.
# SPARK_CONF_DIR Alternate conf dir. Default is ${SPARK_HOME}/conf.
# SPARK_WORKER_SLEEP Seconds to sleep between spawning remote commands.
+# SPARK_SSH_CMD Specify an alternate remote shell command.
+# Defaults to ssh if not specified.
# SPARK_SSH_OPTS Options passed to ssh when running remote commands.
##

@@ -93,19 +95,19 @@ if [ "$HOSTLIST" = "" ]; then
fi
fi

-
+RSH_CMD=${SPARK_SSH_CMD:-ssh}

# By default disable strict host key checking
-if [ "$SPARK_SSH_OPTS" = "" ]; then
+if [ "$RSH_CMD" == "ssh" ] && [ "$SPARK_SSH_OPTS" = "" ]; then
SPARK_SSH_OPTS="-o StrictHostKeyChecking=no"
fi

for host in `echo "$HOSTLIST"|sed "s/#.*$//;/^$/d"`; do
if [ -n "${SPARK_SSH_FOREGROUND}" ]; then
- ssh $SPARK_SSH_OPTS "$host" $"${@// /\\ }" \
+ $RSH_CMD $SPARK_SSH_OPTS "$host" $"${@// /\\ }" \
2>&1 | sed "s/^/$host: /"
else
- ssh $SPARK_SSH_OPTS "$host" $"${@// /\\ }" \
+ $RSH_CMD $SPARK_SSH_OPTS "$host" $"${@// /\\ }" \
2>&1 | sed "s/^/$host: /" &
fi
if [ "$SPARK_WORKER_SLEEP" != "" ]; then
Loading

0 comments on commit 8a2676b

Please sign in to comment.