98 changes: 98 additions & 0 deletions Dockerfile.template
@@ -0,0 +1,98 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
FROM {{ BASE_IMAGE }}

ARG spark_uid=185

RUN groupadd --system --gid=${spark_uid} spark && \
    useradd --system --uid=${spark_uid} --gid=spark spark

RUN set -ex && \
    apt-get update && \
    ln -s /lib /lib64 && \
    apt install -y gnupg2 wget bash tini libc6 libpam-modules krb5-user libnss3 procps net-tools gosu && \
{%- if HAVE_PY %}
    apt install -y python3 python3-pip && \
    pip3 install --upgrade pip setuptools && \
{%- endif %}
{%- if HAVE_R %}
    apt install -y r-base r-base-dev && \
{%- endif %}
    mkdir -p /opt/spark && \
{%- if HAVE_PY %}
    mkdir /opt/spark/python && \
{%- endif %}
    mkdir -p /opt/spark/examples && \
    mkdir -p /opt/spark/work-dir && \
    touch /opt/spark/RELEASE && \
    chown -R spark:spark /opt/spark && \
    rm /bin/sh && \
    ln -sv /bin/bash /bin/sh && \
    echo "auth required pam_wheel.so use_uid" >> /etc/pam.d/su && \
    chgrp root /etc/passwd && chmod ug+rw /etc/passwd && \
    rm -rf /var/cache/apt/*

# Install Apache Spark
# https://downloads.apache.org/spark/KEYS
ENV SPARK_TGZ_URL=https://dlcdn.apache.org/spark/spark-{{ SPARK_VERSION }}/spark-{{ SPARK_VERSION }}-bin-hadoop3.tgz \
    SPARK_TGZ_ASC_URL=https://downloads.apache.org/spark/spark-{{ SPARK_VERSION }}/spark-{{ SPARK_VERSION }}-bin-hadoop3.tgz.asc \
    GPG_KEY=E298A3A825C0D65DFD57CBB651716619E084DAB9

RUN set -ex; \
    export SPARK_TMP="$(mktemp -d)"; \
    cd $SPARK_TMP; \
    wget -nv -O spark.tgz "$SPARK_TGZ_URL"; \
    wget -nv -O spark.tgz.asc "$SPARK_TGZ_ASC_URL"; \
    export GNUPGHOME="$(mktemp -d)"; \
    gpg --keyserver hkps://keyserver.pgp.com --recv-keys "$GPG_KEY" || \
    gpg --keyserver hkps://keyserver.ubuntu.com --recv-keys "$GPG_KEY"; \
    gpg --batch --verify spark.tgz.asc spark.tgz; \
    gpgconf --kill all; \
    rm -rf "$GNUPGHOME" spark.tgz.asc; \
    \
    tar -xf spark.tgz --strip-components=1; \
    chown -R spark:spark .; \
    mv jars /opt/spark/; \
    mv bin /opt/spark/; \
    mv sbin /opt/spark/; \
    mv kubernetes/dockerfiles/spark/decom.sh /opt/; \
    mv examples /opt/spark/; \
    mv kubernetes/tests /opt/spark/; \
    mv data /opt/spark/; \
{%- if HAVE_PY %}
    mv python/pyspark /opt/spark/python/pyspark/; \
    mv python/lib /opt/spark/python/lib/; \
{%- endif %}
{%- if HAVE_R %}
    mv R /opt/spark/; \
{%- endif %}
    cd ..; \
    rm -rf "$SPARK_TMP";

COPY entrypoint.sh /opt/

ENV SPARK_HOME /opt/spark
{%- if HAVE_R %}
ENV R_HOME /usr/lib/R
{%- endif %}

WORKDIR /opt/spark/work-dir
RUN chmod g+w /opt/spark/work-dir
RUN chmod a+x /opt/decom.sh
RUN chmod a+x /opt/entrypoint.sh

ENTRYPOINT [ "/opt/entrypoint.sh" ]
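A rendered copy of this template can be built and smoke-tested directly with Docker. The commands below are a sketch; the tag and directory names are illustrative, assuming the per-version layout created by add-dockerfiles.sh (next file):

$ ./add-dockerfiles.sh 3.3.0
$ docker build -t spark:3.3.0-scala2.12-java11-ubuntu 3.3.0/scala2.12-java11-ubuntu/
$ docker run --rm spark:3.3.0-scala2.12-java11-ubuntu /opt/spark/bin/spark-submit --version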
53 changes: 53 additions & 0 deletions add-dockerfiles.sh
@@ -0,0 +1,53 @@
#!/usr/bin/env bash

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Usage: $0 [version]
# Generate Dockerfiles for the specified Spark version.

# Examples:
# - Add 3.3.0 Dockerfiles:
#   $ ./add-dockerfiles.sh
# - Add 3.3.1 Dockerfiles:
#   $ ./add-dockerfiles.sh 3.3.1

VERSION=${1:-"3.3.0"}

TAGS="
scala2.12-java11-python3-r-ubuntu
scala2.12-java11-python3-ubuntu
scala2.12-java11-r-ubuntu
scala2.12-java11-ubuntu
"

for TAG in $TAGS; do
  OPTS=""
  if echo $TAG | grep -q "python"; then
    OPTS+=" --pyspark"
  fi

  if echo $TAG | grep -q "r-"; then
    OPTS+=" --sparkr"
  fi

  OPTS+=" --spark-version $VERSION"

  mkdir -p $VERSION/$TAG
  cp -f entrypoint.sh.template $VERSION/$TAG/entrypoint.sh
  python3 tools/template.py $OPTS > $VERSION/$TAG/Dockerfile
done
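For reference, a run of the script should leave one directory per tag, each holding a rendered Dockerfile plus a copy of the entrypoint script. The listing below is a sketch of the expected result derived from the TAGS list above, not output captured from this change:

$ ./add-dockerfiles.sh
$ find 3.3.0 -type f | sort
3.3.0/scala2.12-java11-python3-r-ubuntu/Dockerfile
3.3.0/scala2.12-java11-python3-r-ubuntu/entrypoint.sh
3.3.0/scala2.12-java11-python3-ubuntu/Dockerfile
3.3.0/scala2.12-java11-python3-ubuntu/entrypoint.sh
3.3.0/scala2.12-java11-r-ubuntu/Dockerfile
3.3.0/scala2.12-java11-r-ubuntu/entrypoint.sh
3.3.0/scala2.12-java11-ubuntu/Dockerfile
3.3.0/scala2.12-java11-ubuntu/entrypoint.sh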
114 changes: 114 additions & 0 deletions entrypoint.sh.template
@@ -0,0 +1,114 @@
#!/bin/bash
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Check whether there is a passwd entry for the container UID
myuid=$(id -u)
mygid=$(id -g)
# turn off -e for getent because it returns an error code when the UID has no passwd entry (the anonymous-uid case)
set +e
uidentry=$(getent passwd $myuid)
set -e

# If there is no passwd entry for the container UID, attempt to create one
if [ -z "$uidentry" ] ; then
if [ -w /etc/passwd ] ; then
echo "$myuid:x:$myuid:$mygid:${SPARK_USER_NAME:-anonymous uid}:$SPARK_HOME:/bin/false" >> /etc/passwd
else
echo "Container ENTRYPOINT failed to add passwd entry for anonymous UID"
fi
fi

if [ -z "$JAVA_HOME" ]; then
JAVA_HOME=$(java -XshowSettings:properties -version 2>&1 > /dev/null | grep 'java.home' | awk '{print $3}')
fi

SPARK_CLASSPATH="$SPARK_CLASSPATH:${SPARK_HOME}/jars/*"
env | grep SPARK_JAVA_OPT_ | sort -t_ -k4 -n | sed 's/[^=]*=\(.*\)/\1/g' > /tmp/java_opts.txt
readarray -t SPARK_EXECUTOR_JAVA_OPTS < /tmp/java_opts.txt

if [ -n "$SPARK_EXTRA_CLASSPATH" ]; then
SPARK_CLASSPATH="$SPARK_CLASSPATH:$SPARK_EXTRA_CLASSPATH"
fi

if ! [ -z ${PYSPARK_PYTHON+x} ]; then
export PYSPARK_PYTHON
fi
if ! [ -z ${PYSPARK_DRIVER_PYTHON+x} ]; then
export PYSPARK_DRIVER_PYTHON
fi

# If HADOOP_HOME is set and SPARK_DIST_CLASSPATH is not set, set it here so Hadoop jars are available to the executor.
# It does not overwrite SPARK_DIST_CLASSPATH if it is already set, to avoid clobbering customizations made elsewhere (e.g., Docker/K8s).
if [ -n "${HADOOP_HOME}" ] && [ -z "${SPARK_DIST_CLASSPATH}" ]; then
  export SPARK_DIST_CLASSPATH="$($HADOOP_HOME/bin/hadoop classpath)"
fi

if ! [ -z ${HADOOP_CONF_DIR+x} ]; then
  SPARK_CLASSPATH="$HADOOP_CONF_DIR:$SPARK_CLASSPATH";
fi

if ! [ -z ${SPARK_CONF_DIR+x} ]; then
  SPARK_CLASSPATH="$SPARK_CONF_DIR:$SPARK_CLASSPATH";
elif ! [ -z ${SPARK_HOME+x} ]; then
  SPARK_CLASSPATH="$SPARK_HOME/conf:$SPARK_CLASSPATH";
fi
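# Net effect of the classpath assembly above (a sketch of the precedence, not
# an exhaustive spec): the conf directory ($SPARK_CONF_DIR if set, otherwise
# $SPARK_HOME/conf) comes first, then $HADOOP_CONF_DIR if set, then any
# inherited SPARK_CLASSPATH, then $SPARK_HOME/jars/*, and finally
# $SPARK_EXTRA_CLASSPATH.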

case "$1" in
driver)
shift 1
CMD=(
"$SPARK_HOME/bin/spark-submit"
--conf "spark.driver.bindAddress=$SPARK_DRIVER_BIND_ADDRESS"
--deploy-mode client
"$@"
)
;;
executor)
shift 1
CMD=(
${JAVA_HOME}/bin/java
"${SPARK_EXECUTOR_JAVA_OPTS[@]}"
-Xms$SPARK_EXECUTOR_MEMORY
-Xmx$SPARK_EXECUTOR_MEMORY
-cp "$SPARK_CLASSPATH:$SPARK_DIST_CLASSPATH"
org.apache.spark.scheduler.cluster.k8s.KubernetesExecutorBackend
--driver-url $SPARK_DRIVER_URL
--executor-id $SPARK_EXECUTOR_ID
--cores $SPARK_EXECUTOR_CORES
--app-id $SPARK_APPLICATION_ID
--hostname $SPARK_EXECUTOR_POD_IP
--resourceProfileId $SPARK_RESOURCE_PROFILE_ID
--podName $SPARK_EXECUTOR_POD_NAME
)
;;

*)
# Non-spark-on-k8s command provided, proceeding in pass-through mode...
CMD=("$@")
;;
esac

# Switch to the spark user if no USER is specified (root by default); otherwise run as that USER directly
switch_spark_if_root() {
  if [ "$(id -u)" -eq 0 ]; then
    echo gosu spark
  fi
}

# Execute the container CMD under tini for better hygiene
exec $(switch_spark_if_root) /usr/bin/tini -s -- "${CMD[@]}"
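To illustrate the pass-through branch (a sketch; the image tag is hypothetical), any first argument other than driver or executor becomes CMD and runs under tini, with root dropped to the spark user via gosu:

$ docker run -it --rm spark:3.3.0-scala2.12-java11-ubuntu bash
# inside the container this expands to: exec gosu spark /usr/bin/tini -s -- bash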
1 change: 1 addition & 0 deletions tools/requirements.txt
@@ -0,0 +1 @@
jinja2
84 changes: 84 additions & 0 deletions tools/template.py
@@ -0,0 +1,84 @@
#!/usr/bin/env python3

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from argparse import ArgumentParser

from jinja2 import Environment, FileSystemLoader


def parse_opts():
    parser = ArgumentParser(prog="template")

    parser.add_argument(
        "-f",
        "--template-file",
        help="The Dockerfile template file path.",
        default="Dockerfile.template",
    )

    parser.add_argument(
        "-v",
        "--spark-version",
        help="The Spark version for the generated Dockerfile.",
        default="3.3.0",
    )

    parser.add_argument(
        "-i",
        "--image",
        help="The base image of the generated Dockerfile.",
        default="eclipse-temurin:11-jre-focal",
    )

    parser.add_argument(
        "-p",
        "--pyspark",
        action="store_true",
        help="Whether to include PySpark support.",
    )

    parser.add_argument(
        "-r",
        "--sparkr",
        action="store_true",
        help="Whether to include SparkR support.",
    )

    args, unknown = parser.parse_known_args()
    if unknown:
        parser.error("Unsupported arguments: %s" % " ".join(unknown))
    return args


def main():
    opts = parse_opts()
    env = Environment(loader=FileSystemLoader("./"))
    template = env.get_template(opts.template_file)
    print(
        template.render(
            BASE_IMAGE=opts.image,
            HAVE_PY=opts.pyspark,
            HAVE_R=opts.sparkr,
            SPARK_VERSION=opts.spark_version,
        )
    )


if __name__ == "__main__":
    main()
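A minimal sketch of invoking the generator directly, mirroring what add-dockerfiles.sh does per tag; run it from the repository root so FileSystemLoader("./") can locate Dockerfile.template:

$ pip3 install -r tools/requirements.txt
$ python3 tools/template.py --spark-version 3.3.1 --pyspark --sparkr > Dockerfile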