From b2a50626a04e93f615464c1a57db8edb77a58957 Mon Sep 17 00:00:00 2001
From: julien bignon
Date: Thu, 22 Aug 2024 18:00:20 +0200
Subject: [PATCH] Use the official Spark Docker image.

---
 .../spark-3.5/innerContexts/jre/Dockerfile    |  53 +--------
 .../spark-3.5/innerContexts/jre/entrypoint.sh | 107 ------------------
 .../jre/spark-3.5-jre-11/image_test.yaml      |   4 +-
 .../spark-3.5/innerContexts/python/Dockerfile |  14 +--
 .../spark-3.5-python-3.9/image_test.yaml      |   4 +-
 5 files changed, 11 insertions(+), 171 deletions(-)
 delete mode 100644 technologies/job/spark/spark-3.5/innerContexts/jre/entrypoint.sh

diff --git a/technologies/job/spark/spark-3.5/innerContexts/jre/Dockerfile b/technologies/job/spark/spark-3.5/innerContexts/jre/Dockerfile
index 4974844f2..446939fcd 100644
--- a/technologies/job/spark/spark-3.5/innerContexts/jre/Dockerfile
+++ b/technologies/job/spark/spark-3.5/innerContexts/jre/Dockerfile
@@ -1,59 +1,18 @@
-ARG jre_major=11
-FROM openjdk:${jre_major}-slim-bullseye
+FROM spark:3.5.2
 
-ARG SPARK_VERSION=3.5.2
-ARG HADOOP_VERSION=3
-ARG TINI_VERSION="v0.18.0"
-
-ENV DEBIAN_FRONTEND noninteractive
-
-ENV SPARK_HOME /opt/spark
 ENV PATH "$PATH:$SPARK_HOME/bin"
-ENV HADOOP_CONF_DIR=/etc/hadoop/conf
-ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/lib/hadoop/lib/native"
-
+ENV LANG C.UTF-8
 
 # LIGHT DEPENDENCIES START
-RUN echo "deb http://deb.debian.org/debian/ bullseye-backports main contrib non-free" | tee /etc/apt/sources.list.d/bulleseye-backports.list && \
-    apt update -qq && apt install -yqq --no-install-recommends \
-    ftp wget curl unzip telnet openssh-client krb5-user zip procps && \
+USER root
+RUN apt update -qq && apt install -yqq --no-install-recommends \
+    wget curl unzip krb5-user zip && \
     rm -rf /var/lib/apt/lists/*
-# LIGHT DEPENDENCIES END
-
-RUN set -ex && \
-    mkdir -p /opt/spark && \
-    mkdir -p /opt/spark/work-dir && \
-    touch /opt/spark/RELEASE && \
-    rm /bin/sh && \
-    ln -sv /bin/bash /bin/sh && \
-    echo "auth required pam_wheel.so use_uid" >> /etc/pam.d/su && \
-    chgrp root /etc/passwd && chmod ug+rw /etc/passwd && \
-    export TINI_HOME="/usr/local/sbin" && \
-    curl -fSL "https://github.com/krallin/tini/releases/download/$TINI_VERSION/tini" -o "${TINI_HOME}/tini" && \
-    curl -fSL "https://github.com/krallin/tini/releases/download/$TINI_VERSION/tini.asc" -o "${TINI_HOME}/tini.asc" && \
-    chmod +x "${TINI_HOME}/tini" && \
-    ln -s ${TINI_HOME}/tini /sbin/tini && \
-    "${TINI_HOME}/tini" -h
-RUN mkdir -p /tmp/spark && \
-    cd /tmp/spark && \
-    wget -nv https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz && \
-    tar xf spark-*.tgz && \
-    rm spark-*.tgz && \
-    cp -R /tmp/spark/*/jars /opt/spark && \
-    cp -R /tmp/spark/*/bin /opt/spark && \
-    cp -R /tmp/spark/*/sbin /opt/spark && \
-    rm -Rf /tmp/spark
+USER spark
 
 #See hadoop version used by spark and update if necessary.
 #See https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-aws/3.3.4 to get right version of aws-java-sdk-bundle
 RUN wget -nv https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.12.262/aws-java-sdk-bundle-1.12.262.jar && \
     wget -nv https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.4/hadoop-aws-3.3.4.jar && \
     mv *.jar /opt/spark/jars/
-
-COPY entrypoint.sh /opt/
-RUN chmod 755 /opt/entrypoint.sh
-
-WORKDIR /opt/spark/work-dir
-
-ENTRYPOINT [ "/opt/entrypoint.sh" ]
 
diff --git a/technologies/job/spark/spark-3.5/innerContexts/jre/entrypoint.sh b/technologies/job/spark/spark-3.5/innerContexts/jre/entrypoint.sh
deleted file mode 100644
index 4299d581d..000000000
--- a/technologies/job/spark/spark-3.5/innerContexts/jre/entrypoint.sh
+++ /dev/null
@@ -1,107 +0,0 @@
-#!/bin/bash
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-# echo commands to the terminal output
-set -e
-
-# Check whether there is a passwd entry for the container UID
-myuid=$(id -u)
-mygid=$(id -g)
-# turn off -e for getent because it will return error code in anonymous uid case
-set +e
-uidentry=$(getent passwd $myuid)
-set -e
-
-# If there is no passwd entry for the container UID, attempt to create one
-if [ -z "$uidentry" ] ; then
-    if [ -w /etc/passwd ] ; then
-        echo "$myuid:x:$myuid:$mygid:${SPARK_USER_NAME:-anonymous uid}:$SPARK_HOME:/bin/false" >> /etc/passwd
-    else
-        echo "Container ENTRYPOINT failed to add passwd entry for anonymous UID"
-    fi
-fi
-
-SPARK_CLASSPATH="$SPARK_CLASSPATH:${SPARK_HOME}/jars/*"
-env | grep SPARK_JAVA_OPT_ | sort -t_ -k4 -n | sed 's/[^=]*=\(.*\)/\1/g' > /tmp/java_opts.txt
-readarray -t SPARK_EXECUTOR_JAVA_OPTS < /tmp/java_opts.txt
-
-if [ -n "$SPARK_EXTRA_CLASSPATH" ]; then
-  SPARK_CLASSPATH="$SPARK_CLASSPATH:$SPARK_EXTRA_CLASSPATH"
-fi
-
-# If HADOOP_HOME is set and SPARK_DIST_CLASSPATH is not set, set it here so Hadoop jars are available to the executor.
-# It does not set SPARK_DIST_CLASSPATH if already set, to avoid overriding customizations of this value from elsewhere e.g. Docker/K8s.
-if [ -n "${HADOOP_HOME}" ] && [ -z "${SPARK_DIST_CLASSPATH}" ]; then
-  export SPARK_DIST_CLASSPATH="$($HADOOP_HOME/bin/hadoop classpath)"
-fi
-
-if ! [ -z ${HADOOP_CONF_DIR+x} ]; then
-  SPARK_CLASSPATH="$HADOOP_CONF_DIR:$SPARK_CLASSPATH";
-fi
-
-if ! [ -z ${SPARK_CONF_DIR+x} ]; then
-  SPARK_CLASSPATH="$SPARK_CONF_DIR:$SPARK_CLASSPATH";
-elif ! [ -z ${SPARK_HOME+x} ]; then
-  SPARK_CLASSPATH="$SPARK_HOME/conf:$SPARK_CLASSPATH";
-fi
-
-case "$1" in
-  driver)
-    shift 1
-    CMD=(
-      "$SPARK_HOME/bin/spark-submit"
-      --conf "spark.driver.bindAddress=$SPARK_DRIVER_BIND_ADDRESS"
-      --deploy-mode client
-      "$@"
-    )
-    ;;
-  executor)
-    shift 1
-    CMD=(
-      ${JAVA_HOME}/bin/java
-      "${SPARK_EXECUTOR_JAVA_OPTS[@]}"
-      -Xms$SPARK_EXECUTOR_MEMORY
-      -Xmx$SPARK_EXECUTOR_MEMORY
-      -cp "$SPARK_CLASSPATH:$SPARK_DIST_CLASSPATH"
-      org.apache.spark.executor.CoarseGrainedExecutorBackend
-      --driver-url $SPARK_DRIVER_URL
-      --executor-id $SPARK_EXECUTOR_ID
-      --cores $SPARK_EXECUTOR_CORES
-      --app-id $SPARK_APPLICATION_ID
-      --hostname $SPARK_EXECUTOR_POD_IP
-      --resourceProfileId $SPARK_RESOURCE_PROFILE_ID
-    )
-    ;;
-
-  *)
-    cd /sandbox
-    mkdir -p /opt/spark/conf/
-    cat conf/*.conf > /opt/spark/conf/spark-defaults.conf
-    echo "spark.kubernetes.driver.pod.name $HOSTNAME" >> /opt/spark/conf/spark-defaults.conf
-    if test -f main_script;
-    then
-      CMD=(/bin/sh ./main_script)
-    else
-      echo "Non-spark-on-k8s command provided, proceeding in pass-through mode...TOTO"
-      CMD=("$@")
-    fi;
-    ;;
-esac
-
-# Execute the container CMD under tini for better hygiene
-exec /sbin/tini -s -- "${CMD[@]}"
diff --git a/technologies/job/spark/spark-3.5/innerContexts/jre/spark-3.5-jre-11/image_test.yaml b/technologies/job/spark/spark-3.5/innerContexts/jre/spark-3.5-jre-11/image_test.yaml
index 3bf680313..9bd0cf508 100644
--- a/technologies/job/spark/spark-3.5/innerContexts/jre/spark-3.5-jre-11/image_test.yaml
+++ b/technologies/job/spark/spark-3.5/innerContexts/jre/spark-3.5-jre-11/image_test.yaml
@@ -5,11 +5,9 @@ metadataTest:
     - key: LANG
       value: "C.UTF-8"
     - key: JAVA_HOME
-      value: "/usr/local/openjdk-11"
+      value: "/usr/local/openjdk"
     - key: SPARK_HOME
       value: "/opt/spark"
-    - key: "HADOOP_CONF_DIR"
-      value: "/etc/hadoop/conf"
 
 fileExistenceTests:
   - name: "entrypoint.sh"
diff --git a/technologies/job/spark/spark-3.5/innerContexts/python/Dockerfile b/technologies/job/spark/spark-3.5/innerContexts/python/Dockerfile
index 6fdb4a056..b0c11acd6 100644
--- a/technologies/job/spark/spark-3.5/innerContexts/python/Dockerfile
+++ b/technologies/job/spark/spark-3.5/innerContexts/python/Dockerfile
@@ -9,17 +9,11 @@
 COPY --from=SPARK_BASE /opt/spark /opt/spark
 COPY --from=SPARK_BASE /usr/local/sbin/tini /usr/local/sbin/tini
 COPY --from=SPARK_BASE /sbin/tini /sbin/tini
-COPY --from=SPARK_BASE /usr/local/openjdk-11 /usr/local/openjdk-11
+COPY --from=SPARK_BASE /opt/java/openjdk /opt/java/openjdk
 
-ENV JAVA_HOME /usr/local/openjdk-11
+ENV JAVA_HOME /opt/java/openjdk
 ENV LANG C.UTF-8
 ENV SPARK_HOME /opt/spark
-ENV HADOOP_CONF_DIR=/etc/hadoop/conf
-
-# ADD ps binaries for load-spark-env.sh
-RUN apt update -qq && apt install -yqq --no-install-recommends \
-    procps && \
-    rm -rf /var/lib/apt/lists/*
 
 RUN pip --no-cache-dir install --upgrade pip \
  && pip --no-cache-dir install pyspark==3.5.2 \
@@ -35,6 +29,4 @@ RUN chmod 755 /opt/entrypoint.sh
 
 WORKDIR /opt/spark/work-dir
 
-ENTRYPOINT [ "/opt/entrypoint.sh" ]
-
-ENV REBUILD ME
\ No newline at end of file
+ENTRYPOINT [ "/opt/entrypoint.sh" ]
\ No newline at end of file
diff --git a/technologies/job/spark/spark-3.5/innerContexts/python/spark-3.5-python-3.9/image_test.yaml b/technologies/job/spark/spark-3.5/innerContexts/python/spark-3.5-python-3.9/image_test.yaml
index 308c7751e..7ce16fc4d 100644
--- a/technologies/job/spark/spark-3.5/innerContexts/python/spark-3.5-python-3.9/image_test.yaml
+++ b/technologies/job/spark/spark-3.5/innerContexts/python/spark-3.5-python-3.9/image_test.yaml
@@ -5,11 +5,9 @@ metadataTest:
     - key: LANG
       value: "C.UTF-8"
     - key: JAVA_HOME
-      value: "/usr/local/openjdk-11"
+      value: "/usr/local/openjdk"
    - key: SPARK_HOME
       value: "/opt/spark"
-    - key: "HADOOP_CONF_DIR"
-      value: "/etc/hadoop/conf"
 
 fileExistenceTests:
   - name: "entrypoint.sh"