Add spark 3.5
julien bignon committed Aug 22, 2024
1 parent f8a8311 commit 0e2613d
Showing 26 changed files with 885 additions and 1 deletion.
1 change: 1 addition & 0 deletions technologies/job/spark/spark-2.4/context.yaml
@@ -3,3 +3,4 @@ label: "2.4"
 available: true
 recommended: false
 trustLevel: stable
+deprecationDate: "2024-09-01T00:00:00Z"
1 change: 1 addition & 0 deletions technologies/job/spark/spark-3.0/context.yaml
@@ -3,3 +3,4 @@ label: "3.0"
 available: true
 recommended: false
 trustLevel: stable
+deprecationDate: "2024-09-01T00:00:00Z"
3 changes: 2 additions & 1 deletion technologies/job/spark/spark-3.1/context.yaml
@@ -1,5 +1,6 @@
 id: "3.1"
 label: "3.1"
 available: true
-recommended: true
+recommended: false
 trustLevel: stable
+deprecationDate: "2024-09-01T00:00:00Z"
61 changes: 61 additions & 0 deletions technologies/job/spark/spark-3.5/Dockerfile
@@ -0,0 +1,61 @@
FROM openjdk:11-jre-slim-bullseye

ARG SPARK_VERSION=3.5.2
ARG HADOOP_VERSION=3
ARG TINI_VERSION="v0.18.0"

ENV DEBIAN_FRONTEND noninteractive

ENV SPARK_HOME /opt/spark
ENV PATH "$PATH:$SPARK_HOME/bin"
ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/lib/hadoop/lib/native"

ENV HADOOP_CONF_DIR=/etc/hadoop/conf

# LIGHT DEPENDENCIES START
RUN echo "deb http://deb.debian.org/debian/ bullseye-backports main contrib non-free" | tee /etc/apt/sources.list.d/bullseye-backports.list && \
apt update -qq && apt install -yqq --no-install-recommends \
ftp wget curl unzip telnet openssh-client krb5-user zip procps && \
rm -rf /var/lib/apt/lists/*
# LIGHT DEPENDENCIES END

# TINI INSTALL START
RUN set -ex && \
mkdir -p /opt/spark && \
mkdir -p /opt/spark/work-dir && \
touch /opt/spark/RELEASE && \
rm /bin/sh && \
ln -sv /bin/bash /bin/sh && \
echo "auth required pam_wheel.so use_uid" >> /etc/pam.d/su && \
chgrp root /etc/passwd && chmod ug+rw /etc/passwd && \
export TINI_HOME="/usr/local/sbin" && \
curl -fSL "https://github.com/krallin/tini/releases/download/$TINI_VERSION/tini" -o "${TINI_HOME}/tini" && \
curl -fSL "https://github.com/krallin/tini/releases/download/$TINI_VERSION/tini.asc" -o "${TINI_HOME}/tini.asc" && \
chmod +x "${TINI_HOME}/tini" && \
ln -s ${TINI_HOME}/tini /sbin/tini && \
"${TINI_HOME}/tini" -h
# TINI INSTALL END

# SPARK INSTALL START
RUN mkdir -p /tmp/spark && \
cd /tmp/spark && \
wget -nv https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz && \
tar xf spark-*.tgz && \
rm spark-*.tgz && \
cp -R /tmp/spark/*/jars /opt/spark && \
cp -R /tmp/spark/*/bin /opt/spark && \
cp -R /tmp/spark/*/sbin /opt/spark && \
rm -Rf /tmp/spark
# SPARK INSTALL END

COPY assets/hive_1.1.0_jars_download.sh /tmp/

RUN chmod +x /tmp/hive_1.1.0_jars_download.sh && \
/tmp/hive_1.1.0_jars_download.sh

COPY entrypoint.sh /opt/
RUN chmod 755 /opt/entrypoint.sh

WORKDIR /sandbox/

ENTRYPOINT [ "/opt/entrypoint.sh" ]
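
A hedged, illustration-only smoke test for this image (not part of the commit); the tag follows dockerInfo.yaml further down, and SPARK_VERSION/HADOOP_VERSION default to 3.5.2 / 3 via the ARGs above:

# Hypothetical local build of the spark-3.5 context
docker build -t saagie/spark:3.5 technologies/job/spark/spark-3.5/
# Bypass the entrypoint to print the bundled Spark version
docker run --rm --entrypoint /opt/spark/bin/spark-submit saagie/spark:3.5 --version
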
22 changes: 22 additions & 0 deletions technologies/job/spark/spark-3.5/build.gradle.kts
@@ -0,0 +1,22 @@
/*
* SPDX-License-Identifier: Apache-2.0
*
* Copyright 2019-2021.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import com.bmuschko.gradle.docker.DockerRemoteApiPlugin
import com.saagie.technologies.SaagieTechnologiesGradlePlugin

apply<DockerRemoteApiPlugin>()
apply<SaagieTechnologiesGradlePlugin>()
5 changes: 5 additions & 0 deletions technologies/job/spark/spark-3.5/context.yaml
@@ -0,0 +1,5 @@
id: "3.5"
label: "3.5"
available: true
recommended: true
trustLevel: stable
4 changes: 4 additions & 0 deletions technologies/job/spark/spark-3.5/dockerInfo.yaml
@@ -0,0 +1,4 @@
image: saagie/spark
baseTag: 3.5
dynamicVersion: 1.139.0_SDKTECHNO-207
version: 3.5-1.139.0
54 changes: 54 additions & 0 deletions technologies/job/spark/spark-3.5/entrypoint.sh
@@ -0,0 +1,54 @@
#!/bin/bash
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# echo commands to the terminal output
set -e

# Check whether there is a passwd entry for the container UID
myuid=$(id -u)
mygid=$(id -g)
# turn off -e for getent because it will return error code in anonymous uid case
set +e
uidentry=$(getent passwd $myuid)
set -e

# If there is no passwd entry for the container UID, attempt to create one
if [ -z "$uidentry" ] ; then
if [ -w /etc/passwd ] ; then
echo "$myuid:x:$myuid:$mygid:${SPARK_USER_NAME:-anonymous uid}:$SPARK_HOME:/bin/false" >> /etc/passwd
else
echo "Container ENTRYPOINT failed to add passwd entry for anonymous UID"
fi
fi

# BEGIN SAAGIE SPECIFIC CODE
mkdir -p /opt/spark/conf/
cat conf/*.conf > /opt/spark/conf/spark-defaults.conf
echo "spark.kubernetes.driver.label.io.saagie/spark-submit-pod-uid $SPARK_SUBMIT_POD_UID" >> /opt/spark/conf/spark-defaults.conf

if test -f main_script;
then
# parse content and if pyfiles extract minio url and inject it
if grep -q "\--py-files" main_script;
then
echo "spark.kubernetes.driverEnv.PYSPARK_FILES `awk -F '.*--py-files=| ' '{print $2}' main_script`" >> /opt/spark/conf/spark-defaults.conf
fi;
sh ./main_script;
else exec "$@"
fi;
# END SAAGIE SPECIFIC CODE
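
A quick sketch of how the Saagie-specific block above extracts the --py-files value; the main_script content below is hypothetical and only for illustration:

# Hypothetical main_script as the platform might generate it
echo 'spark-submit --py-files=https://minio.example.com/bucket/deps.zip main.py' > main_script
# The field separator matches either ".*--py-files=" or a single space, so $2 is the
# URL that the entrypoint appends to spark.kubernetes.driverEnv.PYSPARK_FILES
awk -F '.*--py-files=| ' '{print $2}' main_script
# -> https://minio.example.com/bucket/deps.zip
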
62 changes: 62 additions & 0 deletions technologies/job/spark/spark-3.5/image_test.yaml
@@ -0,0 +1,62 @@
schemaVersion: "2.0.0"

metadataTest:
env:
- key: LANG
value: "C.UTF-8"
- key: JAVA_HOME
value: "/usr/local/openjdk-11"
- key: SPARK_HOME
value: "/opt/spark"

fileExistenceTests:
- name: "entrypoint.sh"
path: "/opt/entrypoint.sh"
shouldExist: true
permissions: "-rwxr-xr-x"

- name: "kinit"
path: "/usr/bin/kinit"
shouldExist: true
permissions: "-rwxr-xr-x"

fileContentTests:
- name: "entrypoint.sh"
path: "/opt/entrypoint.sh"
expectedContents:
[
'mkdir -p /opt/spark/conf/',
'cat conf/\*\.conf > /opt/spark/conf/spark-defaults.conf',
'echo "spark\.kubernetes\.driver\.label\.io\.saagie/spark-submit-pod-uid \$SPARK_SUBMIT_POD_UID" >> /opt/spark/conf/spark-defaults.conf',
'sh \./main_script',
]

commandTests:
- name: "java installation"
command: "which"
args: ["java"]
expectedOutput: ["/usr/local/openjdk-11/bin/java"]

- name: "java version"
command: "java"
args: ["-version"]
expectedError: ['openjdk version "11*']

- name: "Workdir"
command: "pwd"
expectedOutput: ["/sandbox"]

- name: "Spark version"
command: "/opt/spark/bin/spark-submit"
args: ["--version"]
expectedError: ["version 3.5.*"]

- name: "spark-submit on path"
command: "which"
args: ["spark-submit"]
expectedOutput: ["/opt/spark/bin/spark-submit"]

- name: "unzip"
command: "which"
args: ["unzip"]
expectedOutput: ["/usr/bin/unzip"]
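
These checks use the container-structure-test schema (schemaVersion 2.0.0); a hedged example of running them locally, assuming the image was built with the tag declared in dockerInfo.yaml:

container-structure-test test \
  --image saagie/spark:3.5-1.139.0 \
  --config technologies/job/spark/spark-3.5/image_test.yaml
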
59 changes: 59 additions & 0 deletions technologies/job/spark/spark-3.5/innerContexts/jre/Dockerfile
@@ -0,0 +1,59 @@
ARG jre_major=11
FROM openjdk:${jre_major}-slim-bullseye

ARG SPARK_VERSION=3.5.2
ARG HADOOP_VERSION=3
ARG TINI_VERSION="v0.18.0"

ENV DEBIAN_FRONTEND noninteractive

ENV SPARK_HOME /opt/spark
ENV PATH "$PATH:$SPARK_HOME/bin"
ENV HADOOP_CONF_DIR=/etc/hadoop/conf
ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/lib/hadoop/lib/native"


# LIGHT DEPENDENCIES START
RUN echo "deb http://deb.debian.org/debian/ bullseye-backports main contrib non-free" | tee /etc/apt/sources.list.d/bullseye-backports.list && \
apt update -qq && apt install -yqq --no-install-recommends \
ftp wget curl unzip telnet openssh-client krb5-user zip procps && \
rm -rf /var/lib/apt/lists/*
# LIGHT DEPENDENCIES END

RUN set -ex && \
mkdir -p /opt/spark && \
mkdir -p /opt/spark/work-dir && \
touch /opt/spark/RELEASE && \
rm /bin/sh && \
ln -sv /bin/bash /bin/sh && \
echo "auth required pam_wheel.so use_uid" >> /etc/pam.d/su && \
chgrp root /etc/passwd && chmod ug+rw /etc/passwd && \
export TINI_HOME="/usr/local/sbin" && \
curl -fSL "https://github.com/krallin/tini/releases/download/$TINI_VERSION/tini" -o "${TINI_HOME}/tini" && \
curl -fSL "https://github.com/krallin/tini/releases/download/$TINI_VERSION/tini.asc" -o "${TINI_HOME}/tini.asc" && \
chmod +x "${TINI_HOME}/tini" && \
ln -s ${TINI_HOME}/tini /sbin/tini && \
"${TINI_HOME}/tini" -h

RUN mkdir -p /tmp/spark && \
cd /tmp/spark && \
wget -nv https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz && \
tar xf spark-*.tgz && \
rm spark-*.tgz && \
cp -R /tmp/spark/*/jars /opt/spark && \
cp -R /tmp/spark/*/bin /opt/spark && \
cp -R /tmp/spark/*/sbin /opt/spark && \
rm -Rf /tmp/spark

# See the hadoop version used by Spark and update if necessary.
# See https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-aws/3.3.4 to get the right version of aws-java-sdk-bundle
RUN wget -nv https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.12.262/aws-java-sdk-bundle-1.12.262.jar && \
wget -nv https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.4/hadoop-aws-3.3.4.jar && \
mv *.jar /opt/spark/jars/

COPY entrypoint.sh /opt/
RUN chmod 755 /opt/entrypoint.sh

WORKDIR /opt/spark/work-dir

ENTRYPOINT [ "/opt/entrypoint.sh" ]
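
A hedged sketch of building this inner context locally; the build args mirror the ARG declarations above, and the local tag is illustrative only:

docker build \
  --build-arg jre_major=11 \
  --build-arg SPARK_VERSION=3.5.2 \
  --build-arg HADOOP_VERSION=3 \
  -t spark-3.5-jre:local \
  technologies/job/spark/spark-3.5/innerContexts/jre/
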
18 changes: 18 additions & 0 deletions technologies/job/spark/spark-3.5/innerContexts/jre/context.yaml
@@ -0,0 +1,18 @@
id: java-scala
label: Java/Scala
available: true
trustLevel: stable
job:
features:
- type: COMMAND_LINE
label: Command line
mandatory: true
comment: Linux shell command to launch the job.
defaultValue: "spark-submit \\\n--conf spark.executor.memory=1G \\\n--conf spark.executor.cores=1 \\\n--conf spark.kubernetes.executor.limit.cores=1 \\\n--conf spark.executor.instances=2 \\\n--class=Main {file} arg1 arg2"
- type: ARTIFACT
label: Package
mandatory: true
comment: "Compatible upload file : .jar"
- type: SCHEDULER
label: Scheduled
mandatory: true
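
For readability, a hedged rendering of the COMMAND_LINE defaultValue above as the shell command a job would run; {file} is presumably the placeholder the platform replaces with the uploaded .jar package:

spark-submit \
  --conf spark.executor.memory=1G \
  --conf spark.executor.cores=1 \
  --conf spark.kubernetes.executor.limit.cores=1 \
  --conf spark.executor.instances=2 \
  --class=Main {file} arg1 arg2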