Skip to content

Commit

Permalink
Add spark 3.5
Browse files Browse the repository at this point in the history
  • Loading branch information
julien bignon committed Aug 27, 2024
1 parent f8a8311 commit 6784cb1
Show file tree
Hide file tree
Showing 27 changed files with 773 additions and 1 deletion.
1 change: 1 addition & 0 deletions technologies/job/spark/spark-2.4/context.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@ label: "2.4"
available: true
recommended: false
trustLevel: stable
deprecationDate: "2024-09-01T00:00:00Z"
1 change: 1 addition & 0 deletions technologies/job/spark/spark-3.0/context.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@ label: "3.0"
available: true
recommended: false
trustLevel: stable
deprecationDate: "2024-09-01T00:00:00Z"
3 changes: 2 additions & 1 deletion technologies/job/spark/spark-3.1/context.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
id: "3.1"
label: "3.1"
available: true
recommended: true
recommended: false
trustLevel: stable
deprecationDate: "2024-09-01T00:00:00Z"
22 changes: 22 additions & 0 deletions technologies/job/spark/spark-3.5/build.gradle.kts
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
/*
* SPDX-License-Identifier: Apache-2.0
*
* Copyright 2019-2021.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import com.bmuschko.gradle.docker.DockerRemoteApiPlugin
import com.saagie.technologies.SaagieTechnologiesGradlePlugin

apply<DockerRemoteApiPlugin>()
apply<SaagieTechnologiesGradlePlugin>()
5 changes: 5 additions & 0 deletions technologies/job/spark/spark-3.5/context.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
id: "3.5"
label: "3.5"
available: true
recommended: true
trustLevel: stable
22 changes: 22 additions & 0 deletions technologies/job/spark/spark-3.5/innerContexts/jre/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
ARG jre_major
FROM spark:3.5.2-scala2.12-java$jre_major-ubuntu

ENV PATH "$PATH:$SPARK_HOME/bin"
ENV LANG C.UTF-8

# LIGHT DEPENDENCIES START
USER root
RUN apt update -qq && apt install -yqq --no-install-recommends \
wget curl unzip krb5-user zip && \
rm -rf /var/lib/apt/lists/*s

COPY entrypoint.sh /opt/
RUN chmod 755 /opt/entrypoint.sh

USER spark

#See hadoop version used by spark and udpate if necessary.
#See https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-aws/3.3.4 to get right version of aws-java-sdk-bundle
RUN wget -nv https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.12.262/aws-java-sdk-bundle-1.12.262.jar && \
wget -nv https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.4/hadoop-aws-3.3.4.jar && \
mv *.jar /opt/spark/jars/
18 changes: 18 additions & 0 deletions technologies/job/spark/spark-3.5/innerContexts/jre/context.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
id: java-scala
label: Java/Scala
available: true
trustLevel: stable
job:
features:
- type: COMMAND_LINE
label: Command line
mandatory: true
comment: Linux shell command to launch the job.
defaultValue: "spark-submit \\\n--conf spark.executor.memory=1G \\\n--conf spark.executor.cores=1 \\\n--conf spark.kubernetes.executor.limit.cores=1 \\\n--conf spark.executor.instances=2 \\\n--class=Main {file} arg1 arg2"
- type: ARTIFACT
label: Package
mandatory: true
comment: "Compatible upload file : .jar"
- type: SCHEDULER
label: Scheduled
mandatory: true
141 changes: 141 additions & 0 deletions technologies/job/spark/spark-3.5/innerContexts/jre/entrypoint.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
#!/bin/bash
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Prevent any errors from being silently ignored
set -eo pipefail

attempt_setup_fake_passwd_entry() {
# Check whether there is a passwd entry for the container UID
local myuid; myuid="$(id -u)"
# If there is no passwd entry for the container UID, attempt to fake one
# You can also refer to the https://github.com/docker-library/official-images/pull/13089#issuecomment-1534706523
# It's to resolve OpenShift random UID case.
# See also: https://github.com/docker-library/postgres/pull/448
if ! getent passwd "$myuid" &> /dev/null; then
local wrapper
for wrapper in {/usr,}/lib{/*,}/libnss_wrapper.so; do
if [ -s "$wrapper" ]; then
NSS_WRAPPER_PASSWD="$(mktemp)"
NSS_WRAPPER_GROUP="$(mktemp)"
export LD_PRELOAD="$wrapper" NSS_WRAPPER_PASSWD NSS_WRAPPER_GROUP
local mygid; mygid="$(id -g)"
printf 'spark:x:%s:%s:${SPARK_USER_NAME:-anonymous uid}:%s:/bin/false\n' "$myuid" "$mygid" "$SPARK_HOME" > "$NSS_WRAPPER_PASSWD"
printf 'spark:x:%s:\n' "$mygid" > "$NSS_WRAPPER_GROUP"
break
fi
done
fi
}

if [ -z "$JAVA_HOME" ]; then
JAVA_HOME=$(java -XshowSettings:properties -version 2>&1 > /dev/null | grep 'java.home' | awk '{print $3}')
fi

SPARK_CLASSPATH="$SPARK_CLASSPATH:${SPARK_HOME}/jars/*"
for v in "${!SPARK_JAVA_OPT_@}"; do
SPARK_EXECUTOR_JAVA_OPTS+=( "${!v}" )
done

if [ -n "$SPARK_EXTRA_CLASSPATH" ]; then
SPARK_CLASSPATH="$SPARK_CLASSPATH:$SPARK_EXTRA_CLASSPATH"
fi

if ! [ -z "${PYSPARK_PYTHON+x}" ]; then
export PYSPARK_PYTHON
fi
if ! [ -z "${PYSPARK_DRIVER_PYTHON+x}" ]; then
export PYSPARK_DRIVER_PYTHON
fi

# If HADOOP_HOME is set and SPARK_DIST_CLASSPATH is not set, set it here so Hadoop jars are available to the executor.
# It does not set SPARK_DIST_CLASSPATH if already set, to avoid overriding customizations of this value from elsewhere e.g. Docker/K8s.
if [ -n "${HADOOP_HOME}" ] && [ -z "${SPARK_DIST_CLASSPATH}" ]; then
export SPARK_DIST_CLASSPATH="$($HADOOP_HOME/bin/hadoop classpath)"
fi

if ! [ -z "${HADOOP_CONF_DIR+x}" ]; then
SPARK_CLASSPATH="$HADOOP_CONF_DIR:$SPARK_CLASSPATH";
fi

if ! [ -z "${SPARK_CONF_DIR+x}" ]; then
SPARK_CLASSPATH="$SPARK_CONF_DIR:$SPARK_CLASSPATH";
elif ! [ -z "${SPARK_HOME+x}" ]; then
SPARK_CLASSPATH="$SPARK_HOME/conf:$SPARK_CLASSPATH";
fi

# SPARK-43540: add current working directory into executor classpath
SPARK_CLASSPATH="$SPARK_CLASSPATH:$PWD"

# Switch to spark if no USER specified (root by default) otherwise use USER directly
switch_spark_if_root() {
if [ $(id -u) -eq 0 ]; then
echo gosu spark
fi
}

case "$1" in
driver)
shift 1
CMD=(
"$SPARK_HOME/bin/spark-submit"
--conf "spark.driver.bindAddress=$SPARK_DRIVER_BIND_ADDRESS"
--conf "spark.executorEnv.SPARK_DRIVER_POD_IP=$SPARK_DRIVER_BIND_ADDRESS"
--deploy-mode client
"$@"
)
attempt_setup_fake_passwd_entry
# Execute the container CMD under tini for better hygiene
exec $(switch_spark_if_root) /usr/bin/tini -s -- "${CMD[@]}"
;;
executor)
shift 1
CMD=(
${JAVA_HOME}/bin/java
"${SPARK_EXECUTOR_JAVA_OPTS[@]}"
-Xms"$SPARK_EXECUTOR_MEMORY"
-Xmx"$SPARK_EXECUTOR_MEMORY"
-cp "$SPARK_CLASSPATH:$SPARK_DIST_CLASSPATH"
org.apache.spark.scheduler.cluster.k8s.KubernetesExecutorBackend
--driver-url "$SPARK_DRIVER_URL"
--executor-id "$SPARK_EXECUTOR_ID"
--cores "$SPARK_EXECUTOR_CORES"
--app-id "$SPARK_APPLICATION_ID"
--hostname "$SPARK_EXECUTOR_POD_IP"
--resourceProfileId "$SPARK_RESOURCE_PROFILE_ID"
--podName "$SPARK_EXECUTOR_POD_NAME"
)
attempt_setup_fake_passwd_entry
# Execute the container CMD under tini for better hygiene
exec $(switch_spark_if_root) /usr/bin/tini -s -- "${CMD[@]}"
;;

*)
# BEGIN SAAGIE SPECIFIC CODE
cd /sandbox
mkdir -p /opt/spark/conf/
cat conf/*.conf > /opt/spark/conf/spark-defaults.conf
if test -f main_script;
then
CMD=(/bin/sh ./main_script)
exec "${CMD[@]}"
else
# END SAAGIE SPECIFIC CODE
#Non-spark-on-k8s command provided, proceeding in pass-through mode...
exec "$@"
fi;
;;
esac
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
/*
* SPDX-License-Identifier: Apache-2.0
*
* Copyright 2019-2021.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import com.bmuschko.gradle.docker.DockerRemoteApiPlugin
import com.saagie.technologies.SaagieTechnologiesGradlePlugin
import com.saagie.technologies.readDockerInfo
import com.saagie.technologies.getVersionForDocker


apply<DockerRemoteApiPlugin>()
apply<SaagieTechnologiesGradlePlugin>()

tasks.withType(com.bmuschko.gradle.docker.tasks.image.DockerBuildImage::class) {
this.buildArgs.put(
"jre_major",
"11"
)
}
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
image: saagie/spark
baseTag: 3.5-jre-11
dynamicVersion: 1.125.0
version: 3.5-jre-11-1.125.0
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
schemaVersion: "2.0.0"

metadataTest:
env:
- key: LANG
value: "C.UTF-8"
- key: JAVA_HOME
value: "/opt/java/openjdk"
- key: SPARK_HOME
value: "/opt/spark"

fileExistenceTests:
- name: "entrypoint.sh"
path: "/opt/entrypoint.sh"
shouldExist: true
permissions: "-rwxr-xr-x"

- name: "kinit"
path: "/usr/bin/kinit"
shouldExist: true
permissions: "-rwxr-xr-x"

commandTests:
- name: "Workdir"
command: "pwd"
expectedOutput: ["/opt/spark/work-dir"]

- name: "Spark version"
command: "/opt/spark/bin/spark-submit"
args: ["--version"]
expectedError: ["version 3.5.*"]

- name: "krb5-user installation"
command: "kinit"
expectedError: ["kinit: Client's credentials have been revoked while getting initial credentials"]
exitCode: 1

- name: "wget"
args: ["--help"]
command: "wget"
exitCode: 0

- name: "curl"
args: ["--help"]
command: "curl"
exitCode: 0

- name: "unzip"
args: ["--help"]
command: "unzip"
exitCode: 0

- name: "tar"
args: ["--help"]
command: "tar"
exitCode: 0

- name: "tini"
command: "/usr/bin/tini"
args: ["--version"]
expectedOutput: ["tini version 0.18.0.*"]
exitCode: 0
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
id: "11"
label: "11"
available: true
trustLevel: stable
recommended: true
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
/*
* SPDX-License-Identifier: Apache-2.0
*
* Copyright 2019-2021.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import com.bmuschko.gradle.docker.DockerRemoteApiPlugin
import com.saagie.technologies.SaagieTechnologiesGradlePlugin
import com.saagie.technologies.readDockerInfo
import com.saagie.technologies.getVersionForDocker


apply<DockerRemoteApiPlugin>()
apply<SaagieTechnologiesGradlePlugin>()

tasks.withType(com.bmuschko.gradle.docker.tasks.image.DockerBuildImage::class) {
this.buildArgs.put(
"jre_major",
"17"
)
}
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
image: saagie/spark
baseTag: 3.5-jre-17
dynamicVersion: 1.125.0
version: 3.5-jre-11-1.125.0
Loading

0 comments on commit 6784cb1

Please sign in to comment.