Use official spark docker.
julien bignon committed Aug 22, 2024
1 parent 0e2613d commit 50d5aac
Showing 5 changed files with 12 additions and 206 deletions.
technologies/job/spark/spark-3.5/innerContexts/jre/Dockerfile (53 changes: 6 additions & 47 deletions)
@@ -1,59 +1,18 @@
-ARG jre_major=11
-FROM openjdk:${jre_major}-slim-bullseye
+FROM spark:3.5.2
 
-ARG SPARK_VERSION=3.5.2
-ARG HADOOP_VERSION=3
-ARG TINI_VERSION="v0.18.0"
-
-ENV DEBIAN_FRONTEND noninteractive
-
-ENV SPARK_HOME /opt/spark
-ENV PATH "$PATH:$SPARK_HOME/bin"
-ENV HADOOP_CONF_DIR=/etc/hadoop/conf
-ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/lib/hadoop/lib/native"
-
 ENV LANG C.UTF-8
 
 # LIGHT DEPENDENCIES START
-RUN echo "deb http://deb.debian.org/debian/ bullseye-backports main contrib non-free" | tee /etc/apt/sources.list.d/bulleseye-backports.list && \
-    apt update -qq && apt install -yqq --no-install-recommends \
-    ftp wget curl unzip telnet openssh-client krb5-user zip procps && \
+USER root
+RUN apt update -qq && apt install -yqq --no-install-recommends \
+    wget curl unzip krb5-user zip && \
     rm -rf /var/lib/apt/lists/*
 # LIGHT DEPENDENCIES END
 
-RUN set -ex && \
-    mkdir -p /opt/spark && \
-    mkdir -p /opt/spark/work-dir && \
-    touch /opt/spark/RELEASE && \
-    rm /bin/sh && \
-    ln -sv /bin/bash /bin/sh && \
-    echo "auth required pam_wheel.so use_uid" >> /etc/pam.d/su && \
-    chgrp root /etc/passwd && chmod ug+rw /etc/passwd && \
-    export TINI_HOME="/usr/local/sbin" && \
-    curl -fSL "https://github.com/krallin/tini/releases/download/$TINI_VERSION/tini" -o "${TINI_HOME}/tini" && \
-    curl -fSL "https://github.com/krallin/tini/releases/download/$TINI_VERSION/tini.asc" -o "${TINI_HOME}/tini.asc" && \
-    chmod +x "${TINI_HOME}/tini" && \
-    ln -s ${TINI_HOME}/tini /sbin/tini && \
-    "${TINI_HOME}/tini" -h
-
-RUN mkdir -p /tmp/spark && \
-    cd /tmp/spark && \
-    wget -nv https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz && \
-    tar xf spark-*.tgz && \
-    rm spark-*.tgz && \
-    cp -R /tmp/spark/*/jars /opt/spark && \
-    cp -R /tmp/spark/*/bin /opt/spark && \
-    cp -R /tmp/spark/*/sbin /opt/spark && \
-    rm -Rf /tmp/spark
+USER spark
 
 #See hadoop version used by spark and update if necessary.
 #See https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-aws/3.3.4 to get the right version of aws-java-sdk-bundle
 RUN wget -nv https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.12.262/aws-java-sdk-bundle-1.12.262.jar && \
     wget -nv https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.4/hadoop-aws-3.3.4.jar && \
     mv *.jar /opt/spark/jars/
-
-COPY entrypoint.sh /opt/
-RUN chmod 755 /opt/entrypoint.sh
-
-WORKDIR /opt/spark/work-dir
-
-ENTRYPOINT [ "/opt/entrypoint.sh" ]
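To sanity-check the slimmed-down image, a minimal sketch with plain docker commands; the spark-jre:3.5.2 tag is illustrative, not part of the commit:

    # Build the jre context and confirm Spark comes from the official base image.
    docker build -t spark-jre:3.5.2 technologies/job/spark/spark-3.5/innerContexts/jre
    docker run --rm --entrypoint /opt/spark/bin/spark-submit spark-jre:3.5.2 --version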
technologies/job/spark/spark-3.5/innerContexts/jre/entrypoint.sh (107 changes: 0 additions & 107 deletions)

This file was deleted. The custom entrypoint is no longer needed: the official spark base image already ships /opt/entrypoint.sh, which the image tests below still check for.
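A quick way to confirm that against the base image before relying on it (plain docker, nothing from this commit assumed):

    # Verify the official image's entrypoint and that the script actually exists.
    docker pull spark:3.5.2
    docker inspect --format '{{json .Config.Entrypoint}}' spark:3.5.2
    docker run --rm --entrypoint ls spark:3.5.2 -l /opt/entrypoint.sh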

Image test spec for the jre context:
@@ -5,11 +5,9 @@ metadataTest:
   - key: LANG
     value: "C.UTF-8"
   - key: JAVA_HOME
-    value: "/usr/local/openjdk-11"
+    value: "/opt/java/openjdk"
   - key: SPARK_HOME
     value: "/opt/spark"
-  - key: "HADOOP_CONF_DIR"
-    value: "/etc/hadoop/conf"
 
 fileExistenceTests:
   - name: "entrypoint.sh"
@@ -42,11 +40,6 @@ commandTests:
     expectedError: ['kinit: Program lacks support for encryption type while getting initial credentials']
     exitCode: 1
 
-  - name: "ftp"
-    args: ["-h"]
-    command: "ftp"
-    exitCode: 0
-
   - name: "wget"
     args: ["--help"]
     command: "wget"
@@ -67,18 +60,6 @@
     command: "tar"
     exitCode: 0
 
-  - name: "telnet"
-    command: "which"
-    args: ["telnet"]
-    expectedOutput: ["/usr/bin/telnet"]
-    exitCode: 0
-
-  - name: "scp"
-    command: "which"
-    args: ["scp"]
-    expectedOutput: ["/usr/bin/scp"]
-    exitCode: 0
-
   - name: "tini"
     command: "/sbin/tini"
     args: ["--version"]
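These specs follow the schema of Google's container-structure-test tool (metadataTest, fileExistenceTests, commandTests). A minimal sketch of running them, assuming the jre image is tagged locally as spark-jre:3.5.2 and the spec is saved as image_test.yaml; both names are illustrative:

    # Validate the built image against the spec above.
    container-structure-test test \
      --image spark-jre:3.5.2 \
      --config image_test.yaml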
technologies/job/spark/spark-3.5/innerContexts/python/Dockerfile (14 changes: 3 additions & 11 deletions)
@@ -9,17 +9,11 @@ COPY --from=SPARK_BASE /opt/spark /opt/spark
 COPY --from=SPARK_BASE /usr/local/sbin/tini /usr/local/sbin/tini
 COPY --from=SPARK_BASE /sbin/tini /sbin/tini
 
-COPY --from=SPARK_BASE /usr/local/openjdk-11 /usr/local/openjdk-11
+COPY --from=SPARK_BASE /opt/java/openjdk /opt/java/openjdk
 
-ENV JAVA_HOME /usr/local/openjdk-11
+ENV JAVA_HOME /opt/java/openjdk
 ENV LANG C.UTF-8
 ENV SPARK_HOME /opt/spark
-ENV HADOOP_CONF_DIR=/etc/hadoop/conf
-
-# ADD ps binaries for load-spark-env.sh
-RUN apt update -qq && apt install -yqq --no-install-recommends \
-    procps && \
-    rm -rf /var/lib/apt/lists/*
 
 RUN pip --no-cache-dir install --upgrade pip \
   && pip --no-cache-dir install pyspark==3.5.2 \
@@ -35,6 +29,4 @@ RUN chmod 755 /opt/entrypoint.sh
 
 WORKDIR /opt/spark/work-dir
 
-ENTRYPOINT [ "/opt/entrypoint.sh" ]
-
-ENV REBUILD ME
+ENTRYPOINT [ "/opt/entrypoint.sh" ]
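To smoke-test the rebuilt python context, a sketch assuming it was built locally as spark-py:3.5.2 (an illustrative tag; the SPARK_BASE stage is supplied by the jre image above). It checks the relocated JDK path and the pinned pyspark in one pass:

    # Hypothetical tag; prints JAVA_HOME, the JDK version, and the pyspark version.
    docker run --rm --entrypoint bash spark-py:3.5.2 \
      -c 'echo $JAVA_HOME && java -version && python -c "import pyspark; print(pyspark.__version__)"'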
Image test spec for the python context:
@@ -5,11 +5,9 @@ metadataTest:
   - key: LANG
     value: "C.UTF-8"
   - key: JAVA_HOME
-    value: "/usr/local/openjdk-11"
+    value: "/opt/java/openjdk"
   - key: SPARK_HOME
     value: "/opt/spark"
-  - key: "HADOOP_CONF_DIR"
-    value: "/etc/hadoop/conf"
 
 fileExistenceTests:
   - name: "entrypoint.sh"
@@ -54,11 +52,6 @@ commandTests:
     expectedError: ['kinit: Program lacks support for encryption type while getting initial credentials']
     exitCode: 1
 
-  - name: "ftp"
-    args: ["-h"]
-    command: "ftp"
-    exitCode: 0
-
   - name: "wget"
     args: ["--help"]
     command: "wget"
@@ -78,19 +71,7 @@
     args: ["--help"]
     command: "tar"
     exitCode: 0
 
-  - name: "telnet"
-    command: "which"
-    args: ["telnet"]
-    expectedOutput: ["/usr/bin/telnet"]
-    exitCode: 0
-
-  - name: "scp"
-    command: "which"
-    args: ["scp"]
-    expectedOutput: ["/usr/bin/scp"]
-    exitCode: 0
-
   - name: "tini"
     command: "/sbin/tini"
     args: ["--version"]
