Commit 7f83637

[SPARK-43365] Refactor Dockerfile and workflow based on base image
### What changes were proposed in this pull request?
This PR changes the Dockerfiles and the workflow to build on a base image, saving space by sharing layers: one image is built from another. After this PR:
- The Spark / PySpark / SparkR related files are extracted into the base image.
- The PySpark / SparkR dependencies are installed in the PySpark / SparkR images.
- A base image build step is added to the workflow.
- The template is updated so that `./add-dockerfiles.sh 3.4.0` works with the new layout.
- This PR does not touch the 3.3.x Dockerfiles, to keep it focused; the 3.3.x changes will follow in a separate PR once all comments on 3.4.0 are addressed.

[1] docker-library/official-images#13089

### Why are the changes needed?
To address the Docker Official Images (DOI) review comments [1], and to save space by sharing layers by building one image from another.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
CI passed.

Closes #36 from Yikun/official.

Authored-by: Yikun Jiang <yikunkero@gmail.com>
Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
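As a rough illustration of the layering described above (a minimal local sketch only; the tags and context paths mirror the 3.4.0 Dockerfiles in this commit, while the multi-arch build and registry handling are what the workflow change below automates):

    # Build the shared base image first: Spark itself, the entrypoint and the
    # spark user all live in its layers.
    docker build -t spark:3.4.0-scala2.12-java11-ubuntu 3.4.0/scala2.12-java11-ubuntu

    # Derived images only add their own dependencies on top. BASE_IMAGE overrides
    # the default declared by ARG in the Dockerfile, so the PySpark image reuses
    # the base layers instead of downloading Spark again.
    docker build \
      --build-arg BASE_IMAGE=spark:3.4.0-scala2.12-java11-ubuntu \
      -t spark:3.4.0-scala2.12-java11-python3-ubuntu \
      3.4.0/scala2.12-java11-python3-ubuntu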
1 parent fe05e38 commit 7f83637

14 files changed (+93, −534 lines)


.github/workflows/main.yml

Lines changed: 20 additions & 0 deletions
@@ -91,10 +91,12 @@ jobs:
             scala) SUFFIX=ubuntu
               ;;
           esac
+          BASE_IMGAE_TAG=${{ inputs.spark }}-scala${{ inputs.scala }}-java${{ inputs.java }}-ubuntu
           TAG=scala${{ inputs.scala }}-java${{ inputs.java }}-$SUFFIX
 
           IMAGE_NAME=spark
           IMAGE_PATH=${{ inputs.spark }}/$TAG
+          BASE_IMAGE_PATH=${{ inputs.spark }}/scala${{ inputs.scala }}-java${{ inputs.java }}-ubuntu
           if [ "${{ inputs.build }}" == "true" ]; then
             # Use the local registry to build and test
             REPO_OWNER=$(echo "${{ github.repository_owner }}" | tr '[:upper:]' '[:lower:]')
@@ -105,6 +107,7 @@
             TEST_REPO=${{ inputs.repository }}
             UNIQUE_IMAGE_TAG=${{ inputs.image-tag }}
           fi
+          BASE_IMAGE_URL=$TEST_REPO/$IMAGE_NAME:$BASE_IMGAE_TAG
           IMAGE_URL=$TEST_REPO/$IMAGE_NAME:$UNIQUE_IMAGE_TAG
 
           PUBLISH_REPO=${{ inputs.repository }}
@@ -116,8 +119,12 @@
           echo "TEST_REPO=${TEST_REPO}" >> $GITHUB_ENV
           # Image name: spark
           echo "IMAGE_NAME=${IMAGE_NAME}" >> $GITHUB_ENV
+          # Base Image Dockerfile: 3.3.0/scala2.12-java11-ubuntu
+          echo "BASE_IMAGE_PATH=${BASE_IMAGE_PATH}" >> $GITHUB_ENV
           # Image dockerfile path: 3.3.0/scala2.12-java11-python3-ubuntu
           echo "IMAGE_PATH=${IMAGE_PATH}" >> $GITHUB_ENV
+          # Base Image URL: spark:3.3.0-scala2.12-java11-ubuntu
+          echo "BASE_IMAGE_URL=${BASE_IMAGE_URL}" >> $GITHUB_ENV
           # Image URL: ghcr.io/apache/spark-docker/spark:3.3.0-scala2.12-java11-python3-ubuntu
           echo "IMAGE_URL=${IMAGE_URL}" >> $GITHUB_ENV
@@ -132,6 +139,9 @@
           echo "IMAGE_PATH: "${IMAGE_PATH}
           echo "IMAGE_URL: "${IMAGE_URL}
 
+          echo "BASE_IMAGE_PATH: "${BASE_IMAGE_PATH}
+          echo "BASE_IMAGE_URL: "${BASE_IMAGE_URL}
+
           echo "PUBLISH_REPO:"${PUBLISH_REPO}
           echo "PUBLISH_IMAGE_URL:"${PUBLISH_IMAGE_URL}
@@ -146,10 +156,20 @@
         # This required by local registry
         driver-opts: network=host
 
+    - name: Build - Build the base image
+      if: ${{ inputs.build }}
+      uses: docker/build-push-action@v3
+      with:
+        context: ${{ env.BASE_IMAGE_PATH }}
+        tags: ${{ env.BASE_IMAGE_URL }}
+        platforms: linux/amd64,linux/arm64
+        push: true
+
     - name: Build - Build and push test image
       if: ${{ inputs.build }}
       uses: docker/build-push-action@v3
       with:
+        build-args: BASE_IMAGE=${{ env.BASE_IMAGE_URL }}
         context: ${{ env.IMAGE_PATH }}
         tags: ${{ env.IMAGE_URL }}
         platforms: linux/amd64,linux/arm64
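In other words, the workflow now builds and pushes the base image first, then feeds its URL into the main build via `build-args`, overriding the `ARG BASE_IMAGE` default in each Dockerfile. A roughly equivalent manual invocation (a sketch only; `$TEST_REPO` and `$UNIQUE_IMAGE_TAG` stand in for the values computed in the workflow step above):

    # Step 1: build the multi-arch base image and push it so later builds can pull it.
    docker buildx build \
      --platform linux/amd64,linux/arm64 \
      -t "$TEST_REPO/spark:3.4.0-scala2.12-java11-ubuntu" \
      --push \
      3.4.0/scala2.12-java11-ubuntu

    # Step 2: build the derived image, pointing BASE_IMAGE at the image pushed above.
    docker buildx build \
      --platform linux/amd64,linux/arm64 \
      --build-arg BASE_IMAGE="$TEST_REPO/spark:3.4.0-scala2.12-java11-ubuntu" \
      -t "$TEST_REPO/spark:$UNIQUE_IMAGE_TAG" \
      --push \
      3.4.0/scala2.12-java11-python3-ubuntu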

3.4.0/scala2.12-java11-python3-r-ubuntu/Dockerfile

Lines changed: 2 additions & 61 deletions
@@ -14,73 +14,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-FROM eclipse-temurin:11-jre-focal
-
-ARG spark_uid=185
-
-RUN groupadd --system --gid=${spark_uid} spark && \
-    useradd --system --uid=${spark_uid} --gid=spark spark
+ARG BASE_IMAGE=spark:3.4.0-scala2.12-java11-ubuntu
+FROM $BASE_IMAGE
 
 RUN set -ex && \
     apt-get update && \
-    ln -s /lib /lib64 && \
-    apt install -y gnupg2 wget bash tini libc6 libpam-modules krb5-user libnss3 procps net-tools gosu && \
     apt install -y python3 python3-pip && \
     apt install -y r-base r-base-dev && \
-    mkdir -p /opt/spark && \
-    mkdir /opt/spark/python && \
-    mkdir -p /opt/spark/examples && \
-    mkdir -p /opt/spark/work-dir && \
-    touch /opt/spark/RELEASE && \
-    chown -R spark:spark /opt/spark && \
-    rm /bin/sh && \
-    ln -sv /bin/bash /bin/sh && \
-    echo "auth required pam_wheel.so use_uid" >> /etc/pam.d/su && \
-    chgrp root /etc/passwd && chmod ug+rw /etc/passwd && \
     rm -rf /var/cache/apt/* && \
     rm -rf /var/lib/apt/lists/*
 
-# Install Apache Spark
-# https://downloads.apache.org/spark/KEYS
-ENV SPARK_TGZ_URL=https://archive.apache.org/dist/spark/spark-3.4.0/spark-3.4.0-bin-hadoop3.tgz \
-    SPARK_TGZ_ASC_URL=https://archive.apache.org/dist/spark/spark-3.4.0/spark-3.4.0-bin-hadoop3.tgz.asc \
-    GPG_KEY=CC68B3D16FE33A766705160BA7E57908C7A4E1B1
-
-RUN set -ex; \
-    export SPARK_TMP="$(mktemp -d)"; \
-    cd $SPARK_TMP; \
-    wget -nv -O spark.tgz "$SPARK_TGZ_URL"; \
-    wget -nv -O spark.tgz.asc "$SPARK_TGZ_ASC_URL"; \
-    export GNUPGHOME="$(mktemp -d)"; \
-    gpg --keyserver hkps://keys.openpgp.org --recv-key "$GPG_KEY" || \
-    gpg --keyserver hkps://keyserver.ubuntu.com --recv-keys "$GPG_KEY"; \
-    gpg --batch --verify spark.tgz.asc spark.tgz; \
-    gpgconf --kill all; \
-    rm -rf "$GNUPGHOME" spark.tgz.asc; \
-    \
-    tar -xf spark.tgz --strip-components=1; \
-    chown -R spark:spark .; \
-    mv jars /opt/spark/; \
-    mv bin /opt/spark/; \
-    mv sbin /opt/spark/; \
-    mv kubernetes/dockerfiles/spark/decom.sh /opt/; \
-    mv examples /opt/spark/; \
-    mv kubernetes/tests /opt/spark/; \
-    mv data /opt/spark/; \
-    mv python/pyspark /opt/spark/python/pyspark/; \
-    mv python/lib /opt/spark/python/lib/; \
-    mv R /opt/spark/; \
-    cd ..; \
-    rm -rf "$SPARK_TMP";
-
-COPY entrypoint.sh /opt/
-
-ENV SPARK_HOME /opt/spark
 ENV R_HOME /usr/lib/R
-
-WORKDIR /opt/spark/work-dir
-RUN chmod g+w /opt/spark/work-dir
-RUN chmod a+x /opt/decom.sh
-RUN chmod a+x /opt/entrypoint.sh
-
-ENTRYPOINT [ "/opt/entrypoint.sh" ]

3.4.0/scala2.12-java11-python3-r-ubuntu/entrypoint.sh

Lines changed: 0 additions & 114 deletions
This file was deleted.

3.4.0/scala2.12-java11-python3-ubuntu/Dockerfile

Lines changed: 2 additions & 61 deletions
@@ -14,70 +14,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-FROM eclipse-temurin:11-jre-focal
-
-ARG spark_uid=185
-
-RUN groupadd --system --gid=${spark_uid} spark && \
-    useradd --system --uid=${spark_uid} --gid=spark spark
+ARG BASE_IMAGE=spark:3.4.0-scala2.12-java11-ubuntu
+FROM $BASE_IMAGE
 
 RUN set -ex && \
     apt-get update && \
-    ln -s /lib /lib64 && \
-    apt install -y gnupg2 wget bash tini libc6 libpam-modules krb5-user libnss3 procps net-tools gosu && \
     apt install -y python3 python3-pip && \
-    mkdir -p /opt/spark && \
-    mkdir /opt/spark/python && \
-    mkdir -p /opt/spark/examples && \
-    mkdir -p /opt/spark/work-dir && \
-    touch /opt/spark/RELEASE && \
-    chown -R spark:spark /opt/spark && \
-    rm /bin/sh && \
-    ln -sv /bin/bash /bin/sh && \
-    echo "auth required pam_wheel.so use_uid" >> /etc/pam.d/su && \
-    chgrp root /etc/passwd && chmod ug+rw /etc/passwd && \
     rm -rf /var/cache/apt/* && \
     rm -rf /var/lib/apt/lists/*
-
-# Install Apache Spark
-# https://downloads.apache.org/spark/KEYS
-ENV SPARK_TGZ_URL=https://archive.apache.org/dist/spark/spark-3.4.0/spark-3.4.0-bin-hadoop3.tgz \
-    SPARK_TGZ_ASC_URL=https://archive.apache.org/dist/spark/spark-3.4.0/spark-3.4.0-bin-hadoop3.tgz.asc \
-    GPG_KEY=CC68B3D16FE33A766705160BA7E57908C7A4E1B1
-
-RUN set -ex; \
-    export SPARK_TMP="$(mktemp -d)"; \
-    cd $SPARK_TMP; \
-    wget -nv -O spark.tgz "$SPARK_TGZ_URL"; \
-    wget -nv -O spark.tgz.asc "$SPARK_TGZ_ASC_URL"; \
-    export GNUPGHOME="$(mktemp -d)"; \
-    gpg --keyserver hkps://keys.openpgp.org --recv-key "$GPG_KEY" || \
-    gpg --keyserver hkps://keyserver.ubuntu.com --recv-keys "$GPG_KEY"; \
-    gpg --batch --verify spark.tgz.asc spark.tgz; \
-    gpgconf --kill all; \
-    rm -rf "$GNUPGHOME" spark.tgz.asc; \
-    \
-    tar -xf spark.tgz --strip-components=1; \
-    chown -R spark:spark .; \
-    mv jars /opt/spark/; \
-    mv bin /opt/spark/; \
-    mv sbin /opt/spark/; \
-    mv kubernetes/dockerfiles/spark/decom.sh /opt/; \
-    mv examples /opt/spark/; \
-    mv kubernetes/tests /opt/spark/; \
-    mv data /opt/spark/; \
-    mv python/pyspark /opt/spark/python/pyspark/; \
-    mv python/lib /opt/spark/python/lib/; \
-    cd ..; \
-    rm -rf "$SPARK_TMP";
-
-COPY entrypoint.sh /opt/
-
-ENV SPARK_HOME /opt/spark
-
-WORKDIR /opt/spark/work-dir
-RUN chmod g+w /opt/spark/work-dir
-RUN chmod a+x /opt/decom.sh
-RUN chmod a+x /opt/entrypoint.sh
-
-ENTRYPOINT [ "/opt/entrypoint.sh" ]
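With both derived Dockerfiles reduced to `ARG BASE_IMAGE` / `FROM $BASE_IMAGE` plus their own package installs, SPARK_HOME, the work dir, the entrypoint and the spark user are all inherited from the base image. A quick, illustrative smoke test of the resulting images (tags assumed from the sketch near the top; assumes the inherited entrypoint passes plain commands through):

    # PySpark image: Spark binaries come from the base layers, python3 from this layer.
    docker run --rm spark:3.4.0-scala2.12-java11-python3-ubuntu /opt/spark/bin/spark-submit --version
    docker run --rm spark:3.4.0-scala2.12-java11-python3-ubuntu python3 --version

    # R image: additionally installs r-base and sets R_HOME.
    docker run --rm spark:3.4.0-scala2.12-java11-python3-r-ubuntu printenv R_HOME
    docker run --rm spark:3.4.0-scala2.12-java11-python3-r-ubuntu R --version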
