From 494d8693e932a0025d4659e504620d6fc7c5db05 Mon Sep 17 00:00:00 2001 From: uti Date: Fri, 13 Dec 2024 23:36:44 +0000 Subject: [PATCH 01/10] Updated spark with scala and python --- spark-3.5.yaml => spark-3.5-scala-2.12.yaml | 4 +- .../dataframe-import.patch | 0 .../internal-access.patch | 0 .../load-spark-env.sh.diff | 0 .../make-distribution.patch | 0 .../pombump-deps.yaml | 0 .../pombump-properties.yaml | 0 .../scala-test | 0 .../spark-daemon.sh.diff | 0 spark-3.5-scala-2.13.yaml | 45 ++++++++++++------- spark-3.5-scala-2.13/pombump-deps.yaml | 11 +++++ 11 files changed, 42 insertions(+), 18 deletions(-) rename spark-3.5.yaml => spark-3.5-scala-2.12.yaml (99%) rename {spark-3.5 => spark-3.5-scala-2.12}/dataframe-import.patch (100%) rename {spark-3.5 => spark-3.5-scala-2.12}/internal-access.patch (100%) rename {spark-3.5 => spark-3.5-scala-2.12}/load-spark-env.sh.diff (100%) rename {spark-3.5 => spark-3.5-scala-2.12}/make-distribution.patch (100%) rename {spark-3.5 => spark-3.5-scala-2.12}/pombump-deps.yaml (100%) rename {spark-3.5 => spark-3.5-scala-2.12}/pombump-properties.yaml (100%) rename {spark-3.5 => spark-3.5-scala-2.12}/scala-test (100%) rename {spark-3.5 => spark-3.5-scala-2.12}/spark-daemon.sh.diff (100%) create mode 100644 spark-3.5-scala-2.13/pombump-deps.yaml diff --git a/spark-3.5.yaml b/spark-3.5-scala-2.12.yaml similarity index 99% rename from spark-3.5.yaml rename to spark-3.5-scala-2.12.yaml index c8a90c1ec03..19d1df629f5 100644 --- a/spark-3.5.yaml +++ b/spark-3.5-scala-2.12.yaml @@ -1,5 +1,5 @@ package: - name: spark-3.5 + name: spark-3.5-scala-2.12 version: 3.5.3 epoch: 5 description: Unified engine for large-scale data analytics @@ -161,7 +161,7 @@ subpackages: EOF /usr/lib/spark/bin/spark-submit sparkr_test.R - - name: ${{package.name}}-compat + - name: spark-compat description: "Compatibility package to place binaries in the location expected by upstream image" pipeline: - runs: | diff --git a/spark-3.5/dataframe-import.patch b/spark-3.5-scala-2.12/dataframe-import.patch similarity index 100% rename from spark-3.5/dataframe-import.patch rename to spark-3.5-scala-2.12/dataframe-import.patch diff --git a/spark-3.5/internal-access.patch b/spark-3.5-scala-2.12/internal-access.patch similarity index 100% rename from spark-3.5/internal-access.patch rename to spark-3.5-scala-2.12/internal-access.patch diff --git a/spark-3.5/load-spark-env.sh.diff b/spark-3.5-scala-2.12/load-spark-env.sh.diff similarity index 100% rename from spark-3.5/load-spark-env.sh.diff rename to spark-3.5-scala-2.12/load-spark-env.sh.diff diff --git a/spark-3.5/make-distribution.patch b/spark-3.5-scala-2.12/make-distribution.patch similarity index 100% rename from spark-3.5/make-distribution.patch rename to spark-3.5-scala-2.12/make-distribution.patch diff --git a/spark-3.5/pombump-deps.yaml b/spark-3.5-scala-2.12/pombump-deps.yaml similarity index 100% rename from spark-3.5/pombump-deps.yaml rename to spark-3.5-scala-2.12/pombump-deps.yaml diff --git a/spark-3.5/pombump-properties.yaml b/spark-3.5-scala-2.12/pombump-properties.yaml similarity index 100% rename from spark-3.5/pombump-properties.yaml rename to spark-3.5-scala-2.12/pombump-properties.yaml diff --git a/spark-3.5/scala-test b/spark-3.5-scala-2.12/scala-test similarity index 100% rename from spark-3.5/scala-test rename to spark-3.5-scala-2.12/scala-test diff --git a/spark-3.5/spark-daemon.sh.diff b/spark-3.5-scala-2.12/spark-daemon.sh.diff similarity index 100% rename from spark-3.5/spark-daemon.sh.diff rename to spark-3.5-scala-2.12/spark-daemon.sh.diff diff --git a/spark-3.5-scala-2.13.yaml b/spark-3.5-scala-2.13.yaml index ef866d6c0d2..c1680ebd9ac 100644 --- a/spark-3.5-scala-2.13.yaml +++ b/spark-3.5-scala-2.13.yaml @@ -29,8 +29,9 @@ environment: - openjdk-17-default-jdk - perl-utils - procps + - py3-setuptools - py3.11-pip - - python-3.11 + - python3 - wolfi-base - wolfi-baselayout - yaml-dev @@ -50,23 +51,24 @@ pipeline: ./dev/change-scala-version.sh 2.13 - uses: maven/pombump - with: - properties-file: pombump-properties.yaml - pom: pom.xml - runs: | - ./build/mvn -DskipTests -Pscala-2.13 clean package - - mkdir -p ${{targets.contextdir}}/usr/lib/spark - mkdir -p ${{targets.contextdir}}/usr/lib/spark/work-dir - mv bin/ ${{targets.contextdir}}/usr/lib/spark - mv sbin/ ${{targets.contextdir}}/usr/lib/spark - mv target ${{targets.contextdir}}/usr/lib/spark - cp resource-managers/kubernetes/docker/src/main/dockerfiles/spark/entrypoint.sh ${{targets.contextdir}}/usr/lib/spark/ - cp -R assembly/target/scala-2.13/jars ${{targets.contextdir}}/usr/lib/spark/ - mv assembly ${{targets.contextdir}}/usr/lib/spark + mkdir -p ${{targets.contextdir}}/usr/lib/spark/ + ./dev/make-distribution.sh --name pyspark-2.13 --pip --tgz -Pscala-2.13 -Phive -Phive-thriftserver -Pyarn + mv dist/* ${{targets.contextdir}}/usr/lib/spark/ subpackages: + - name: pyspark-2.13 + dependencies: + runtime: + - python3 + pipeline: + - working-directory: python + pipeline: + - uses: python/build + - name: Python Install + uses: python/install + - name: ${{package.name}}-compat description: "Compatibility package to place binaries in the location expected by upstream image" pipeline: @@ -74,7 +76,6 @@ subpackages: mkdir -p "${{targets.subpkgdir}}"/usr/bin mkdir -p "${{targets.subpkgdir}}"/opt - runs: | - mkdir -p "${{targets.subpkgdir}}"/usr/bin ln -s /usr/lib/spark/ ${{targets.subpkgdir}}/opt/spark ln -sf /usr/lib/spark/bin/spark-submit ${{targets.subpkgdir}}/usr/bin/spark-submit ln -sf /usr/lib/spark/bin/spark-shell ${{targets.subpkgdir}}/usr/bin/spark-shell @@ -89,6 +90,7 @@ test: packages: - openjdk-17-default-jvm - bash + - python3 environment: LANG: en_US.UTF-8 SCALA_VERSION: 2.13 @@ -101,7 +103,7 @@ test: - name: Test ${{package.name}} with OpenJDK 17 pipeline: - name: Test if the Scala versions are correct - runs: ls /usr/lib/spark/assembly/target/scala-2.13/jars/scala-* | grep -q $SCALA_VERSION + runs: ls /usr/lib/spark/jars/scala-* | grep -q $SCALA_VERSION - name: Check spark-shell --version runs: /usr/lib/spark/bin/spark-shell --version - name: Check spark-submit --version @@ -133,6 +135,17 @@ test: assert(result.count() == 1 && result.first().getString(0) == "Bob") EOF cat SQLTest.scala | /usr/lib/spark/bin/spark-shell --conf spark.jars.ivy=/tmp/.ivy --master local[*] + - name: Run a simple Spark job in Python + runs: | + cat < simple_job.py + from pyspark.sql import SparkSession + spark = SparkSession.builder.appName("SimpleJob").getOrCreate() + data = [1, 2, 3, 4, 5] + rdd = spark.sparkContext.parallelize(data) + sum = rdd.reduce(lambda x, y: x + y) + assert sum == 15 + EOF + /usr/lib/spark/bin/spark-submit simple_job.py --jars /usr/lib/spark/jars/guava-32.0.1-jre.jar --conf spark.jars.ivy=/tmp/.ivy --master local[*] update: enabled: true diff --git a/spark-3.5-scala-2.13/pombump-deps.yaml b/spark-3.5-scala-2.13/pombump-deps.yaml new file mode 100644 index 00000000000..869bd886be6 --- /dev/null +++ b/spark-3.5-scala-2.13/pombump-deps.yaml @@ -0,0 +1,11 @@ +patches: + - groupId: com.squareup.okio + artifactId: okio + version: 1.17.6 + scope: import + type: jar + - groupId: com.google.code.gson + artifactId: gson + version: 2.10.1 + scope: import + type: jar From 9feebaedefdcf925cc09c4355c8b210d4b6a4453 Mon Sep 17 00:00:00 2001 From: uti Date: Fri, 13 Dec 2024 23:44:32 +0000 Subject: [PATCH 02/10] epoch bumps --- spark-3.5-scala-2.12.yaml | 2 +- spark-3.5-scala-2.13.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/spark-3.5-scala-2.12.yaml b/spark-3.5-scala-2.12.yaml index 19d1df629f5..482029acfea 100644 --- a/spark-3.5-scala-2.12.yaml +++ b/spark-3.5-scala-2.12.yaml @@ -1,7 +1,7 @@ package: name: spark-3.5-scala-2.12 version: 3.5.3 - epoch: 5 + epoch: 0 description: Unified engine for large-scale data analytics copyright: - license: Apache-2.0 diff --git a/spark-3.5-scala-2.13.yaml b/spark-3.5-scala-2.13.yaml index c1680ebd9ac..8d1ec1d6117 100644 --- a/spark-3.5-scala-2.13.yaml +++ b/spark-3.5-scala-2.13.yaml @@ -1,7 +1,7 @@ package: name: spark-3.5-scala-2.13 version: 3.5.3 - epoch: 0 + epoch: 1 description: Unified engine for large-scale data analytics copyright: - license: Apache-2.0 From 5ab76f8bd4d29ee0d4afb3a56df1bcf739f7e8bf Mon Sep 17 00:00:00 2001 From: uti Date: Fri, 13 Dec 2024 23:49:02 +0000 Subject: [PATCH 03/10] remove deps on python --- spark-3.5-scala-2.13.yaml | 3 --- 1 file changed, 3 deletions(-) diff --git a/spark-3.5-scala-2.13.yaml b/spark-3.5-scala-2.13.yaml index 8d1ec1d6117..07488188394 100644 --- a/spark-3.5-scala-2.13.yaml +++ b/spark-3.5-scala-2.13.yaml @@ -59,9 +59,6 @@ pipeline: subpackages: - name: pyspark-2.13 - dependencies: - runtime: - - python3 pipeline: - working-directory: python pipeline: From 310feba2d8c96d5c3445c5e66c26dcc1601c5da8 Mon Sep 17 00:00:00 2001 From: uti Date: Sat, 14 Dec 2024 15:54:29 +0000 Subject: [PATCH 04/10] Adding maven home --- spark-3.5-scala-2.12.yaml | 2 +- spark-3.5-scala-2.13.yaml | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/spark-3.5-scala-2.12.yaml b/spark-3.5-scala-2.12.yaml index 482029acfea..cc0edd26c78 100644 --- a/spark-3.5-scala-2.12.yaml +++ b/spark-3.5-scala-2.12.yaml @@ -161,7 +161,7 @@ subpackages: EOF /usr/lib/spark/bin/spark-submit sparkr_test.R - - name: spark-compat + - name: ${{package.name}}-compat description: "Compatibility package to place binaries in the location expected by upstream image" pipeline: - runs: | diff --git a/spark-3.5-scala-2.13.yaml b/spark-3.5-scala-2.13.yaml index 07488188394..161f86e0598 100644 --- a/spark-3.5-scala-2.13.yaml +++ b/spark-3.5-scala-2.13.yaml @@ -39,6 +39,7 @@ environment: LANG: en_US.UTF-8 JAVA_HOME: /usr/lib/jvm/java-17-openjdk MAVEN_OPTS: "-Xss64m -Xmx2g -XX:ReservedCodeCacheSize=1g" + M2_HOME: /usr/share/maven pipeline: - uses: git-checkout From eff546aa40e40d138600cc7fcaa7d965b49160d7 Mon Sep 17 00:00:00 2001 From: uti Date: Sat, 14 Dec 2024 16:32:23 +0000 Subject: [PATCH 05/10] Adding patch --- spark-3.5-scala-2.13.yaml | 9 ++++++--- spark-3.5-scala-2.13/make-distribution.patch | 13 +++++++++++++ 2 files changed, 19 insertions(+), 3 deletions(-) create mode 100644 spark-3.5-scala-2.13/make-distribution.patch diff --git a/spark-3.5-scala-2.13.yaml b/spark-3.5-scala-2.13.yaml index 161f86e0598..8cfa88a80d2 100644 --- a/spark-3.5-scala-2.13.yaml +++ b/spark-3.5-scala-2.13.yaml @@ -39,15 +39,18 @@ environment: LANG: en_US.UTF-8 JAVA_HOME: /usr/lib/jvm/java-17-openjdk MAVEN_OPTS: "-Xss64m -Xmx2g -XX:ReservedCodeCacheSize=1g" - M2_HOME: /usr/share/maven - + M2_HOME: /usr/share/java/maven-3.9 + MAVEN_HOME: /usr/share/java/maven-3.9 + PATH: /usr/share/java/maven-3.9/bin:$PATH pipeline: - uses: git-checkout with: repository: https://github.com/apache/spark tag: v${{package.version}} expected-commit: 32232e9ed33bb16b93ad58cfde8b82e0f07c0970 - + - uses: patch + with: + patches: make-distribution.patch - runs: | ./dev/change-scala-version.sh 2.13 diff --git a/spark-3.5-scala-2.13/make-distribution.patch b/spark-3.5-scala-2.13/make-distribution.patch new file mode 100644 index 00000000000..5ec8e33f4c9 --- /dev/null +++ b/spark-3.5-scala-2.13/make-distribution.patch @@ -0,0 +1,13 @@ +diff --git a/dev/make-distribution.sh b/dev/make-distribution.sh +index ef7c010e930..376ae239a66 100755 +--- a/dev/make-distribution.sh ++++ b/dev/make-distribution.sh +@@ -36,7 +36,7 @@ MAKE_TGZ=false + MAKE_PIP=false + MAKE_R=false + NAME=none +-MVN="$SPARK_HOME/build/mvn" ++MVN="mvn" + + function exit_with_usage { + set +x From fa229c69f45773987663f0a0531576bac1f1b185 Mon Sep 17 00:00:00 2001 From: uti Date: Sat, 14 Dec 2024 16:42:51 +0000 Subject: [PATCH 06/10] yam lint --- spark-3.5-scala-2.13.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/spark-3.5-scala-2.13.yaml b/spark-3.5-scala-2.13.yaml index 8cfa88a80d2..91a8dce9e5b 100644 --- a/spark-3.5-scala-2.13.yaml +++ b/spark-3.5-scala-2.13.yaml @@ -42,15 +42,18 @@ environment: M2_HOME: /usr/share/java/maven-3.9 MAVEN_HOME: /usr/share/java/maven-3.9 PATH: /usr/share/java/maven-3.9/bin:$PATH + pipeline: - uses: git-checkout with: repository: https://github.com/apache/spark tag: v${{package.version}} expected-commit: 32232e9ed33bb16b93ad58cfde8b82e0f07c0970 + - uses: patch with: patches: make-distribution.patch + - runs: | ./dev/change-scala-version.sh 2.13 From 557af19d812c84dba49555d258b637cab4ea9476 Mon Sep 17 00:00:00 2001 From: uti Date: Sat, 14 Dec 2024 16:56:48 +0000 Subject: [PATCH 07/10] build command change --- spark-3.5-scala-2.13/make-distribution.patch | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/spark-3.5-scala-2.13/make-distribution.patch b/spark-3.5-scala-2.13/make-distribution.patch index 5ec8e33f4c9..1d072bb80e5 100644 --- a/spark-3.5-scala-2.13/make-distribution.patch +++ b/spark-3.5-scala-2.13/make-distribution.patch @@ -9,5 +9,8 @@ index ef7c010e930..376ae239a66 100755 -MVN="$SPARK_HOME/build/mvn" +MVN="mvn" - function exit_with_usage { - set +x +-BUILD_COMMAND=("$MVN" clean package \ ++BUILD_COMMAND=("$MVN" -T$(grep -c processor /proc/cpuinfo) clean package \ + -DskipTests \ + -Dmaven.javadoc.skip=true \ + -Dmaven.scaladoc.skip=true \ \ No newline at end of file From 752bd5d9dae141c42a92d9b6e806f0f133916c84 Mon Sep 17 00:00:00 2001 From: uti Date: Sat, 14 Dec 2024 19:37:45 +0000 Subject: [PATCH 08/10] mvn fixed version --- spark-3.5-scala-2.12.yaml | 3 ++- spark-3.5-scala-2.13.yaml | 5 ++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/spark-3.5-scala-2.12.yaml b/spark-3.5-scala-2.12.yaml index cc0edd26c78..ff9adb00169 100644 --- a/spark-3.5-scala-2.12.yaml +++ b/spark-3.5-scala-2.12.yaml @@ -18,7 +18,7 @@ environment: - glibc-iconv - glibc-locale-en - grep - - maven + - maven-3.9.8 - openjdk-11 - openjdk-17 # Only 8 is used during the build process @@ -32,6 +32,7 @@ environment: - yaml-dev environment: LANG: en_US.UTF-8 + M2_HOME: /usr/share/java/maven-3.9.8 pipeline: - uses: git-checkout diff --git a/spark-3.5-scala-2.13.yaml b/spark-3.5-scala-2.13.yaml index 91a8dce9e5b..787daf9e170 100644 --- a/spark-3.5-scala-2.13.yaml +++ b/spark-3.5-scala-2.13.yaml @@ -25,7 +25,7 @@ environment: - glibc-iconv - glibc-locale-en - grep - - maven + - maven-3.9.8 - openjdk-17-default-jdk - perl-utils - procps @@ -39,8 +39,7 @@ environment: LANG: en_US.UTF-8 JAVA_HOME: /usr/lib/jvm/java-17-openjdk MAVEN_OPTS: "-Xss64m -Xmx2g -XX:ReservedCodeCacheSize=1g" - M2_HOME: /usr/share/java/maven-3.9 - MAVEN_HOME: /usr/share/java/maven-3.9 + M2_HOME: /usr/share/java/maven-3.9.8 PATH: /usr/share/java/maven-3.9/bin:$PATH pipeline: From c93da391b14b0af56254ad9b43c49b943e8307c3 Mon Sep 17 00:00:00 2001 From: uti Date: Sat, 14 Dec 2024 19:44:53 +0000 Subject: [PATCH 09/10] Maven versions --- spark-3.5-scala-2.12.yaml | 4 ++-- spark-3.5-scala-2.13.yaml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/spark-3.5-scala-2.12.yaml b/spark-3.5-scala-2.12.yaml index ff9adb00169..3d58378fb46 100644 --- a/spark-3.5-scala-2.12.yaml +++ b/spark-3.5-scala-2.12.yaml @@ -18,7 +18,7 @@ environment: - glibc-iconv - glibc-locale-en - grep - - maven-3.9.8 + - maven-3.9 - openjdk-11 - openjdk-17 # Only 8 is used during the build process @@ -32,7 +32,7 @@ environment: - yaml-dev environment: LANG: en_US.UTF-8 - M2_HOME: /usr/share/java/maven-3.9.8 + M2_HOME: /usr/share/java/maven-3.9 pipeline: - uses: git-checkout diff --git a/spark-3.5-scala-2.13.yaml b/spark-3.5-scala-2.13.yaml index 787daf9e170..b5c1ad941d7 100644 --- a/spark-3.5-scala-2.13.yaml +++ b/spark-3.5-scala-2.13.yaml @@ -25,7 +25,7 @@ environment: - glibc-iconv - glibc-locale-en - grep - - maven-3.9.8 + - maven-3.9 - openjdk-17-default-jdk - perl-utils - procps @@ -39,7 +39,7 @@ environment: LANG: en_US.UTF-8 JAVA_HOME: /usr/lib/jvm/java-17-openjdk MAVEN_OPTS: "-Xss64m -Xmx2g -XX:ReservedCodeCacheSize=1g" - M2_HOME: /usr/share/java/maven-3.9.8 + M2_HOME: /usr/share/java/maven-3.9 PATH: /usr/share/java/maven-3.9/bin:$PATH pipeline: From af59ebe84a7a0be42cb7d14bc04ec9212b51a8b2 Mon Sep 17 00:00:00 2001 From: uti Date: Sat, 14 Dec 2024 19:54:14 +0000 Subject: [PATCH 10/10] Updating patch to match --- spark-3.5-scala-2.13/make-distribution.patch | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/spark-3.5-scala-2.13/make-distribution.patch b/spark-3.5-scala-2.13/make-distribution.patch index 1d072bb80e5..9816073ef18 100644 --- a/spark-3.5-scala-2.13/make-distribution.patch +++ b/spark-3.5-scala-2.13/make-distribution.patch @@ -1,7 +1,8 @@ diff --git a/dev/make-distribution.sh b/dev/make-distribution.sh -index ef7c010e930..376ae239a66 100755 +index ef7c010e930..1769ecfad29 100755 --- a/dev/make-distribution.sh +++ b/dev/make-distribution.sh + @@ -36,7 +36,7 @@ MAKE_TGZ=false MAKE_PIP=false MAKE_R=false @@ -9,6 +10,10 @@ index ef7c010e930..376ae239a66 100755 -MVN="$SPARK_HOME/build/mvn" +MVN="mvn" +@@ -166,7 +166,7 @@ export MAVEN_OPTS="${MAVEN_OPTS:--Xss128m -Xmx4g -XX:ReservedCodeCacheSize=128m} + # Store the command as an array because $MVN variable might have spaces in it. + # Normal quoting tricks don't work. + # See: http://mywiki.wooledge.org/BashFAQ/050 -BUILD_COMMAND=("$MVN" clean package \ +BUILD_COMMAND=("$MVN" -T$(grep -c processor /proc/cpuinfo) clean package \ -DskipTests \