diff --git a/spark-3.5.yaml b/spark-3.5-scala-2.12.yaml similarity index 98% rename from spark-3.5.yaml rename to spark-3.5-scala-2.12.yaml index c8a90c1ec03..3d58378fb46 100644 --- a/spark-3.5.yaml +++ b/spark-3.5-scala-2.12.yaml @@ -1,7 +1,7 @@ package: - name: spark-3.5 + name: spark-3.5-scala-2.12 version: 3.5.3 - epoch: 5 + epoch: 0 description: Unified engine for large-scale data analytics copyright: - license: Apache-2.0 @@ -18,7 +18,7 @@ environment: - glibc-iconv - glibc-locale-en - grep - - maven + - maven-3.9 - openjdk-11 - openjdk-17 # Only 8 is used during the build process @@ -32,6 +32,7 @@ environment: - yaml-dev environment: LANG: en_US.UTF-8 + M2_HOME: /usr/share/java/maven-3.9 pipeline: - uses: git-checkout diff --git a/spark-3.5/dataframe-import.patch b/spark-3.5-scala-2.12/dataframe-import.patch similarity index 100% rename from spark-3.5/dataframe-import.patch rename to spark-3.5-scala-2.12/dataframe-import.patch diff --git a/spark-3.5/internal-access.patch b/spark-3.5-scala-2.12/internal-access.patch similarity index 100% rename from spark-3.5/internal-access.patch rename to spark-3.5-scala-2.12/internal-access.patch diff --git a/spark-3.5/load-spark-env.sh.diff b/spark-3.5-scala-2.12/load-spark-env.sh.diff similarity index 100% rename from spark-3.5/load-spark-env.sh.diff rename to spark-3.5-scala-2.12/load-spark-env.sh.diff diff --git a/spark-3.5/make-distribution.patch b/spark-3.5-scala-2.12/make-distribution.patch similarity index 100% rename from spark-3.5/make-distribution.patch rename to spark-3.5-scala-2.12/make-distribution.patch diff --git a/spark-3.5/pombump-deps.yaml b/spark-3.5-scala-2.12/pombump-deps.yaml similarity index 100% rename from spark-3.5/pombump-deps.yaml rename to spark-3.5-scala-2.12/pombump-deps.yaml diff --git a/spark-3.5/pombump-properties.yaml b/spark-3.5-scala-2.12/pombump-properties.yaml similarity index 100% rename from spark-3.5/pombump-properties.yaml rename to spark-3.5-scala-2.12/pombump-properties.yaml diff --git a/spark-3.5/scala-test b/spark-3.5-scala-2.12/scala-test similarity index 100% rename from spark-3.5/scala-test rename to spark-3.5-scala-2.12/scala-test diff --git a/spark-3.5/spark-daemon.sh.diff b/spark-3.5-scala-2.12/spark-daemon.sh.diff similarity index 100% rename from spark-3.5/spark-daemon.sh.diff rename to spark-3.5-scala-2.12/spark-daemon.sh.diff diff --git a/spark-3.5-scala-2.13.yaml b/spark-3.5-scala-2.13.yaml index ef866d6c0d2..b5c1ad941d7 100644 --- a/spark-3.5-scala-2.13.yaml +++ b/spark-3.5-scala-2.13.yaml @@ -1,7 +1,7 @@ package: name: spark-3.5-scala-2.13 version: 3.5.3 - epoch: 0 + epoch: 1 description: Unified engine for large-scale data analytics copyright: - license: Apache-2.0 @@ -25,12 +25,13 @@ environment: - glibc-iconv - glibc-locale-en - grep - - maven + - maven-3.9 - openjdk-17-default-jdk - perl-utils - procps + - py3-setuptools - py3.11-pip - - python-3.11 + - python3 - wolfi-base - wolfi-baselayout - yaml-dev @@ -38,6 +39,8 @@ environment: LANG: en_US.UTF-8 JAVA_HOME: /usr/lib/jvm/java-17-openjdk MAVEN_OPTS: "-Xss64m -Xmx2g -XX:ReservedCodeCacheSize=1g" + M2_HOME: /usr/share/java/maven-3.9 + PATH: /usr/share/java/maven-3.9/bin:$PATH pipeline: - uses: git-checkout @@ -46,27 +49,29 @@ pipeline: tag: v${{package.version}} expected-commit: 32232e9ed33bb16b93ad58cfde8b82e0f07c0970 + - uses: patch + with: + patches: make-distribution.patch + - runs: | ./dev/change-scala-version.sh 2.13 - uses: maven/pombump - with: - properties-file: pombump-properties.yaml - pom: pom.xml - runs: | - ./build/mvn -DskipTests -Pscala-2.13 clean package - - mkdir -p ${{targets.contextdir}}/usr/lib/spark - mkdir -p ${{targets.contextdir}}/usr/lib/spark/work-dir - mv bin/ ${{targets.contextdir}}/usr/lib/spark - mv sbin/ ${{targets.contextdir}}/usr/lib/spark - mv target ${{targets.contextdir}}/usr/lib/spark - cp resource-managers/kubernetes/docker/src/main/dockerfiles/spark/entrypoint.sh ${{targets.contextdir}}/usr/lib/spark/ - cp -R assembly/target/scala-2.13/jars ${{targets.contextdir}}/usr/lib/spark/ - mv assembly ${{targets.contextdir}}/usr/lib/spark + mkdir -p ${{targets.contextdir}}/usr/lib/spark/ + ./dev/make-distribution.sh --name pyspark-2.13 --pip --tgz -Pscala-2.13 -Phive -Phive-thriftserver -Pyarn + mv dist/* ${{targets.contextdir}}/usr/lib/spark/ subpackages: + - name: pyspark-2.13 + pipeline: + - working-directory: python + pipeline: + - uses: python/build + - name: Python Install + uses: python/install + - name: ${{package.name}}-compat description: "Compatibility package to place binaries in the location expected by upstream image" pipeline: @@ -74,7 +79,6 @@ subpackages: mkdir -p "${{targets.subpkgdir}}"/usr/bin mkdir -p "${{targets.subpkgdir}}"/opt - runs: | - mkdir -p "${{targets.subpkgdir}}"/usr/bin ln -s /usr/lib/spark/ ${{targets.subpkgdir}}/opt/spark ln -sf /usr/lib/spark/bin/spark-submit ${{targets.subpkgdir}}/usr/bin/spark-submit ln -sf /usr/lib/spark/bin/spark-shell ${{targets.subpkgdir}}/usr/bin/spark-shell @@ -89,6 +93,7 @@ test: packages: - openjdk-17-default-jvm - bash + - python3 environment: LANG: en_US.UTF-8 SCALA_VERSION: 2.13 @@ -101,7 +106,7 @@ test: - name: Test ${{package.name}} with OpenJDK 17 pipeline: - name: Test if the Scala versions are correct - runs: ls /usr/lib/spark/assembly/target/scala-2.13/jars/scala-* | grep -q $SCALA_VERSION + runs: ls /usr/lib/spark/jars/scala-* | grep -q $SCALA_VERSION - name: Check spark-shell --version runs: /usr/lib/spark/bin/spark-shell --version - name: Check spark-submit --version @@ -133,6 +138,17 @@ test: assert(result.count() == 1 && result.first().getString(0) == "Bob") EOF cat SQLTest.scala | /usr/lib/spark/bin/spark-shell --conf spark.jars.ivy=/tmp/.ivy --master local[*] + - name: Run a simple Spark job in Python + runs: | + cat < simple_job.py + from pyspark.sql import SparkSession + spark = SparkSession.builder.appName("SimpleJob").getOrCreate() + data = [1, 2, 3, 4, 5] + rdd = spark.sparkContext.parallelize(data) + sum = rdd.reduce(lambda x, y: x + y) + assert sum == 15 + EOF + /usr/lib/spark/bin/spark-submit simple_job.py --jars /usr/lib/spark/jars/guava-32.0.1-jre.jar --conf spark.jars.ivy=/tmp/.ivy --master local[*] update: enabled: true diff --git a/spark-3.5-scala-2.13/make-distribution.patch b/spark-3.5-scala-2.13/make-distribution.patch new file mode 100644 index 00000000000..9816073ef18 --- /dev/null +++ b/spark-3.5-scala-2.13/make-distribution.patch @@ -0,0 +1,21 @@ +diff --git a/dev/make-distribution.sh b/dev/make-distribution.sh +index ef7c010e930..1769ecfad29 100755 +--- a/dev/make-distribution.sh ++++ b/dev/make-distribution.sh + +@@ -36,7 +36,7 @@ MAKE_TGZ=false + MAKE_PIP=false + MAKE_R=false + NAME=none +-MVN="$SPARK_HOME/build/mvn" ++MVN="mvn" + +@@ -166,7 +166,7 @@ export MAVEN_OPTS="${MAVEN_OPTS:--Xss128m -Xmx4g -XX:ReservedCodeCacheSize=128m} + # Store the command as an array because $MVN variable might have spaces in it. + # Normal quoting tricks don't work. + # See: http://mywiki.wooledge.org/BashFAQ/050 +-BUILD_COMMAND=("$MVN" clean package \ ++BUILD_COMMAND=("$MVN" -T$(grep -c processor /proc/cpuinfo) clean package \ + -DskipTests \ + -Dmaven.javadoc.skip=true \ + -Dmaven.scaladoc.skip=true \ \ No newline at end of file diff --git a/spark-3.5-scala-2.13/pombump-deps.yaml b/spark-3.5-scala-2.13/pombump-deps.yaml new file mode 100644 index 00000000000..869bd886be6 --- /dev/null +++ b/spark-3.5-scala-2.13/pombump-deps.yaml @@ -0,0 +1,11 @@ +patches: + - groupId: com.squareup.okio + artifactId: okio + version: 1.17.6 + scope: import + type: jar + - groupId: com.google.code.gson + artifactId: gson + version: 2.10.1 + scope: import + type: jar