diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 369d60ef61..14391a7999 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -41,10 +41,12 @@ repos:
         args: [--config, ./mypy.ini]
         additional_dependencies:
           [
+            "beautifulsoup4",
             "numpy",
             "pytest",
             "requests",
             "urllib3",
+            "types-beautifulsoup4",
             "types-requests",
             "types-tabulate",
             "types-urllib3",
diff --git a/docs/using/specifics.md b/docs/using/specifics.md
index f0ede8f4cf..c578e37803 100644
--- a/docs/using/specifics.md
+++ b/docs/using/specifics.md
@@ -42,18 +42,20 @@ ipython profile create
 You can build a `pyspark-notebook` image with a different `Spark` version by overriding the default value of the following arguments at build time.
 `all-spark-notebook` is inherited from `pyspark-notebook`, so you have to first build `pyspark-notebook` and then `all-spark-notebook` to get the same version in `all-spark-notebook`.
 
-- Spark distribution is defined by the combination of Spark, Hadoop, and Scala versions and verified by the package checksum,
+- Spark distribution is defined by the combination of Spark, Hadoop, and Scala versions,
   see [Download Apache Spark](https://spark.apache.org/downloads.html) and the [archive repo](https://archive.apache.org/dist/spark/) for more information.
-  - `spark_version`: The Spark version to install (`3.3.0`).
-  - `hadoop_version`: The Hadoop version (`3.2`).
-  - `scala_version`: The Scala version (`2.13`, optional).
-  - `spark_checksum`: The package checksum (`BFE4540...`).
-  - `openjdk_version`: The version of the OpenJDK (JRE headless) distribution (`17`).
+  - `openjdk_version`: The version of the OpenJDK (JRE headless) distribution (`17` by default).
     This version needs to match the version supported by the Spark distribution used above.
     See [Spark Overview](https://spark.apache.org/docs/latest/#downloading) and [Ubuntu packages](https://packages.ubuntu.com/search?keywords=openjdk).
-
-- Starting with _Spark >= 3.2_, the distribution file might contain the Scala version.
+  - `spark_version` (optional): The Spark version to install, for example `3.5.0`.
+    If not specified (this is the default), the latest stable Spark version will be installed.
+  - `hadoop_version`: The Hadoop version (`3` by default).
+    Note that _Spark < 3.3_ requires specifying a `major.minor` Hadoop version (e.g. `3.2`).
+  - `scala_version` (optional): The Scala version, for example `2.13` (not specified by default).
+    Starting with _Spark >= 3.2_, the distribution file might contain the Scala version.
+  - `spark_download_url`: The URL to use for Spark downloads.
+    You may need to set it to `https://archive.apache.org/dist/spark/` to download old Spark versions.
 
 For example, here is how to build a `pyspark-notebook` image with Spark `3.2.0`, Hadoop `3.2`, and OpenJDK `11`.
@@ -65,14 +67,14 @@ This recipe is not tested and might be broken.
 # From the root of the project
 # Build the image with different arguments
 docker build --rm --force-rm \
-    -t jupyter/pyspark-notebook:spark-3.2.0 ./images/pyspark-notebook \
+    -t my-pyspark-notebook ./images/pyspark-notebook \
+    --build-arg openjdk_version=11 \
     --build-arg spark_version=3.2.0 \
     --build-arg hadoop_version=3.2 \
-    --build-arg spark_checksum=707DDE035926A50B75E53FCA72CADA519F3239B14A96546911CB4916A58DCF69A1D2BFDD2C7DD5899324DBD82B6EEAB9797A7B4ABF86736FFCA4C26D0E0BF0EE \
-    --build-arg openjdk_version=11
+    --build-arg spark_download_url="https://archive.apache.org/dist/spark/"
 
 # Check the newly built image
-docker run -it --rm quay.io/jupyter/pyspark-notebook:spark-3.2.0 pyspark --version
+docker run -it --rm my-pyspark-notebook pyspark --version
 
 # Welcome to
 #       ____              __
@@ -81,7 +83,12 @@ docker run -it --rm quay.io/jupyter/pyspark-notebook:spark-3.2.0 pyspark --versi
 #    /___/ .__/\_,_/_/ /_/\_\   version 3.2.0
 #       /_/
 
-# Using Scala version 2.13.5, OpenJDK 64-Bit Server VM, 11.0.15
+# Using Scala version 2.12.15, OpenJDK 64-Bit Server VM, 11.0.21
+# Branch HEAD
+# Compiled by user ubuntu on 2021-10-06T12:46:30Z
+# Revision 5d45a415f3a29898d92380380cfd82bfc7f579ea
+# Url https://github.com/apache/spark
+# Type --help for more information.
 ```
 
 ### Usage Examples
diff --git a/images/pyspark-notebook/Dockerfile b/images/pyspark-notebook/Dockerfile
index 334181c373..212e3a5502 100644
--- a/images/pyspark-notebook/Dockerfile
+++ b/images/pyspark-notebook/Dockerfile
@@ -16,49 +16,38 @@ USER root
 # Spark dependencies
 # Default values can be overridden at build time
 # (ARGS are in lowercase to distinguish them from ENV)
-ARG spark_version="3.5.0"
-ARG hadoop_version="3"
-ARG scala_version
-ARG spark_checksum="8883c67e0a138069e597f3e7d4edbbd5c3a565d50b28644aad02856a1ec1da7cb92b8f80454ca427118f69459ea326eaa073cf7b1a860c3b796f4b07c2101319"
 ARG openjdk_version="17"
 
-ENV APACHE_SPARK_VERSION="${spark_version}" \
-    HADOOP_VERSION="${hadoop_version}"
-
 RUN apt-get update --yes && \
     apt-get install --yes --no-install-recommends \
     "openjdk-${openjdk_version}-jre-headless" \
     ca-certificates-java && \
     apt-get clean && rm -rf /var/lib/apt/lists/*
 
-# Spark installation
-WORKDIR /tmp
-
-# You need to use https://archive.apache.org/dist/ website if you want to download old Spark versions
+# If spark_version is not set, latest stable Spark will be installed
+ARG spark_version
+ARG hadoop_version="3"
+# If scala_version is not set, Spark without Scala will be installed
+ARG scala_version
+# URL to use for Spark downloads
+# You need to use https://archive.apache.org/dist/spark/ website if you want to download old Spark versions
 # But it seems to be slower, that's why we use the recommended site for download
-RUN if [ -z "${scala_version}" ]; then \
-    curl --progress-bar --location --output "spark.tgz" \
-        "https://dlcdn.apache.org/spark/spark-${APACHE_SPARK_VERSION}/spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz"; \
-    else \
-    curl --progress-bar --location --output "spark.tgz" \
-        "https://dlcdn.apache.org/spark/spark-${APACHE_SPARK_VERSION}/spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}-scala${scala_version}.tgz"; \
-    fi && \
-    echo "${spark_checksum} *spark.tgz" | sha512sum -c - && \
-    tar xzf "spark.tgz" -C /usr/local --owner root --group root --no-same-owner && \
-    rm "spark.tgz"
+ARG spark_download_url="https://dlcdn.apache.org/spark/"
 
 # Configure Spark
+ENV SPARK_VERSION="${spark_version}" \
+    HADOOP_VERSION="${hadoop_version}" \
+    SCALA_VERSION="${scala_version}" \
+    SPARK_DOWNLOAD_URL="${spark_download_url}"
+
 ENV SPARK_HOME=/usr/local/spark
-ENV SPARK_OPTS="--driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info" \
-    PATH="${PATH}:${SPARK_HOME}/bin"
+ENV PATH="${PATH}:${SPARK_HOME}/bin"
+ENV SPARK_OPTS="--driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info"
+
+COPY setup_spark.py /opt/setup-scripts/
 
-RUN if [ -z "${scala_version}" ]; then \
-    ln -s "spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}" "${SPARK_HOME}"; \
-    else \
-    ln -s "spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}-scala${scala_version}" "${SPARK_HOME}"; \
-    fi && \
-    # Add a link in the before_notebook hook in order to source automatically PYTHONPATH && \
-    ln -s "${SPARK_HOME}/sbin/spark-config.sh" /usr/local/bin/before-notebook.d/10spark-config.sh
+# Setup Spark
+RUN /opt/setup-scripts/setup_spark.py
 
 # Configure IPython system-wide
 COPY ipython_kernel_config.py "/etc/ipython/"
diff --git a/images/pyspark-notebook/setup_spark.py b/images/pyspark-notebook/setup_spark.py
new file mode 100755
index 0000000000..54e5994855
--- /dev/null
+++ b/images/pyspark-notebook/setup_spark.py
@@ -0,0 +1,107 @@
+#!/usr/bin/env python3
+# Copyright (c) Jupyter Development Team.
+# Distributed under the terms of the Modified BSD License.
+
+# Requirements:
+# - Run as the root user
+# - Required env variables: SPARK_HOME, HADOOP_VERSION, SPARK_DOWNLOAD_URL
+# - Optional env variables: SPARK_VERSION, SCALA_VERSION
+
+import os
+import subprocess
+from pathlib import Path
+
+import requests
+from bs4 import BeautifulSoup
+
+
+def get_all_refs(url: str) -> list[str]:
+    """
+    Get all the references for a given webpage
+    """
+    resp = requests.get(url)
+    soup = BeautifulSoup(resp.text, "html.parser")
+    return [a["href"] for a in soup.find_all("a", href=True)]
+
+
+def get_spark_version() -> str:
+    """
+    If ${SPARK_VERSION} env variable is non-empty, simply returns it
+    Otherwise, returns the last stable version of Spark using spark archive
+    """
+    if (version := os.environ["SPARK_VERSION"]) != "":
+        return version
+    all_refs = get_all_refs("https://archive.apache.org/dist/spark/")
+    stable_versions = [
+        ref.removeprefix("spark-").removesuffix("/")
+        for ref in all_refs
+        if ref.startswith("spark-") and "incubating" not in ref and "preview" not in ref
+    ]
+    # Compare versions semantically
+    return max(
+        stable_versions, key=lambda ver: [int(sub_ver) for sub_ver in ver.split(".")]
+    )
+
+
+def download_spark(
+    spark_version: str,
+    hadoop_version: str,
+    scala_version: str,
+    spark_download_url: Path,
+) -> str:
+    """
+    Downloads and unpacks spark
+    The resulting spark directory name is returned
+    """
+    spark_dir_name = f"spark-{spark_version}-bin-hadoop{hadoop_version}"
+    if scala_version:
+        spark_dir_name += f"-scala{scala_version}"
+    spark_url = spark_download_url / f"spark-{spark_version}" / f"{spark_dir_name}.tgz"
+
+    tmp_file = Path("/tmp/spark.tar.gz")
+    subprocess.check_call(
+        ["curl", "--progress-bar", "--location", "--output", tmp_file, spark_url]
+    )
+    subprocess.check_call(
+        [
+            "tar",
+            "xzf",
+            tmp_file,
+            "-C",
+            "/usr/local",
+            "--owner",
+            "root",
+            "--group",
+            "root",
+            "--no-same-owner",
+        ]
+    )
+    tmp_file.unlink()
+    return spark_dir_name
+
+
+def prepare_spark(spark_dir_name: str, spark_home: Path) -> None:
+    """
+    Creates a ${SPARK_HOME} symlink to a versioned spark directory
+    Creates a 10spark-config.sh symlink to source PYTHONPATH automatically
+    """
+    subprocess.check_call(["ln", "-s", f"/usr/local/{spark_dir_name}", spark_home])
+
+    # Add a link in the before_notebook hook in order to source PYTHONPATH automatically
+    CONFIG_SCRIPT = "/usr/local/bin/before-notebook.d/10spark-config.sh"
+    subprocess.check_call(
+        ["ln", "-s", spark_home / "sbin/spark-config.sh", CONFIG_SCRIPT]
+    )
+
+
+if __name__ == "__main__":
+    spark_version = get_spark_version()
+    spark_dir_name = download_spark(
+        spark_version=spark_version,
+        hadoop_version=os.environ["HADOOP_VERSION"],
+        scala_version=os.environ["SCALA_VERSION"],
+        spark_download_url=Path(os.environ["SPARK_DOWNLOAD_URL"]),
+    )
+    prepare_spark(
+        spark_dir_name=spark_dir_name, spark_home=Path(os.environ["SPARK_HOME"])
+    )
diff --git a/tagging/taggers.py b/tagging/taggers.py
index 0aee5188e3..daf987b1e1 100644
--- a/tagging/taggers.py
+++ b/tagging/taggers.py
@@ -128,7 +128,12 @@ def tag_value(container: Container) -> str:
 class SparkVersionTagger(TaggerInterface):
     @staticmethod
     def tag_value(container: Container) -> str:
-        return "spark-" + _get_env_variable(container, "APACHE_SPARK_VERSION")
+        SPARK_VERSION_LINE_PREFIX = r"   /___/ .__/\_,_/_/ /_/\_\   version"
+
+        spark_version = _get_program_version(container, "spark-submit")
+        version_line = spark_version.split("\n")[4]
+        assert version_line.startswith(SPARK_VERSION_LINE_PREFIX)
+        return "spark-" + version_line.split(" ")[-1]
 
 
 class HadoopVersionTagger(TaggerInterface):
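
Reviewer note: below is a minimal, standalone sketch of the version-parsing logic behind the new `SparkVersionTagger`, exercised against the sample `spark-submit --version` banner shown in the docs hunk above. The `sample_output` string is illustrative only; the real tagger obtains this text from a running container via `_get_program_version`.

```python
# Standalone sketch (not part of the diff): how the banner is parsed into an image tag.
SPARK_VERSION_LINE_PREFIX = r"   /___/ .__/\_,_/_/ /_/\_\   version"

# Illustrative sample mirroring the docs output above; a real run captures this
# from `spark-submit --version` inside the container.
sample_output = "\n".join(
    [
        "Welcome to",
        r"      ____              __",
        r"     / __/__  ___ _____/ /__",
        r"    _\ \/ _ \/ _ `/ __/  '_/",
        r"   /___/ .__/\_,_/_/ /_/\_\   version 3.2.0",
        r"      /_/",
    ]
)

version_line = sample_output.split("\n")[4]  # the 5th line carries the version number
assert version_line.startswith(SPARK_VERSION_LINE_PREFIX)
print("spark-" + version_line.split(" ")[-1])  # -> spark-3.2.0
```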