From b74418220768a48580795b8fe1b580a63956498a Mon Sep 17 00:00:00 2001 From: Ayaz Salikhov Date: Tue, 22 Oct 2024 11:47:45 +0100 Subject: [PATCH] Start using spark4-preview versions (#2159) * Start using spark4-preview versions * Allow to download preview versions * Expect warnings in spark * Disable local_sparklyr test for now --- CHANGELOG.md | 9 +++++++++ images/pyspark-notebook/Dockerfile | 2 +- images/pyspark-notebook/setup_spark.py | 2 +- tests/all-spark-notebook/test_spark_notebooks.py | 6 +++++- tests/pyspark-notebook/test_spark.py | 12 ++++++++++-- tests/pyspark-notebook/units/unit_pandas_version.py | 2 +- 6 files changed, 27 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 51a43c5d03..8da0848ab5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,15 @@ This changelog only contains breaking and/or significant changes manually introduced to this repository (using Pull Requests). All image manifests can be found in [the wiki](https://github.com/jupyter/docker-stacks/wiki). +## 2024-10-22 + +Affected: users of the `pyspark-notebook` and `all-spark-notebook` images + +- **Breaking:** Start using Spark 4.0.0 preview versions ([#2159](https://github.com/jupyter/docker-stacks/pull/2159)). + `sparklyr` doesn't seem to support Spark v4 yet when using Spark locally. + + Reason: Spark v3 is not compatible with Python 3.12, and [the voting group has decided](https://github.com/jupyter/docker-stacks/pull/2072#issuecomment-2414123851) to switch to the Spark v4 preview version. 
+ ## 2024-10-09 Affected: users building a custom set of images diff --git a/images/pyspark-notebook/Dockerfile b/images/pyspark-notebook/Dockerfile index 71b1c81ce6..8585232c72 100644 --- a/images/pyspark-notebook/Dockerfile +++ b/images/pyspark-notebook/Dockerfile @@ -63,7 +63,7 @@ USER ${NB_UID} RUN mamba install --yes \ 'grpcio-status' \ 'grpcio' \ - 'pandas=2.0.3' \ + 'pandas=2.2.2' \ 'pyarrow' && \ mamba clean --all -f -y && \ fix-permissions "${CONDA_DIR}" && \ diff --git a/images/pyspark-notebook/setup_spark.py b/images/pyspark-notebook/setup_spark.py index 266f2f7885..79e571af56 100755 --- a/images/pyspark-notebook/setup_spark.py +++ b/images/pyspark-notebook/setup_spark.py @@ -36,7 +36,7 @@ def get_latest_spark_version() -> str: stable_versions = [ ref.removeprefix("spark-").removesuffix("/") for ref in all_refs - if ref.startswith("spark-") and "incubating" not in ref and "preview" not in ref + if ref.startswith("spark-") and "incubating" not in ref ] # Compare versions semantically diff --git a/tests/all-spark-notebook/test_spark_notebooks.py b/tests/all-spark-notebook/test_spark_notebooks.py index 7e54e5b04c..81b172846a 100644 --- a/tests/all-spark-notebook/test_spark_notebooks.py +++ b/tests/all-spark-notebook/test_spark_notebooks.py @@ -14,7 +14,7 @@ @pytest.mark.flaky(retries=3, delay=1) @pytest.mark.parametrize( "test_file", - ["issue_1168", "local_pyspark", "local_sparklyr", "local_sparkR"], + ["issue_1168", "local_pyspark", "local_sparkR"], ) def test_nbconvert(container: TrackedContainer, test_file: str) -> None: """Check if Spark notebooks can be executed""" @@ -31,10 +31,14 @@ def test_nbconvert(container: TrackedContainer, test_file: str) -> None: ) logs = container.run_and_wait( timeout=60, + no_warnings=False, volumes={str(host_data_dir): {"bind": cont_data_dir, "mode": "ro"}}, tty=True, command=["bash", "-c", command], ) + warnings = TrackedContainer.get_warnings(logs) + assert len(warnings) == 1 + assert "Using incubator modules: 
jdk.incubator.vector" in warnings[0] expected_file = f"{output_dir}/{test_file}.md" assert expected_file in logs, f"Expected file {expected_file} not generated" diff --git a/tests/pyspark-notebook/test_spark.py b/tests/pyspark-notebook/test_spark.py index 211432f01e..2ba32cc9fb 100644 --- a/tests/pyspark-notebook/test_spark.py +++ b/tests/pyspark-notebook/test_spark.py @@ -3,12 +3,20 @@ import logging from tests.conftest import TrackedContainer -from tests.run_command import run_command LOGGER = logging.getLogger(__name__) def test_spark_shell(container: TrackedContainer) -> None: """Checking if Spark (spark-shell) is running properly""" - logs = run_command(container, 'spark-shell <<< "1+1"', timeout=60) + logs = container.run_and_wait( + timeout=60, + no_warnings=False, + tty=True, + command=["bash", "-c", 'spark-shell <<< "1+1"'], + ) + warnings = TrackedContainer.get_warnings(logs) + assert len(warnings) == 1 + assert "Using incubator modules: jdk.incubator.vector" in warnings[0] + assert "res0: Int = 2" in logs, "spark-shell does not work" diff --git a/tests/pyspark-notebook/units/unit_pandas_version.py b/tests/pyspark-notebook/units/unit_pandas_version.py index 03920db4b4..802a219215 100644 --- a/tests/pyspark-notebook/units/unit_pandas_version.py +++ b/tests/pyspark-notebook/units/unit_pandas_version.py @@ -2,4 +2,4 @@ # Distributed under the terms of the Modified BSD License. import pandas -assert pandas.__version__ == "2.0.3" +assert pandas.__version__ == "2.2.2"