Automatically install latest spark version (#2075)
* Automatically install latest pyspark version
* Better text
* Do not use shutil to keep behaviour
* Make setup_script cwd independent
* Use _get_program_version to calculate spark version
* Update setup_spark.py reqs
* Update setup_spark.py
* Add info about HADOOP_VERSION
* Add customization back
* Better text
* Specify build args when they are actually needed
* Better text
* Better code
* Better code
* Better text
* Get rid of warning
* Improve code
* Remove information about checksum
* Better text
Parent: c122930 · Commit: c294e9e
Showing 5 changed files with 154 additions and 44 deletions.
setup_spark.py (new file, +107 lines):
#!/usr/bin/env python3
# Copyright (c) Jupyter Development Team.
# Distributed under the terms of the Modified BSD License.

# Requirements:
# - Run as the root user
# - Required env variables: SPARK_HOME, HADOOP_VERSION, SPARK_DOWNLOAD_URL
# - Optional env variables: SPARK_VERSION, SCALA_VERSION

import os
import subprocess
from pathlib import Path

import requests
from bs4 import BeautifulSoup
def get_all_refs(url: str) -> list[str]:
    """
    Get all the references for a given webpage
    """
    resp = requests.get(url)
    soup = BeautifulSoup(resp.text, "html.parser")
    return [a["href"] for a in soup.find_all("a", href=True)]
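
# Illustration (editor's annotation, not part of the upstream file): on the
# Apache archive index, the hrefs collected above include directory links
# such as "spark-3.5.0/" or "spark-3.4.2/", which get_spark_version() below
# filters and strips down to bare version strings.
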
def get_spark_version() -> str:
    """
    If ${SPARK_VERSION} env variable is non-empty, simply returns it
    Otherwise, returns the last stable version of Spark using spark archive
    """
    if (version := os.environ["SPARK_VERSION"]) != "":
        return version
    all_refs = get_all_refs("https://archive.apache.org/dist/spark/")
    stable_versions = [
        ref.removeprefix("spark-").removesuffix("/")
        for ref in all_refs
        if ref.startswith("spark-") and "incubating" not in ref and "preview" not in ref
    ]
    # Compare versions semantically
    return max(
        stable_versions, key=lambda ver: [int(sub_ver) for sub_ver in ver.split(".")]
    )
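
# Illustration (editor's annotation, not part of the upstream file): the key
# function above turns "3.5.0" into [3, 5, 0] so versions compare numerically.
# A plain string max() goes wrong once a component reaches two digits:
#   max(["3.9.9", "3.10.0"])                                    -> "3.9.9"
#   max(["3.9.9", "3.10.0"],
#       key=lambda v: [int(p) for p in v.split(".")])           -> "3.10.0"
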
def download_spark(
    spark_version: str,
    hadoop_version: str,
    scala_version: str,
    spark_download_url: Path,
) -> str:
    """
    Downloads and unpacks spark
    The resulting spark directory name is returned
    """
    spark_dir_name = f"spark-{spark_version}-bin-hadoop{hadoop_version}"
    if scala_version:
        spark_dir_name += f"-scala{scala_version}"
    spark_url = spark_download_url / f"spark-{spark_version}" / f"{spark_dir_name}.tgz"

    tmp_file = Path("/tmp/spark.tar.gz")
    subprocess.check_call(
        ["curl", "--progress-bar", "--location", "--output", tmp_file, spark_url]
    )
    subprocess.check_call(
        [
            "tar",
            "xzf",
            tmp_file,
            "-C",
            "/usr/local",
            "--owner",
            "root",
            "--group",
            "root",
            "--no-same-owner",
        ]
    )
    tmp_file.unlink()
    return spark_dir_name
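
# Illustration (editor's annotation, not part of the upstream file): with, say,
# spark_version="3.5.0", hadoop_version="3" and an empty scala_version, the
# function fetches spark-3.5.0-bin-hadoop3.tgz and unpacks it under
# /usr/local/spark-3.5.0-bin-hadoop3. The --owner/--group/--no-same-owner
# flags make the extracted tree root-owned regardless of the ownership
# recorded in the tarball. Note that spark_download_url is a pathlib.Path,
# so the "//" after the URL scheme collapses to a single slash
# (https:/archive...), a lenient form that curl accepts.
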
def prepare_spark(spark_dir_name: str, spark_home: Path) -> None:
    """
    Creates a ${SPARK_HOME} symlink to a versioned spark directory
    Creates a 10spark-config.sh symlink to source PYTHONPATH automatically
    """
    subprocess.check_call(["ln", "-s", f"/usr/local/{spark_dir_name}", spark_home])

    # Add a link in the before_notebook hook in order to source PYTHONPATH automatically
    CONFIG_SCRIPT = "/usr/local/bin/before-notebook.d/10spark-config.sh"
    subprocess.check_call(
        ["ln", "-s", spark_home / "sbin/spark-config.sh", CONFIG_SCRIPT]
    )
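
# Illustration (editor's annotation, not part of the upstream file): with
# SPARK_HOME=/usr/local/spark, the resulting layout is
#   /usr/local/spark -> /usr/local/spark-3.5.0-bin-hadoop3
#   /usr/local/bin/before-notebook.d/10spark-config.sh
#       -> /usr/local/spark/sbin/spark-config.sh
# so the startup hook sources Spark's environment without hardcoding a version.
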
if __name__ == "__main__":
    spark_version = get_spark_version()
    spark_dir_name = download_spark(
        spark_version=spark_version,
        hadoop_version=os.environ["HADOOP_VERSION"],
        scala_version=os.environ["SCALA_VERSION"],
        spark_download_url=Path(os.environ["SPARK_DOWNLOAD_URL"]),
    )
    prepare_spark(
        spark_dir_name=spark_dir_name, spark_home=Path(os.environ["SPARK_HOME"])
    )
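
A minimal sketch of a driver for the script above (hypothetical, for illustration only; the variable names come from the script's header comment, and the values are example choices, not project defaults). Note that even the "optional" variables must be set, if only to an empty string, because the script reads them with os.environ[...] rather than os.environ.get(...):

import os
import subprocess

env = {
    **os.environ,
    # Required by setup_spark.py:
    "SPARK_HOME": "/usr/local/spark",
    "HADOOP_VERSION": "3",
    "SPARK_DOWNLOAD_URL": "https://archive.apache.org/dist/spark",
    # Empty SPARK_VERSION -> resolve the latest stable release from the archive;
    # empty SCALA_VERSION -> use the default Scala build of Spark.
    "SPARK_VERSION": "",
    "SCALA_VERSION": "",
}
# The script symlinks into /usr/local, so it must run as root.
subprocess.check_call(["python3", "setup_spark.py"], env=env)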