From 911b5d17d271404202f87797f86e6dac94b8968c Mon Sep 17 00:00:00 2001 From: Raphael MESSNER Date: Wed, 6 Apr 2016 11:17:24 +0200 Subject: [PATCH 1/3] Add a new option for an alternate mirror for spark binaries --- flintrock/config.yaml.template | 1 + flintrock/flintrock.py | 8 +++++++- flintrock/scripts/install-spark.sh | 5 ++++- flintrock/services.py | 11 +++++++---- 4 files changed, 19 insertions(+), 6 deletions(-) diff --git a/flintrock/config.yaml.template b/flintrock/config.yaml.template index 4dcba251..050822df 100644 --- a/flintrock/config.yaml.template +++ b/flintrock/config.yaml.template @@ -1,6 +1,7 @@ services: spark: version: 1.6.0 + # preferred-mirror: # optional; default to 'https://s3.amazonaws.com/spark-related-packages/${file}' # git-commit: latest # if not 'latest', provide a full commit SHA; e.g. d6dc12ef0146ae409834c78737c116050961f350 # git-repository: # optional; defaults to https://github.com/apache/spark hdfs: diff --git a/flintrock/flintrock.py b/flintrock/flintrock.py index f05ce035..35084c81 100644 --- a/flintrock/flintrock.py +++ b/flintrock/flintrock.py @@ -186,6 +186,11 @@ def cli(cli_context, config, provider): help="Git commit to build Spark from. " "Set to 'latest' to build Spark from the latest commit on the " "repository's default branch.") +@click.option('--spark-preferred-mirror', + help="HTTP mirror to download the spark binaries. 
" + "Available variable : file, spark_version, distribution", + default="https://s3.amazonaws.com/spark-related-packages/${file}", + show_default=True) @click.option('--spark-git-repository', help="Git repository to clone Spark from.", default='https://github.com/apache/spark', @@ -222,6 +227,7 @@ def launch( spark_version, spark_git_commit, spark_git_repository, + spark_preferred_mirror, assume_yes, ec2_key_name, ec2_identity_file, @@ -286,7 +292,7 @@ def launch( services += [hdfs] if install_spark: if spark_version: - spark = Spark(version=spark_version) + spark = Spark(version=spark_version,preferred_mirror=spark_preferred_mirror) elif spark_git_commit: print( "Warning: Building Spark takes a long time. " diff --git a/flintrock/scripts/install-spark.sh b/flintrock/scripts/install-spark.sh index 39b77261..b0f23ae9 100644 --- a/flintrock/scripts/install-spark.sh +++ b/flintrock/scripts/install-spark.sh @@ -4,12 +4,15 @@ set -e spark_version="$1" distribution="$2" +mirror="$3" echo "Installing Spark..." echo " version: ${spark_version}" echo " distribution: ${distribution}" +echo " mirror: ${mirror}" file="spark-${spark_version}-bin-${distribution}.tgz" +url=$(eval "echo \"$mirror\"") # S3 is generally reliable, but sometimes when launching really large # clusters it can hiccup on us, in which case we'll need to retry the @@ -17,7 +20,7 @@ file="spark-${spark_version}-bin-${distribution}.tgz" set +e tries=1 while true; do - curl --remote-name "https://s3.amazonaws.com/spark-related-packages/${file}" + curl --remote-name "${url}" curl_ret=$? 
if ((curl_ret == 0)); then diff --git a/flintrock/services.py b/flintrock/services.py index 99e2fdf2..51d28982 100644 --- a/flintrock/services.py +++ b/flintrock/services.py @@ -197,7 +197,7 @@ def health_check(self, master_host: str): class Spark(FlintrockService): - def __init__(self, version: str=None, git_commit: str=None, git_repository: str=None): + def __init__(self, version: str=None, git_commit: str=None, git_repository: str=None, preferred_mirror: str="https://s3.amazonaws.com/spark-related-packages/${file}"): # TODO: Convert these checks into something that throws a proper exception. # Perhaps reuse logic from CLI. assert bool(version) ^ bool(git_commit) @@ -207,11 +207,13 @@ def __init__(self, version: str=None, git_commit: str=None, git_repository: str= self.version = version self.git_commit = git_commit self.git_repository = git_repository + self.preferred_mirror = preferred_mirror self.manifest = { 'version': version, 'git_commit': git_commit, - 'git_repository': git_repository} + 'git_repository': git_repository, + 'preferred_mirror': preferred_mirror} def install( self, @@ -234,11 +236,12 @@ def install( client=ssh_client, command=""" set -e - /tmp/install-spark.sh {spark_version} {distribution} + /tmp/install-spark.sh {spark_version} {distribution} {mirror} rm -f /tmp/install-spark.sh """.format( spark_version=shlex.quote(self.version), - distribution=shlex.quote(distribution))) + distribution=shlex.quote(distribution), + mirror=shlex.quote(self.preferred_mirror))) else: ssh_check_output( client=ssh_client, From 40f6d193ea74f6a94761b3bd456ef822a0aebbec Mon Sep 17 00:00:00 2001 From: Raphael MESSNER Date: Wed, 6 Apr 2016 11:39:42 +0200 Subject: [PATCH 2/3] Fix pep8 compliance --- flintrock/flintrock.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flintrock/flintrock.py b/flintrock/flintrock.py index 35084c81..2a56ce83 100644 --- a/flintrock/flintrock.py +++ b/flintrock/flintrock.py @@ -292,7 +292,7 @@ def launch( services += 
[hdfs] if install_spark: if spark_version: - spark = Spark(version=spark_version,preferred_mirror=spark_preferred_mirror) + spark = Spark(version=spark_version, preferred_mirror=spark_preferred_mirror) elif spark_git_commit: print( "Warning: Building Spark takes a long time. " From 42438012275f9ae7babe2e29894bdecce14d912c Mon Sep 17 00:00:00 2001 From: Raphael MESSNER Date: Wed, 6 Apr 2016 18:02:31 +0200 Subject: [PATCH 3/3] Fix terms consistency AND hadoop distribution within spark --- flintrock/config.yaml.template | 3 ++- flintrock/flintrock.py | 13 ++++++++----- flintrock/scripts/install-spark.sh | 13 +++++++------ flintrock/services.py | 25 +++++++++++++------------ 4 files changed, 30 insertions(+), 24 deletions(-) diff --git a/flintrock/config.yaml.template b/flintrock/config.yaml.template index 050822df..bb33831f 100644 --- a/flintrock/config.yaml.template +++ b/flintrock/config.yaml.template @@ -1,7 +1,8 @@ services: spark: version: 1.6.0 - # preferred-mirror: # optional; default to 'https://s3.amazonaws.com/spark-related-packages/${file}' + # distribution: # optional; defaults to '2.6' + # download-source: # optional; defaults to 'https://s3.amazonaws.com/spark-related-packages/spark-${version}-bin-hadoop${distribution}.tgz' # git-commit: latest # if not 'latest', provide a full commit SHA; e.g. d6dc12ef0146ae409834c78737c116050961f350 # git-repository: # optional; defaults to https://github.com/apache/spark hdfs: diff --git a/flintrock/flintrock.py b/flintrock/flintrock.py index 2a56ce83..b5849568 100644 --- a/flintrock/flintrock.py +++ b/flintrock/flintrock.py @@ -182,14 +182,16 @@ def cli(cli_context, config, provider): @click.option('--install-spark/--no-install-spark', default=True) @click.option('--spark-version', help="Spark release version to install.") +@click.option('--spark-distribution', + help="Hadoop distribution for Spark release to install.", default='2.6') @click.option('--spark-git-commit', help="Git commit to build Spark from. 
" "Set to 'latest' to build Spark from the latest commit on the " "repository's default branch.") -@click.option('--spark-preferred-mirror', - help="HTTP mirror to download the spark binaries. " +@click.option('--spark-download-source', + help="HTTP source to download the spark binaries. " "Available variable : file, spark_version, distribution", - default="https://s3.amazonaws.com/spark-related-packages/${file}", + default="https://s3.amazonaws.com/spark-related-packages/spark-${version}-bin-hadoop${distribution}.tgz", show_default=True) @click.option('--spark-git-repository', help="Git repository to clone Spark from.", @@ -225,9 +227,10 @@ def launch( hdfs_version, install_spark, spark_version, + spark_distribution, spark_git_commit, spark_git_repository, - spark_preferred_mirror, + spark_download_source, assume_yes, ec2_key_name, ec2_identity_file, @@ -292,7 +295,7 @@ def launch( services += [hdfs] if install_spark: if spark_version: - spark = Spark(version=spark_version, preferred_mirror=spark_preferred_mirror) + spark = Spark(version=spark_version, distribution=spark_distribution, download_source=spark_download_source) elif spark_git_commit: print( "Warning: Building Spark takes a long time. " diff --git a/flintrock/scripts/install-spark.sh b/flintrock/scripts/install-spark.sh index b0f23ae9..93638365 100644 --- a/flintrock/scripts/install-spark.sh +++ b/flintrock/scripts/install-spark.sh @@ -2,17 +2,18 @@ set -e -spark_version="$1" +version="$1" distribution="$2" -mirror="$3" +download_source="$3" + +url=$(eval "echo \"$download_source\"") +file="${url##*/}" echo "Installing Spark..." 
echo " version: ${spark_version}" echo " distribution: ${distribution}" -echo " mirror: ${mirror}" - -file="spark-${spark_version}-bin-${distribution}.tgz" -url=$(eval "echo \"$mirror\"") +echo " download source: ${download_source}" +echo "Final Spark URL: ${url}" # S3 is generally reliable, but sometimes when launching really large # clusters it can hiccup on us, in which case we'll need to retry the diff --git a/flintrock/services.py b/flintrock/services.py index 51d28982..6d976c71 100644 --- a/flintrock/services.py +++ b/flintrock/services.py @@ -197,7 +197,7 @@ def health_check(self, master_host: str): class Spark(FlintrockService): - def __init__(self, version: str=None, git_commit: str=None, git_repository: str=None, preferred_mirror: str="https://s3.amazonaws.com/spark-related-packages/${file}"): + def __init__(self, version: str=None, distribution: str=None, git_commit: str=None, git_repository: str=None, download_source: str="https://s3.amazonaws.com/spark-related-packages/spark-${version}-bin-hadoop${distribution}.tgz"): # TODO: Convert these checks into something that throws a proper exception. # Perhaps reuse logic from CLI. assert bool(version) ^ bool(git_commit) @@ -205,22 +205,22 @@ def __init__(self, version: str=None, git_commit: str=None, git_repository: str= assert git_repository self.version = version + self.distribution = distribution self.git_commit = git_commit self.git_repository = git_repository - self.preferred_mirror = preferred_mirror + self.download_source = download_source self.manifest = { 'version': version, + 'distribution': distribution, 'git_commit': git_commit, 'git_repository': git_repository, - 'preferred_mirror': preferred_mirror} + 'download_source': download_source} def install( self, ssh_client: paramiko.client.SSHClient, cluster: FlintrockCluster): - # TODO: Allow users to specify the Spark "distribution". (?) 
- distribution = 'hadoop2.6' print("[{h}] Installing Spark...".format( h=ssh_client.get_transport().getpeername()[0])) @@ -236,12 +236,12 @@ def install( client=ssh_client, command=""" set -e - /tmp/install-spark.sh {spark_version} {distribution} {mirror} + /tmp/install-spark.sh {version} {distribution} {download_source} rm -f /tmp/install-spark.sh """.format( - spark_version=shlex.quote(self.version), - distribution=shlex.quote(distribution), - mirror=shlex.quote(self.preferred_mirror))) + version=shlex.quote(self.version), + distribution=shlex.quote(self.distribution), + download_source=shlex.quote(self.download_source))) else: ssh_check_output( client=ssh_client, @@ -258,13 +258,14 @@ def install( cd spark git reset --hard {commit} if [ -e "make-distribution.sh" ]; then - ./make-distribution.sh -Phadoop-2.6 + ./make-distribution.sh -Phadoop-{distribution} else - ./dev/make-distribution.sh -Phadoop-2.6 + ./dev/make-distribution.sh -Phadoop-{distribution} fi """.format( repo=shlex.quote(self.git_repository), - commit=shlex.quote(self.git_commit))) + commit=shlex.quote(self.git_commit), + distribution=shlex.quote(self.distribution))) except Exception as e: # TODO: This should be a more specific exception. print("Error: Failed to install Spark.", file=sys.stderr)