From 911b5d17d271404202f87797f86e6dac94b8968c Mon Sep 17 00:00:00 2001
From: Raphael MESSNER <raphael.messner@adotmob.com>
Date: Wed, 6 Apr 2016 11:17:24 +0200
Subject: [PATCH 1/3] Add a new option for an alternate mirror for spark
 binaries

---
 flintrock/config.yaml.template     |  1 +
 flintrock/flintrock.py             |  8 +++++++-
 flintrock/scripts/install-spark.sh |  5 ++++-
 flintrock/services.py              | 11 +++++++----
 4 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/flintrock/config.yaml.template b/flintrock/config.yaml.template
index 4dcba251..050822df 100644
--- a/flintrock/config.yaml.template
+++ b/flintrock/config.yaml.template
@@ -1,6 +1,7 @@
 services:
   spark:
     version: 1.6.0
+    # preferred-mirror: # optional; default to 'https://s3.amazonaws.com/spark-related-packages/${file}'
     # git-commit: latest  # if not 'latest', provide a full commit SHA; e.g. d6dc12ef0146ae409834c78737c116050961f350
     # git-repository:  # optional; defaults to https://github.com/apache/spark
   hdfs:
diff --git a/flintrock/flintrock.py b/flintrock/flintrock.py
index f05ce035..35084c81 100644
--- a/flintrock/flintrock.py
+++ b/flintrock/flintrock.py
@@ -186,6 +186,11 @@ def cli(cli_context, config, provider):
               help="Git commit to build Spark from. "
                    "Set to 'latest' to build Spark from the latest commit on the "
                    "repository's default branch.")
+@click.option('--spark-preferred-mirror',
+              help="HTTP mirror to download the spark binaries. "
+                   "Available variable : file, spark_version, distribution",
+              default="https://s3.amazonaws.com/spark-related-packages/${file}",
+              show_default=True)
 @click.option('--spark-git-repository',
               help="Git repository to clone Spark from.",
               default='https://github.com/apache/spark',
@@ -222,6 +227,7 @@ def launch(
         spark_version,
         spark_git_commit,
         spark_git_repository,
+        spark_preferred_mirror,
         assume_yes,
         ec2_key_name,
         ec2_identity_file,
@@ -286,7 +292,7 @@ def launch(
         services += [hdfs]
     if install_spark:
         if spark_version:
-            spark = Spark(version=spark_version)
+            spark = Spark(version=spark_version,preferred_mirror=spark_preferred_mirror)
         elif spark_git_commit:
             print(
                 "Warning: Building Spark takes a long time. "
diff --git a/flintrock/scripts/install-spark.sh b/flintrock/scripts/install-spark.sh
index 39b77261..b0f23ae9 100644
--- a/flintrock/scripts/install-spark.sh
+++ b/flintrock/scripts/install-spark.sh
@@ -4,12 +4,15 @@ set -e
 
 spark_version="$1"
 distribution="$2"
+mirror="$3"
 
 echo "Installing Spark..."
 echo "  version: ${spark_version}"
 echo "  distribution: ${distribution}"
+echo "  mirror: ${mirror}"
 
 file="spark-${spark_version}-bin-${distribution}.tgz"
+url=$(eval "echo \"$mirror\"")
 
 # S3 is generally reliable, but sometimes when launching really large
 # clusters it can hiccup on us, in which case we'll need to retry the
@@ -17,7 +20,7 @@ file="spark-${spark_version}-bin-${distribution}.tgz"
 set +e
 tries=1
 while true; do
-    curl --remote-name "https://s3.amazonaws.com/spark-related-packages/${file}"
+    curl --remote-name "${url}"
     curl_ret=$?
 
     if ((curl_ret == 0)); then
diff --git a/flintrock/services.py b/flintrock/services.py
index 99e2fdf2..51d28982 100644
--- a/flintrock/services.py
+++ b/flintrock/services.py
@@ -197,7 +197,7 @@ def health_check(self, master_host: str):
 
 
 class Spark(FlintrockService):
-    def __init__(self, version: str=None, git_commit: str=None, git_repository: str=None):
+    def __init__(self, version: str=None, git_commit: str=None, git_repository: str=None, preferred_mirror: str="https://s3.amazonaws.com/spark-related-packages/${file}"):
         # TODO: Convert these checks into something that throws a proper exception.
         #       Perhaps reuse logic from CLI.
         assert bool(version) ^ bool(git_commit)
@@ -207,11 +207,13 @@ def __init__(self, version: str=None, git_commit: str=None, git_repository: str=
         self.version = version
         self.git_commit = git_commit
         self.git_repository = git_repository
+        self.preferred_mirror = preferred_mirror
 
         self.manifest = {
             'version': version,
             'git_commit': git_commit,
-            'git_repository': git_repository}
+            'git_repository': git_repository,
+            'preferred_mirror': preferred_mirror}
 
     def install(
             self,
@@ -234,11 +236,12 @@ def install(
                     client=ssh_client,
                     command="""
                         set -e
-                        /tmp/install-spark.sh {spark_version} {distribution}
+                        /tmp/install-spark.sh {spark_version} {distribution} {mirror}
                         rm -f /tmp/install-spark.sh
                     """.format(
                             spark_version=shlex.quote(self.version),
-                            distribution=shlex.quote(distribution)))
+                            distribution=shlex.quote(distribution),
+                            mirror=shlex.quote(self.preferred_mirror)))
             else:
                 ssh_check_output(
                     client=ssh_client,

From 40f6d193ea74f6a94761b3bd456ef822a0aebbec Mon Sep 17 00:00:00 2001
From: Raphael MESSNER <raphael.messner@adotmob.com>
Date: Wed, 6 Apr 2016 11:39:42 +0200
Subject: [PATCH 2/3] Fix pep8 compliance

---
 flintrock/flintrock.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/flintrock/flintrock.py b/flintrock/flintrock.py
index 35084c81..2a56ce83 100644
--- a/flintrock/flintrock.py
+++ b/flintrock/flintrock.py
@@ -292,7 +292,7 @@ def launch(
         services += [hdfs]
     if install_spark:
         if spark_version:
-            spark = Spark(version=spark_version,preferred_mirror=spark_preferred_mirror)
+            spark = Spark(version=spark_version, preferred_mirror=spark_preferred_mirror)
         elif spark_git_commit:
             print(
                 "Warning: Building Spark takes a long time. "

From 42438012275f9ae7babe2e29894bdecce14d912c Mon Sep 17 00:00:00 2001
From: Raphael MESSNER <raphael.messner@adotmob.com>
Date: Wed, 6 Apr 2016 18:02:31 +0200
Subject: [PATCH 3/3] Rename mirror option to download source and make the
 Hadoop distribution configurable for Spark

---
 flintrock/config.yaml.template     |  3 ++-
 flintrock/flintrock.py             | 13 ++++++++-----
 flintrock/scripts/install-spark.sh | 13 +++++++------
 flintrock/services.py              | 25 +++++++++++++------------
 4 files changed, 30 insertions(+), 24 deletions(-)

diff --git a/flintrock/config.yaml.template b/flintrock/config.yaml.template
index 050822df..bb33831f 100644
--- a/flintrock/config.yaml.template
+++ b/flintrock/config.yaml.template
@@ -1,7 +1,8 @@
 services:
   spark:
     version: 1.6.0
-    # preferred-mirror: # optional; default to 'https://s3.amazonaws.com/spark-related-packages/${file}'
+    # distribution: # optional Hadoop version; defaults to '2.6'
+    # download-source: # optional; defaults to 'https://s3.amazonaws.com/spark-related-packages/spark-${version}-bin-hadoop${distribution}.tgz'
     # git-commit: latest  # if not 'latest', provide a full commit SHA; e.g. d6dc12ef0146ae409834c78737c116050961f350
     # git-repository:  # optional; defaults to https://github.com/apache/spark
   hdfs:
diff --git a/flintrock/flintrock.py b/flintrock/flintrock.py
index 2a56ce83..b5849568 100644
--- a/flintrock/flintrock.py
+++ b/flintrock/flintrock.py
@@ -182,14 +182,16 @@ def cli(cli_context, config, provider):
 @click.option('--install-spark/--no-install-spark', default=True)
 @click.option('--spark-version',
               help="Spark release version to install.")
+@click.option('--spark-distribution',
+              help="Hadoop distribution for Spark release to install.", default='2.6')
 @click.option('--spark-git-commit',
               help="Git commit to build Spark from. "
                    "Set to 'latest' to build Spark from the latest commit on the "
                    "repository's default branch.")
-@click.option('--spark-preferred-mirror',
-              help="HTTP mirror to download the spark binaries. "
+@click.option('--spark-download-source',
+              help="HTTP source to download the spark binaries. "
                    "Available variable : file, spark_version, distribution",
-              default="https://s3.amazonaws.com/spark-related-packages/${file}",
+              default="https://s3.amazonaws.com/spark-related-packages/spark-${version}-bin-hadoop${distribution}.tgz",
               show_default=True)
 @click.option('--spark-git-repository',
               help="Git repository to clone Spark from.",
@@ -225,9 +227,10 @@ def launch(
         hdfs_version,
         install_spark,
         spark_version,
+        spark_distribution,
         spark_git_commit,
         spark_git_repository,
-        spark_preferred_mirror,
+        spark_download_source,
         assume_yes,
         ec2_key_name,
         ec2_identity_file,
@@ -292,7 +295,7 @@ def launch(
         services += [hdfs]
     if install_spark:
         if spark_version:
-            spark = Spark(version=spark_version, preferred_mirror=spark_preferred_mirror)
+            spark = Spark(version=spark_version, distribution=spark_distribution, download_source=spark_download_source)
         elif spark_git_commit:
             print(
                 "Warning: Building Spark takes a long time. "
diff --git a/flintrock/scripts/install-spark.sh b/flintrock/scripts/install-spark.sh
index b0f23ae9..93638365 100644
--- a/flintrock/scripts/install-spark.sh
+++ b/flintrock/scripts/install-spark.sh
@@ -2,17 +2,18 @@
 
 set -e
 
-spark_version="$1"
+version="$1"
 distribution="$2"
-mirror="$3"
+download_source="$3"
+
+url=$(eval "echo \"$download_source\"")
+file="${url##*/}"
 
 echo "Installing Spark..."
 echo "  version: ${spark_version}"
 echo "  distribution: ${distribution}"
-echo "  mirror: ${mirror}"
-
-file="spark-${spark_version}-bin-${distribution}.tgz"
-url=$(eval "echo \"$mirror\"")
+echo "  download source: ${download_source}"
+echo "Final Spark URL: ${url}"
 
 # S3 is generally reliable, but sometimes when launching really large
 # clusters it can hiccup on us, in which case we'll need to retry the
diff --git a/flintrock/services.py b/flintrock/services.py
index 51d28982..6d976c71 100644
--- a/flintrock/services.py
+++ b/flintrock/services.py
@@ -197,7 +197,7 @@ def health_check(self, master_host: str):
 
 
 class Spark(FlintrockService):
-    def __init__(self, version: str=None, git_commit: str=None, git_repository: str=None, preferred_mirror: str="https://s3.amazonaws.com/spark-related-packages/${file}"):
+    def __init__(self, version: str=None, distribution: str=None, git_commit: str=None, git_repository: str=None, download_source: str="https://s3.amazonaws.com/spark-related-packages/spark-${version}-bin-hadoop${distribution}.tgz"):
         # TODO: Convert these checks into something that throws a proper exception.
         #       Perhaps reuse logic from CLI.
         assert bool(version) ^ bool(git_commit)
@@ -205,22 +205,22 @@ def __init__(self, version: str=None, git_commit: str=None, git_repository: str=
             assert git_repository
 
         self.version = version
+        self.distribution = distribution
         self.git_commit = git_commit
         self.git_repository = git_repository
-        self.preferred_mirror = preferred_mirror
+        self.download_source = download_source
 
         self.manifest = {
             'version': version,
+            'distribution': distribution,
             'git_commit': git_commit,
             'git_repository': git_repository,
-            'preferred_mirror': preferred_mirror}
+            'download_source': download_source}
 
     def install(
             self,
             ssh_client: paramiko.client.SSHClient,
             cluster: FlintrockCluster):
-        # TODO: Allow users to specify the Spark "distribution". (?)
-        distribution = 'hadoop2.6'
 
         print("[{h}] Installing Spark...".format(
             h=ssh_client.get_transport().getpeername()[0]))
@@ -236,12 +236,12 @@ def install(
                     client=ssh_client,
                     command="""
                         set -e
-                        /tmp/install-spark.sh {spark_version} {distribution} {mirror}
+                        /tmp/install-spark.sh {version} {distribution} {download_source}
                         rm -f /tmp/install-spark.sh
                     """.format(
-                            spark_version=shlex.quote(self.version),
-                            distribution=shlex.quote(distribution),
-                            mirror=shlex.quote(self.preferred_mirror)))
+                            version=shlex.quote(self.version),
+                            distribution=shlex.quote(self.distribution),
+                            download_source=shlex.quote(self.download_source)))
             else:
                 ssh_check_output(
                     client=ssh_client,
@@ -258,13 +258,14 @@ def install(
                         cd spark
                         git reset --hard {commit}
                         if [ -e "make-distribution.sh" ]; then
-                            ./make-distribution.sh -Phadoop-2.6
+                            ./make-distribution.sh -Phadoop-{distribution}
                         else
-                            ./dev/make-distribution.sh -Phadoop-2.6
+                            ./dev/make-distribution.sh -Phadoop-{distribution}
                         fi
                     """.format(
                         repo=shlex.quote(self.git_repository),
-                        commit=shlex.quote(self.git_commit)))
+                        commit=shlex.quote(self.git_commit),
+                        distribution=shlex.quote(self.distribution)))
         except Exception as e:
             # TODO: This should be a more specific exception.
             print("Error: Failed to install Spark.", file=sys.stderr)