Add a new option for an alternate mirror for spark binaries #104

Closed · wants to merge 3 commits
2 changes: 2 additions & 0 deletions flintrock/config.yaml.template
@@ -1,6 +1,8 @@
services:
spark:
version: 1.6.0
# distribution: # optional; default to '2.6'
Owner:
Style nitpick: Two spaces before the #; "defaults" and not "default"

Owner:
Hmm, can we leave out the ability to specify distribution for now? I'm not sure about how best to name this option (e.g. there are non-Hadoop distributions like CDH, but we are assuming Hadoop) and, more importantly, I haven't fully considered the implications of supporting user-specified distributions.

# download-source: # optional; default to 'https://s3.amazonaws.com/spark-related-packages/spark-${version}-bin-hadoop${distribution}.tgz'
Owner:
I prefer the variable substitution to be done in Python and not Bash, so the template variable should be {version} and not ${version}. This is consistent with how Flintrock substitutes variables in the config templates, for example.

Same style nitpicks as above.
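
To illustrate the suggestion, here is a minimal sketch of what Python-side substitution with str.format could look like. The helper name resolve_download_source and the constant are hypothetical, not code from this PR:

DEFAULT_DOWNLOAD_SOURCE = (
    'https://s3.amazonaws.com/spark-related-packages/'
    'spark-{version}-bin-hadoop{distribution}.tgz')

def resolve_download_source(template: str, version: str, distribution: str) -> str:
    # Expand {version} and {distribution} in Python, before the URL
    # ever reaches the shell, so install-spark.sh needs no eval.
    return template.format(version=version, distribution=distribution)

# Example:
# resolve_download_source(DEFAULT_DOWNLOAD_SOURCE, '1.6.0', '2.6')
# -> 'https://s3.amazonaws.com/spark-related-packages/spark-1.6.0-bin-hadoop2.6.tgz'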

# git-commit: latest # if not 'latest', provide a full commit SHA; e.g. d6dc12ef0146ae409834c78737c116050961f350
# git-repository: # optional; defaults to https://github.com/apache/spark
hdfs:
11 changes: 10 additions & 1 deletion flintrock/flintrock.py
@@ -182,10 +182,17 @@ def cli(cli_context, config, provider):
@click.option('--install-spark/--no-install-spark', default=True)
@click.option('--spark-version',
help="Spark release version to install.")
@click.option('--spark-distribution',
help="Hadoop distribution for Spark release to install.", default='2.6')
@click.option('--spark-git-commit',
help="Git commit to build Spark from. "
"Set to 'latest' to build Spark from the latest commit on the "
"repository's default branch.")
@click.option('--spark-download-source',
help="HTTP source to download the spark binaries. "
"Available variable : file, spark_version, distribution",
default="https://s3.amazonaws.com/spark-related-packages/spark-${version}-bin-hadoop${distribution}.tgz",
Owner:
Same comment about {version} vs. ${version}.

show_default=True)
@click.option('--spark-git-repository',
help="Git repository to clone Spark from.",
default='https://github.com/apache/spark',
@@ -220,8 +227,10 @@ def launch(
hdfs_version,
install_spark,
spark_version,
spark_distribution,
spark_git_commit,
spark_git_repository,
spark_download_source,
assume_yes,
ec2_key_name,
ec2_identity_file,
@@ -286,7 +295,7 @@ def launch(
services += [hdfs]
if install_spark:
if spark_version:
spark = Spark(version=spark_version)
spark = Spark(version=spark_version, distribution=spark_distribution, download_source=spark_download_source)
elif spark_git_commit:
print(
"Warning: Building Spark takes a long time. "
12 changes: 8 additions & 4 deletions flintrock/scripts/install-spark.sh
@@ -2,22 +2,26 @@

set -e

spark_version="$1"
version="$1"
distribution="$2"
download_source="$3"

url=$(eval "echo \"$download_source\"")
Owner:
I think doing the variable substitution in Python should eliminate code smells like this one.
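
For illustration, one way the call site in services.py could pass a fully expanded URL so the script needs no eval. This is a sketch under the assumption that download_source uses {version}-style placeholders, not code from this PR:

url = self.download_source.format(
    version=self.version,
    distribution=self.distribution)
ssh_check_output(
    client=ssh_client,
    command="""
        set -e
        /tmp/install-spark.sh {version} {distribution} {url}
        rm -f /tmp/install-spark.sh
    """.format(
        version=shlex.quote(self.version),
        distribution=shlex.quote(self.distribution),
        url=shlex.quote(url)))

The script would then read url="$3" directly and derive file="${url##*/}" with no eval, as in the line below.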

file="${url##*/}"

echo "Installing Spark..."
echo " version: ${spark_version}"
echo " distribution: ${distribution}"

file="spark-${spark_version}-bin-${distribution}.tgz"
echo " download source: ${download_source}"
echo "Final Spark URL: ${url}"

# S3 is generally reliable, but sometimes when launching really large
# clusters it can hiccup on us, in which case we'll need to retry the
# download.
set +e
tries=1
while true; do
curl --remote-name "https://s3.amazonaws.com/spark-related-packages/${file}"
curl --remote-name "${url}"
curl_ret=$?

if ((curl_ret == 0)); then
24 changes: 14 additions & 10 deletions flintrock/services.py
@@ -197,28 +197,30 @@ def health_check(self, master_host: str):


class Spark(FlintrockService):
def __init__(self, version: str=None, git_commit: str=None, git_repository: str=None):
def __init__(self, version: str=None, distribution: str=None, git_commit: str=None, git_repository: str=None, download_source: str="https://s3.amazonaws.com/spark-related-packages/spark-${version}-bin-hadoop${distribution}.tgz"):
# TODO: Convert these checks into something that throws a proper exception.
# Perhaps reuse logic from CLI.
assert bool(version) ^ bool(git_commit)
if git_commit:
assert git_repository

self.version = version
self.distribution = distribution
self.git_commit = git_commit
self.git_repository = git_repository
self.download_source = download_source

self.manifest = {
'version': version,
'distribution': distribution,
'git_commit': git_commit,
'git_repository': git_repository}
'git_repository': git_repository,
'download_source': download_source}

def install(
self,
ssh_client: paramiko.client.SSHClient,
cluster: FlintrockCluster):
# TODO: Allow users to specify the Spark "distribution". (?)
distribution = 'hadoop2.6'

print("[{h}] Installing Spark...".format(
h=ssh_client.get_transport().getpeername()[0]))
@@ -234,11 +236,12 @@ def install(
client=ssh_client,
command="""
set -e
/tmp/install-spark.sh {spark_version} {distribution}
/tmp/install-spark.sh {version} {distribution} {download_source}
rm -f /tmp/install-spark.sh
""".format(
spark_version=shlex.quote(self.version),
distribution=shlex.quote(distribution)))
version=shlex.quote(self.version),
distribution=shlex.quote(self.distribution),
download_source=shlex.quote(self.download_source)))
else:
ssh_check_output(
client=ssh_client,
@@ -255,13 +258,14 @@ def install(
cd spark
git reset --hard {commit}
if [ -e "make-distribution.sh" ]; then
./make-distribution.sh -Phadoop-2.6
./make-distribution.sh -Phadoop-{distribution}
else
./dev/make-distribution.sh -Phadoop-2.6
./dev/make-distribution.sh -Phadoop-{distribution}
fi
""".format(
repo=shlex.quote(self.git_repository),
commit=shlex.quote(self.git_commit)))
commit=shlex.quote(self.git_commit),
distribution=shlex.quote(self.distribution)))
except Exception as e:
# TODO: This should be a more specific exception.
print("Error: Failed to install Spark.", file=sys.stderr)