Add option to download Spark from a custom URL #125

Merged
merged 5 commits into from
Jun 29, 2016
Merged
8 changes: 7 additions & 1 deletion flintrock/config.yaml.template
@@ -3,10 +3,16 @@ services:
     version: 1.6.1
     # git-commit: latest  # if not 'latest', provide a full commit SHA; e.g. d6dc12ef0146ae409834c78737c116050961f350
     # git-repository:  # optional; defaults to https://github.com/apache/spark
+    # optional; defaults to download from the official Spark S3 bucket
+    #   - must contain a {v} template corresponding to the version
+    #   - Spark must be pre-built
+    #   - must be a .tar.gz file
+    # download-source: "https://www.example.com/files/spark/{v}/spark-{v}.tar.gz"
Contributor Author:
Maybe we should also mention that the download_source has to point to a pre-built Spark?

Owner:
Perhaps something like this?

# optional; defaults to download from the official Spark S3 bucket
#   - must contain a {v} template corresponding to the version
#   - Spark must be pre-built
#   - must be a .tar.gz file
# download-source: "https://www.example.com/files/spark/{v}/spark-{v}.tar.gz"

And then we should update the matching comment for the Hadoop download source to follow similar formatting.

Contributor Author:
Yup, this seems clearer

   hdfs:
     version: 2.7.2
     # optional; defaults to download from a dynamically selected Apache mirror
-    # must contain a {v} template corresponding to the version; must be a .tar.gz file
+    #   - must contain a {v} template corresponding to the version
+    #   - must be a .tar.gz file
     # download-source: "https://www.example.com/files/hadoop/{v}/hadoop-{v}.tar.gz"
 
 provider: ec2
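
A minimal sketch (plain Python, not Flintrock code) of what the {v} requirement above means in practice: the configured version is substituted into the URL with str.format, so a download-source without the placeholder can never resolve to a versioned file. The URL below is the example placeholder from the template, not a real mirror.

# Hypothetical illustration of the {v} template contract described above.
download_source = "https://www.example.com/files/spark/{v}/spark-{v}.tar.gz"
version = "1.6.1"

# A source without the {v} placeholder cannot be resolved to a versioned file.
if "{v}" not in download_source:
    raise ValueError("download-source must contain a {v} template")

url = download_source.format(v=version)
print(url)  # https://www.example.com/files/spark/1.6.1/spark-1.6.1.tar.gz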
7 changes: 6 additions & 1 deletion flintrock/flintrock.py
@@ -186,6 +186,10 @@ def cli(cli_context, config, provider):
 @click.option('--install-spark/--no-install-spark', default=True)
 @click.option('--spark-version',
               help="Spark release version to install.")
+@click.option('--spark-download-source',
+              help="URL to download a release of Spark from.",
+              default='https://s3.amazonaws.com/spark-related-packages/spark-{v}-bin-hadoop2.6.tgz',
+              show_default=True)
 @click.option('--spark-git-commit',
               help="Git commit to build Spark from. "
                    "Set to 'latest' to build Spark from the latest commit on the "
@@ -227,6 +231,7 @@ def launch(
         spark_version,
         spark_git_commit,
         spark_git_repository,
+        spark_download_source,
         assume_yes,
         ec2_key_name,
         ec2_identity_file,
@@ -289,7 +294,7 @@ def launch(
             services += [hdfs]
     if install_spark:
         if spark_version:
-            spark = Spark(version=spark_version)
+            spark = Spark(version=spark_version, download_source=spark_download_source)
         elif spark_git_commit:
             print(
                 "Warning: Building Spark takes a long time. "
10 changes: 4 additions & 6 deletions flintrock/scripts/install-spark.sh
@@ -2,22 +2,20 @@

Contributor Author:
Out of curiosity, why is the download-hadoop script written in Python, whereas this one is in Bash?

Owner:
This is an unfortunate inconsistency, and eventually I think both scripts should be in Python. download-hadoop was written in Python because of the Apache mirror selection logic, which seemed like a bit much to do purely in Bash.

 set -e
 
-spark_version="$1"
-distribution="$2"
+url="$1"
 
 echo "Installing Spark..."
-echo "  version: ${spark_version}"
-echo "  distribution: ${distribution}"
+echo "  from: ${url}"
 
-file="spark-${spark_version}-bin-${distribution}.tgz"
+file="$(basename ${url})"
 
 # S3 is generally reliable, but sometimes when launching really large
 # clusters it can hiccup on us, in which case we'll need to retry the
 # download.
 set +e
 tries=1
 while true; do
-    curl --remote-name "https://s3.amazonaws.com/spark-related-packages/${file}"
+    curl --remote-name "${url}"
     curl_ret=$?
 
     if ((curl_ret == 0)); then
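
A rough Python analogue of the revised script (illustration only; the real install-spark.sh stays in Bash, as discussed above): the local filename is derived from the URL, mirroring $(basename ${url}), and the download is retried because very large cluster launches occasionally hit transient failures.

import os
import time
import urllib.request

def download_with_retries(url: str, max_tries: int = 3) -> str:
    """Fetch url into the current directory, retrying on transient errors."""
    file = os.path.basename(url)  # same idea as $(basename ${url})
    for attempt in range(1, max_tries + 1):
        try:
            urllib.request.urlretrieve(url, file)
            return file
        except OSError:  # urllib.error.URLError is a subclass of OSError
            if attempt == max_tries:
                raise
            time.sleep(2 ** attempt)  # back off a little between tries

# Example (hypothetical URL):
# download_with_retries("https://www.example.com/files/spark/1.6.1/spark-1.6.1.tar.gz")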
14 changes: 7 additions & 7 deletions flintrock/services.py
@@ -202,28 +202,29 @@ def health_check(self, master_host: str):


 class Spark(FlintrockService):
-    def __init__(self, version: str=None, git_commit: str=None, git_repository: str=None):
+    def __init__(self, version: str=None, download_source: str=None,
+                 git_commit: str=None, git_repository: str=None):
         # TODO: Convert these checks into something that throws a proper exception.
         #       Perhaps reuse logic from CLI.
         assert bool(version) ^ bool(git_commit)
         if git_commit:
             assert git_repository
 
         self.version = version
+        self.download_source = download_source
         self.git_commit = git_commit
         self.git_repository = git_repository
 
         self.manifest = {
             'version': version,
+            'download_source': download_source,
             'git_commit': git_commit,
             'git_repository': git_repository}
 
     def install(
             self,
             ssh_client: paramiko.client.SSHClient,
Contributor Author:
As a follow-up, we could support a {d} template in download_source, as is done for the version with {v} (a sketch of this idea appears after the diff below).

Owner:
Agreed, and that would address #88, though it seems like with this PR you can already choose your distribution at will, right?

Contributor Author:
Yup, you can choose your distribution if you specify your own download source.

However, we might want to support the use case of someone only specifying the Spark version and the distribution. What do you think?

Owner:
Hmm, for now let's leave it like this. I have some vague concerns about "officially" supporting other distributions, in case they have annoying problems that we would have to work around. With the download source option, people who really want a different distribution can get it, and we have a bit more of an excuse to deflect support if there are serious issues.

It's definitely something I am open to revisiting in the future, though.

             cluster: FlintrockCluster):
-        # TODO: Allow users to specify the Spark "distribution". (?)
-        distribution = 'hadoop2.6'
 
         print("[{h}] Installing Spark...".format(
             h=ssh_client.get_transport().getpeername()[0]))
@@ -235,15 +236,14 @@ def install(
             localpath=os.path.join(SCRIPTS_DIR, 'install-spark.sh'),
             remotepath='/tmp/install-spark.sh')
         sftp.chmod(path='/tmp/install-spark.sh', mode=0o755)
+        url = self.download_source.format(v=self.version)
         ssh_check_output(
             client=ssh_client,
             command="""
                 set -e
-                /tmp/install-spark.sh {spark_version} {distribution}
+                /tmp/install-spark.sh {url}
                 rm -f /tmp/install-spark.sh
-            """.format(
-                spark_version=shlex.quote(self.version),
-                distribution=shlex.quote(distribution)))
+            """.format(url=shlex.quote(url)))
         else:
             ssh_check_output(
                 client=ssh_client,
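
A purely hypothetical sketch of the {d} idea from the thread above; it was not adopted in this PR. With both placeholders, the default S3 URL could be templated on the distribution as well as the version:

# Hypothetical {d} (distribution) template; only {v} is supported by this PR.
template = "https://s3.amazonaws.com/spark-related-packages/spark-{v}-bin-{d}.tgz"
url = template.format(v="1.6.1", d="hadoop2.6")
print(url)  # https://s3.amazonaws.com/spark-related-packages/spark-1.6.1-bin-hadoop2.6.tgz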