Closed
Changes from all commits
743 commits
647c686
SPARK-2757 [BUILD] [STREAMING] Add Mima test for Spark Sink after 1.1…
srowen Jan 1, 2015
9520a3b
[SPARK-5038] Add explicit return type for implicit functions.
rxin Jan 1, 2015
1e06a41
[HOTFIX] Bind web UI to ephemeral port in DriverSuite
JoshRosen Jan 1, 2015
53722b9
[SPARK-3325][Streaming] Add a parameter to the method print in class …
watermen Jan 2, 2015
cd4e844
Fixed typos in streaming-kafka-integration.md
Jan 2, 2015
2db89cf
[SPARK-5058] Updated broken links
sigmoidanalytics Jan 4, 2015
37a5415
[SPARK-794][Core] Remove sleep() in ClusterScheduler.stop
Jan 4, 2015
6cf67a7
[SPARK-4787] Stop SparkContext if a DAGScheduler init error occurs
tigerquoll Jan 4, 2015
37d7d5c
[SPARK-4631] unit test for MQTT
Jan 5, 2015
3fc9497
[SPARK-4835] Disable validateOutputSpecs for Spark Streaming jobs
JoshRosen Jan 5, 2015
3dbf4f2
[SPARK-5067][Core] Use '===' to compare well-defined case class
zsxwing Jan 5, 2015
5583c3b
[SPARK-5069][Core] Fix the race condition of TaskSchedulerImpl.dagSch…
zsxwing Jan 5, 2015
ed6dc94
[SPARK-5083][Core] Fix a flaky test in TaskResultGetterSuite
zsxwing Jan 5, 2015
136141c
[SPARK-5074][Core] Fix a non-deterministic test failure
zsxwing Jan 5, 2015
2bcf38f
[SPARK-4688] Have a single shared network timeout in Spark
varunsaxena Jan 5, 2015
618d9d5
[SPARK-5057] Log message in failed askWithReply attempts
WangTaoTheTonic Jan 5, 2015
a81b624
[SPARK-4465] runAsSparkUser doesn't affect TaskRunner in Mesos enviro…
jongyoul Jan 5, 2015
dda9d6f
[SPARK-5089][PYSPARK][MLLIB] Fix vector convert
freeman-lab Jan 5, 2015
2ce91a3
[SPARK-5093] Set spark.network.timeout to 120s consistently.
rxin Jan 5, 2015
dbd12d5
[SPARK-5040][SQL] Support expressing unresolved attributes using $"at…
rxin Jan 5, 2015
6ef3660
SPARK-4843 [YARN] Squash ExecutorRunnableUtil and ExecutorRunnable
Jan 6, 2015
a305931
[SPARK-1600] Refactor FileInputStream tests to remove Thread.sleep() …
JoshRosen Jan 6, 2015
8373706
[Minor] Fix comments for GraphX 2D partitioning strategy
Jan 6, 2015
b491f13
SPARK-4159 [CORE] Maven build doesn't run JUnit test suites
srowen Jan 6, 2015
8c1a588
SPARK-5017 [MLlib] - Use SVD to compute determinant and inverse of co…
tgaloppo Jan 6, 2015
e8f7484
[SPARK-5050][Mllib] Add unit test for sqdist
viirya Jan 6, 2015
977cc31
[SPARK-5099][Mllib] Simplify logistic loss function
viirya Jan 7, 2015
718750c
[YARN][SPARK-4929] Bug fix: fix the yarn-client code to support HA
SaintBacchus Jan 7, 2015
84182f0
[SPARK-2165][YARN]add support for setting maxAppAttempts in the Appli…
WangTaoTheTonic Jan 7, 2015
0e703eb
[SPARK-2458] Make failed application log visible on History Server
tsudukim Jan 7, 2015
d1e87b3
[SPARK-5128][MLLib] Add common used log1pExp API in MLUtils
Jan 7, 2015
60fde12
[SPARK-5132][Core]Correct stage Attempt Id key in stageInfofromJson
suyanNone Jan 7, 2015
65c9e10
[SPARK-5126][Core] Verify Spark urls before creating Actors so that i…
zsxwing Jan 8, 2015
536b82f
[SPARK-5116][MLlib] Add extractor for SparseVector and DenseVector
coderxiang Jan 8, 2015
0114e81
SPARK-5087. [YARN] Merge yarn.Client and yarn.ClientBase
sryza Jan 8, 2015
46dca8c
[SPARK-4917] Add a function to convert into a graph with canonical ed…
maropu Jan 8, 2015
60b9227
[SPARK-4989][CORE] avoid wrong eventlog conf cause cluster down in st…
liyezhang556520 Jan 8, 2015
a9940b5
[Minor] Fix the value represented by spark.executor.id for consistency.
sarutak Jan 8, 2015
b4fb97d
[SPARK-5130][Deploy]Take yarn-cluster as cluster mode in spark-submit
WangTaoTheTonic Jan 8, 2015
31d6715
Document that groupByKey will OOM for large keys
Jan 8, 2015
854319e
SPARK-5148 [MLlib] Make usersOut/productsOut storagelevel in ALS conf…
zeitos Jan 8, 2015
d9cad94
[SPARK-4973][CORE] Local directory in the driver of client-mode conti…
sarutak Jan 8, 2015
b14068b
[SPARK-4891][PySpark][MLlib] Add gamma/log normal/exp dist sampling t…
rnowling Jan 8, 2015
5a1b7a9
[SPARK-4048] Enhance and extend hadoop-provided profile.
Jan 9, 2015
013e031
[SPARK-5122] Remove Shark from spark-ec2
nchammas Jan 9, 2015
8a95a3e
[SPARK-5169][YARN]fetch the correct max attempts
WangTaoTheTonic Jan 9, 2015
82f1259
[Minor] Fix test RetryingBlockFetcherSuite after changed config name
aarondav Jan 9, 2015
2f2b837
SPARK-5136 [DOCS] Improve documentation around setting up Spark Intel…
srowen Jan 9, 2015
37fea2d
HOTFIX: Minor improvements to make-distribution.sh
pwendell Jan 9, 2015
0a3aa5f
[SPARK-1143] Separate pool tests into their own suite.
kayousterhout Jan 9, 2015
d2a450c
[SPARK-5145][Mllib] Add BLAS.dsyr and use it in GaussianMixtureEM
viirya Jan 9, 2015
831a0d2
[SPARK-3619] Upgrade to Mesos 0.21 to work around MESOS-1688
jongyoul Jan 9, 2015
40d8a94
[SPARK-5015] [mllib] Random seed for GMM + make test suite deterministic
jkbradley Jan 9, 2015
7884948
[SPARK-1953][YARN]yarn client mode Application Master memory size is …
WangTaoTheTonic Jan 9, 2015
a4f1946
[SPARK-4737] Task set manager properly handles serialization errors
mccheah Jan 9, 2015
30f7f17
[DOC] Fixed Mesos version in doc from 0.18.1 to 0.21.0
sarutak Jan 9, 2015
a675d98
[Minor] Fix import order and other coding style
Jan 9, 2015
37a27b4
[SPARK-4990][Deploy]to find default properties file, search SPARK_CON…
WangTaoTheTonic Jan 10, 2015
0a9c325
[SPARK-4406] [MLib] FIX: Validate k in SVD
MechCoder Jan 10, 2015
29534b6
[SPARK-5141][SQL]CaseInsensitiveMap throws java.io.NotSerializableExc…
luogankun Jan 10, 2015
5d2bb0f
[SPARK-4925][SQL] Publish Spark SQL hive-thriftserver maven artifact
alexoss68 Jan 10, 2015
cf5686b
[SPARK-4943][SQL] Allow table name having dot for db/catalog
alexoss68 Jan 10, 2015
37a7955
[SPARK-4574][SQL] Adding support for defining schema in foreign DDL c…
scwf Jan 10, 2015
94b489f
[SPARK-4861][SQL] Refactory command in spark sql
scwf Jan 10, 2015
447f643
SPARK-4963 [SQL] Add copy to SQL's Sample operator
Jan 10, 2015
63729e1
[SPARK-5187][SQL] Fix caching of tables with HiveUDFs in the WHERE cl…
marmbrus Jan 10, 2015
6687ee8
[SPARK-4692] [SQL] Support ! boolean logic operator like NOT
YanTangZhai Jan 10, 2015
dbbd5f5
[SPARK-5181] do not print writing WAL log when WAL is disabled
CodingCat Jan 10, 2015
04da703
[Minor]Resolve sbt warnings during build (MQTTStreamSuite.scala).
witgo Jan 10, 2015
c9b4a7d
[SPARK-4871][SQL] Show sql statement in spark ui when run sql with sp…
scwf Jan 11, 2015
afaab62
[SPARK-5029][SQL] Enable from follow multiple brackets
scwf Jan 11, 2015
865988e
[SPARK-5032] [graphx] Remove GraphX MIMA exclude for 1.3
jkbradley Jan 11, 2015
c9f4166
[SPARK-5073] spark.storage.memoryMapThreshold have two default value
Lewuathe Jan 11, 2015
18823b9
[SPARK-4951][Core] Fix the issue that a busy executor may be killed
zsxwing Jan 12, 2015
29f6893
[SPARK-4033][Examples]Input of the SparkPi too big causes the emptio…
SaintBacchus Jan 12, 2015
f2f69fc
SPARK-5018 [MLlib] [WIP] Make MultivariateGaussian public
tgaloppo Jan 12, 2015
483114c
[SPARK-5200] Disable web UI in Hive ThriftServer tests
JoshRosen Jan 12, 2015
b0b8dd8
[SPARK-5102][Core]subclass of MapStatus needs to be registered with Kryo
lianhuiwang Jan 12, 2015
bb65b1b
SPARK-4159 [BUILD] Addendum: improve running of single test after ena…
srowen Jan 12, 2015
5b35b48
[SPARK-5078] Optionally read from SPARK_LOCAL_HOSTNAME
marmbrus Jan 12, 2015
c8723ed
SPARK-5172 [BUILD] spark-examples-***.jar shades a wrong Hadoop distr…
srowen Jan 12, 2015
a9a7910
[SPARK-4999][Streaming] Change storeInBlockManager to false by default
jerryshao Jan 12, 2015
22251f0
[SPARK-5049][SQL] Fix ordering of partition columns in ParquetTableScan
marmbrus Jan 12, 2015
1f45125
[SPARK-5138][SQL] Ensure schema can be inferred from a namedtuple
mulby Jan 13, 2015
6ddb11c
[SPARK-5006][Deploy]spark.port.maxRetries doesn't work
WangTaoTheTonic Jan 13, 2015
0b62aef
[SPARK-4697][YARN]System properties should override environment varia…
WangTaoTheTonic Jan 13, 2015
6b269b0
[SPARK-5131][Streaming][DOC]: There is a discrepancy in WAL implement…
uncleGen Jan 13, 2015
b883fc6
[SPARK-5223] [MLlib] [PySpark] fix MapConverter and ListConverter in …
Jan 13, 2015
b4130e8
[SPARK-4912][SQL] Persistent tables for the Spark SQL data sources api
yhuai Jan 13, 2015
adaee02
[SPARK-5168] Make SQLConf a field rather than mixin in SQLContext
rxin Jan 13, 2015
e804588
[SPARK-5123][SQL] Reconcile Java/Scala API for data types.
rxin Jan 14, 2015
d860ab4
[SPARK-5167][SQL] Move Row into sql package and make it usable for Java.
rxin Jan 14, 2015
f92e15e
[SPARK-5248] [SQL] move sql.types.decimal.Decimal to sql.types.Decimal
adrian-wang Jan 14, 2015
85a7b47
[SPARK-5211][SQL]Restore HiveMetastoreTypes.toDataType
yhuai Jan 14, 2015
5ef37e9
[SQL] some comments fix for GROUPING SETS
adrian-wang Jan 14, 2015
afaa960
[SPARK-2909] [MLlib] [PySpark] SparseVector in pyspark now supports i…
MechCoder Jan 14, 2015
3ebb743
[SPARK-5228][WebUI] Hide tables for "Active Jobs/Completed Jobs/Faile…
sarutak Jan 14, 2015
7d972df
[SPARK-4014] Add TaskContext.attemptNumber and deprecate TaskContext.…
JoshRosen Jan 14, 2015
4aeaa2b
[SPARK-5235] Make SQLConf Serializable
alexbaretta Jan 14, 2015
3d7ff1f
[SPARK-5234][ml]examples for ml don't have sparkContext.stop
Jan 14, 2015
efe78ab
[SPARK-5254][MLLIB] Update the user guide to position spark.ml better
mengxr Jan 15, 2015
2a5076d
[SPARK-5193][SQL] Tighten up SQLContext API
rxin Jan 15, 2015
721abc3
[SPARK-5254][MLLIB] remove developers section from spark.ml guide
mengxr Jan 15, 2015
e95fb7a
[SPARK-5193][SQL] Tighten up HiveContext API
rxin Jan 15, 2015
ab895e1
[SPARK-5224] [PySpark] improve performance of parallelize list/ndarray
Jan 15, 2015
f08c4f3
[SPARK-5274][SQL] Reconcile Java and Scala UDFRegistration.
rxin Jan 16, 2015
2340976
[Minor] Fix tiny typo in BlockManager
sarutak Jan 16, 2015
a88bb28
[SPARK-4857] [CORE] Adds Executor membership events to SparkListener
Jan 16, 2015
ce15195
[SPARK-4092] [CORE] Fix InputMetrics for coalesce'd Rdds
Jan 16, 2015
eac9c7c
[SPARK-1507][YARN]specify # cores for ApplicationMaster
WangTaoTheTonic Jan 16, 2015
0eae5ec
[SPARK-5201][CORE] deal with int overflow in the ParallelCollectionRD…
advancedxy Jan 16, 2015
51a4e1e
[DOCS] Fix typo in return type of cogroup
srowen Jan 16, 2015
98840b6
[SPARK-5231][WebUI] History Server shows wrong job submission time.
sarutak Jan 16, 2015
e130d82
[WebUI] Fix collapse of WebUI layout
sarutak Jan 16, 2015
b38ecc8
[SPARK-4923][REPL] Add Developer API to REPL to allow re-publishing t…
Jan 16, 2015
be38374
[SPARK-733] Add documentation on use of accumulators in lazy transfor…
Jan 16, 2015
55ade8d
[SPARK-4937][SQL] Adding optimization to simplify the And, Or condit…
scwf Jan 16, 2015
29e3f73
[SPARK-5193][SQL] Remove Spark SQL Java-specific API.
rxin Jan 17, 2015
5329804
[SQL][minor] Improved Row documentation.
rxin Jan 17, 2015
02d799e
[SPARK-4937][SQL] Comment for the newly optimization rules in `Boolea…
scwf Jan 17, 2015
fb918f6
[SPARK-5096] Use sbt tasks instead of vals to get hadoop version
marmbrus Jan 18, 2015
f4f646d
[SQL][Minor] Added comments and examples to explain BooleanSimplifica…
rxin Jan 18, 2015
c229be5
[HOTFIX]: Minor clean up regarding skipped artifacts in build files.
pwendell Jan 18, 2015
e56d0a3
[SPARK-5279][SQL] Use java.math.BigDecimal as the exposed Decimal type.
rxin Jan 18, 2015
cfe78ce
[SQL][Minor] Update sql doc according to data type APIs changes
scwf Jan 18, 2015
b2ee0a3
[SQL][minor] Put DataTypes.java in java dir.
rxin Jan 19, 2015
3b640c9
[SQL] fix typo in class description
Jan 19, 2015
fdf83c5
SPARK-5217 Spark UI should report pending stages during job execution…
ScrapCodes Jan 19, 2015
186d044
[SPARK-3288] All fields in TaskMetrics should be private and use gett…
Jan 19, 2015
51eed7e
[SPARK-5088] Use spark-class for running executors directly
jongyoul Jan 19, 2015
0d2eca2
[SPARK-5282][mllib]: RowMatrix easily gets int overflow in the memory…
hhbyyh Jan 19, 2015
13be6ee
[SPARK-5284][SQL] Insert into Hive throws NPE when a inner complex ty…
yhuai Jan 19, 2015
a9ed74c
[SPARK-5286][SQL] Fail to drop an invalid table when using the data s…
yhuai Jan 19, 2015
c5fca46
[SPARK-4504][Examples] fix run-example failure if multiple assembly j…
gvramana Jan 19, 2015
7dc4feb
[SPARK-5214][Core] Add EventLoop and change DAGScheduler to an EventLoop
zsxwing Jan 20, 2015
cfdab68
SPARK-5270 [CORE] Provide isEmpty() function in RDD API
srowen Jan 20, 2015
80066e1
[SQL][minor] Add a log4j file for catalyst test.
rxin Jan 20, 2015
60ca928
[SPARK-4803] [streaming] Remove duplicate RegisterReceiver message
ilayaperumalg Jan 20, 2015
3e1b1de
[SPARK-5333][Mesos] MesosTaskLaunchData occurs BufferUnderflowException
jongyoul Jan 20, 2015
ec4c11a
[SQL][Minor] Refactors deeply nested FP style code in BooleanSimplifi…
liancheng Jan 20, 2015
5ed73b8
SPARK-4660: Use correct class loader in JavaSerializer (copy of PR #3…
jacek-lewandowski Jan 20, 2015
bc22142
[SPARK-5329][WebUI] UIWorkloadGenerator should stop SparkContext.
sarutak Jan 20, 2015
97b4e88
SPARK-5019 [MLlib] - GaussianMixtureModel exposes instances of Multiv…
tgaloppo Jan 20, 2015
d38ac49
[SPARK-5287][SQL] Add defaultSizeOf to every data type.
yhuai Jan 20, 2015
a3f50b7
[SPARK-5323][SQL] Remove Row's Seq inheritance.
rxin Jan 20, 2015
d65133a
[SPARK-5186] [MLLIB] Vector.equals and Vector.hashCode are very inef…
hhbyyh Jan 20, 2015
ae39101
[SPARK-5294][WebUI] Hide tables in AllStagePages for "Active Stages, …
sarutak Jan 21, 2015
3613b8c
[SPARK-5275] [Streaming] include python source code
Jan 21, 2015
4d94974
[HOTFIX] Update pom.xml to pull MapR's Hadoop version 2.4.1.
rkannan82 Jan 21, 2015
241fdf2
[SPARK-5297][Streaming] Fix Java file stream type erasure problem
jerryshao Jan 21, 2015
26c5be7
[SPARK-5336][YARN]spark.executor.cores must not be less than spark.ta…
WangTaoTheTonic Jan 21, 2015
9b0f51d
SPARK-1714. Take advantage of AMRMClient APIs to simplify logic in Ya…
sryza Jan 21, 2015
ede0680
[MLlib] [SPARK-5301] Missing conversions and operations on IndexedRow…
Jan 21, 2015
2ef2e21
[SPARK-4749] [mllib]: Allow initializing KMeans clusters using a seed
str-janus Jan 21, 2015
5b72d30
[SPARK-5064][GraphX] Add numEdges upperbound validation for R-MAT gra…
Jan 21, 2015
7f04f9a
[SPARK-5244] [SQL] add coalesce() in sql parser
adrian-wang Jan 21, 2015
5e6aa8a
[SPARK-5009] [SQL] Long keyword support in SQL Parsers
chenghao-intel Jan 21, 2015
294ed8f
Revert "[SPARK-5244] [SQL] add coalesce() in sql parser"
JoshRosen Jan 21, 2015
979fe78
[SQL] [Minor] Remove deprecated parquet tests
liancheng Jan 21, 2015
b2cca4c
[SPARK-4984][CORE][WEBUI] Adding a pop-up containing the full job des…
scwf Jan 21, 2015
943055b
[SPARK-5355] make SparkConf thread-safe
Jan 22, 2015
c2179b0
[SPARK-5202] [SQL] Add hql variable substitution support
chenghao-intel Jan 22, 2015
08547ac
[SPARK-3424][MLLIB] cache point distances during k-means|| init
mengxr Jan 22, 2015
95f4a8b
[SPARK-5317]Set BoostingStrategy.defaultParams With Enumeration Algo.…
Peishen-Jia Jan 22, 2015
5b5a1ef
[SPARK-5147][Streaming] Delete the received data WAL log periodically
tdas Jan 22, 2015
2524228
[SPARK-5365][MLlib] Refactor KMeans to reduce redundant data
viirya Jan 22, 2015
9598ede
SPARK-5370. [YARN] Remove some unnecessary synchronization in YarnAll…
sryza Jan 22, 2015
917b7ae
[SPARK-5233][Streaming] Fix error replaying of WAL introduced bug
jerryshao Jan 23, 2015
24f2bdc
[SPARK-5315][Streaming] Fix reduceByWindow Java API not work bug
jerryshao Jan 23, 2015
812e345
[SPARK-3541][MLLIB] New ALS implementation with improved storage
mengxr Jan 23, 2015
7781750
[SPARK-5063] More helpful error messages for several invalid operations
JoshRosen Jan 24, 2015
9a057b7
[SPARK-5351][GraphX] Do not use Partitioner.defaultPartitioner as a p…
maropu Jan 24, 2015
c759171
[SPARK-5058] Part 2. Typos and broken URL
jongyoul Jan 24, 2015
49889ea
[SPARK-5214][Test] Add a test to demonstrate EventLoop can be stopped…
zsxwing Jan 24, 2015
1952c91
Add comment about defaultMinPartitions
idanz Jan 25, 2015
6a2ba7d
Update the statsd parser to expect these new signs in the format, whi…
airhorns Jan 25, 2015
dfa8c42
Added support for HA namenode
angelini Feb 4, 2015
c6ac972
Merge pull request #48 from Shopify/support_ha_nn
angelini Feb 4, 2015
a970f6e
Merge pull request #47 from Shopify/orens_problematic_bump
orenmazor Feb 9, 2015
03aa82b
Revert "Merge pull request #47 from Shopify/orens_problematic_bump"
orenmazor Feb 9, 2015
0d30b5f
use release path for uploading jar, not current
orenmazor Feb 9, 2015
aba41d2
Merge pull request #49 from Shopify/use_releases_path
orenmazor Feb 9, 2015
f37de6c
Revert "Revert "Merge pull request #47 from Shopify/orens_problematic…
orenmazor Feb 9, 2015
b089305
Revert "Revert "Revert "Merge pull request #47 from Shopify/orens_pro…
orenmazor Feb 9, 2015
133bfed
build local spark if one does not already exist
orenmazor Feb 10, 2015
d6027fb
make the modified binary path match the existing schema for production
orenmazor Feb 10, 2015
15652c8
DRY up the local sha value
orenmazor Feb 10, 2015
558d3a1
Merge pull request #51 from Shopify/fix_test_spark_jar
orenmazor Feb 12, 2015
bd71ee3
Keep more INFO level spark logs around
airhorns Feb 20, 2015
2a9d540
Revert "Merge pull request #23 from Shopify/profile-refactor"
airhorns Feb 24, 2015
8095336
Merge remote-tracking branch 'apache/master'
airhorns Feb 24, 2015
48711b3
Update the spark-defaults.conf to be closer to Starscream
airhorns Feb 24, 2015
8cbf07a
deploy to reportify-etl4.chi
snormore Feb 24, 2015
fad7111
Fix CI detection in script/setup
airhorns Feb 24, 2015
b28f99b
Merge pull request #53 from Shopify/deploy-reportify-etl4
snormore Feb 24, 2015
17070be
Merge remote-tracking branch 'apache/master'
airhorns Feb 26, 2015
9eda8df
Merge remote-tracking branch 'apache/master'
airhorns Mar 5, 2015
6ca71c0
Reset cap file to not include any stuff for testing jars so it works
airhorns Mar 9, 2015
bd4e2c9
Rid ourselves of the doubledot statsd metrics
airhorns Mar 9, 2015
aa0b682
Merge remote-tracking branch 'apache/master'
airhorns Mar 9, 2015
0874b1c
Catch and properly scrub fenced driver statsd logs
airhorns Mar 10, 2015
14059a5
Merge remote-tracking branch 'apache/master'
angelini Mar 10, 2015
0ae47d3
Remove old broken nodes from Capfile
angelini Mar 10, 2015
f0e2c23
Remove platfora2 from Capfile
angelini Mar 10, 2015
0694e4b
Merge remote-tracking branch 'apache/master'
airhorns Mar 23, 2015
91dbbbd
Fix Statsd sink
yagnik Mar 26, 2015
566b871
Merge pull request #54 from Shopify/Statsd-executor-sink
yagnik Mar 27, 2015
7bf9920
Merge remote-tracking branch 'apache/master'
airhorns Mar 27, 2015
0efbd34
Update script/setup to use the built in mvn package so that zinc is u…
airhorns Mar 31, 2015
973ec0e
Update script/get config to replace the entire stanza for the topolog…
airhorns Mar 31, 2015
4bc6653
Merge remote-tracking branch 'apache/master'
airhorns Apr 1, 2015
5413b9c
add jdk1.6 repackage support for jdk1.7 build
zhzhan Apr 22, 2015
2e1638d
Compile and then repackage with java 6 on packserv
airhorns Apr 25, 2015
df4da60
Merge pull request #56 from Shopify/package_with_java_6
airhorns Apr 27, 2015
263178b
Merge remote-tracking branch 'apache/master'
airhorns Apr 27, 2015
3d7ca92
Make make-distribution.sh output verbose
airhorns Apr 27, 2015
78a7029
Revert "Merge remote-tracking branch 'apache/master'"
airhorns Apr 27, 2015
344df1d
Make packserv fix its perms issues until we figure out why they are b…
airhorns Apr 27, 2015
225e7be
Try removing hacky and wrong permissions after packserv land fix
airhorns Apr 28, 2015
cdd5f5b
Revert "Revert "Merge remote-tracking branch 'apache/master'""
airhorns May 4, 2015
bf0d0b9
Merge remote-tracking branch 'apache/master'
airhorns May 4, 2015
8c9f124
Bump to Hadoop 2.6.0 for CDH 5.4
airhorns May 14, 2015
c23e838
Merge remote-tracking branch 'apache/master'
airhorns May 20, 2015
90314d7
Merge remote-tracking branch 'apache/master' into packserv
airhorns May 20, 2015
a2179de
Merge remote-tracking branch 'apache/master' into packserv
airhorns May 28, 2015
fb7e39e
Make script/setup use the OS X Java 1.6 jar tool, and clean before it…
airhorns May 28, 2015
af7fa68
Move the shade plugin above the ant-run plugin for pyspark so that wh…
airhorns May 28, 2015
3ef03ea
Merge remote-tracking branch 'apache/master' into packserv
airhorns Jun 1, 2015
8d2dc5a
Don't set spark-defaults.conf any more so that Starscream and the lik…
airhorns Jun 1, 2015
b68a83b
Neuter spark-env.sh to not include the YARN conf from the cluster unl…
airhorns Jun 2, 2015
85db5cc
Do not shrink shuffle batch size
angelini Jun 9, 2015
044b478
Merge pull request #57 from Shopify/do_not_shrink_shuffle_batch_size
angelini Jun 9, 2015
0292ed3
Merge remote-tracking branch 'apache/master' into bump_june_16
airhorns Jun 16, 2015
dafce1b
Use the built in libexec helpers for OS X Java homes to find java 6
airhorns Jun 16, 2015
cfc39d6
Revert "Merge pull request #56 from Shopify/package_with_java_6"
airhorns Jun 19, 2015
c5586de
Stop repackaging with java 6 now that Pyspark makes it to yarn as it'…
airhorns Jun 19, 2015
fe80af6
Upload py4j and pyspark as zips into the sparkles directory for inclu…
airhorns Jun 19, 2015
0bc709d
Build new spark with Java 1.8
airhorns Jun 19, 2015
8d43a5f
Revert "Merge pull request #57 from Shopify/do_not_shrink_shuffle_bat…
airhorns Jun 23, 2015
ef52210
Merge remote-tracking branch 'apache/master' into bump_june_16
airhorns Jun 23, 2015
c4e7b2a
add hadoop-misc4 to spark deploy list
orenmazor Jul 7, 2015
f612ccd
Merge pull request #58 from Shopify/add_hadoop_misc_to_deploy_targets
orenmazor Jul 7, 2015
025df45
Merge branch 'master' of github.com:apache/spark
kevincox Jul 21, 2015
217263f
Merge branch 'master' of github.com:apache/spark
kevincox Jul 23, 2015
a6fb8aa
change file
Jul 28, 2015
afd862c
Merge branch 'master' of github.com:apache/spark into 2015-07-28
kevincox Jul 28, 2015
d6d7f2e
Merge pull request #59 from Shopify/2015-07-28
kevincox Jul 28, 2015
0f0e0ef
Upgrade pyrolite to 4.9
angelini Aug 4, 2015
9c18437
Merge pull request #60 from Shopify/upgrade_pyrolite
angelini Aug 5, 2015
385fceb
Merge branch 'negativeExecutor1' of github.com:KaiXinXiaoLei/spark in…
kevincox Aug 10, 2015
3 changes: 3 additions & 0 deletions .gitignore
@@ -62,6 +62,9 @@ unit-tests.log
ec2/lib/
rat-results.txt
scalastyle.txt
conf/spark-defaults.conf.bak
conf/*.conf
conf/conf.cloudera.yarn
scalastyle-output.xml
R-unit-tests.log
R/unit-tests.out
69 changes: 69 additions & 0 deletions Capfile
@@ -0,0 +1,69 @@
require 'bundler/setup'
require 'capistrano_recipes/deploy/packserv'

set :application, "spark"
set :user, "deploy"
set :shared_work_path, "/u/apps/spark/shared/work"
set :shared_logs_path, "/u/apps/spark/shared/log"
set :shared_conf_path, "/u/apps/spark/shared/conf"
set :spark_jar_path, "hdfs://hadoop-production/user/sparkles"
set :gateway, nil
set :keep_releases, 5
set :branch, fetch(:branch, `git symbolic-ref --short HEAD`.gsub("\s",""))

DATANODES = (2..48).map {|i| "dn%02d.chi.shopify.com" % i }
OTHERNODES = ["hadoop-etl1.chi.shopify.com", "hadoop-misc4.chi.shopify.com", "spark-etl1.chi.shopify.com", "reportify-etl4.chi.shopify.com"]
BROKEN = [] # Nodes that are down; don't try to send code to them

task :production do
  role :app, *(DATANODES + OTHERNODES - BROKEN)
  role :history, "hadoop-rm.chi.shopify.com"
  role :uploader, "spark-etl1.chi.shopify.com"
end

namespace :deploy do
  task :cleanup do
    count = fetch(:keep_releases, 5).to_i
    run "ls -1dt /u/apps/spark/releases/* | tail -n +#{count + 1} | xargs rm -rf"
  end

  task :upload_to_hdfs, :roles => :uploader, :on_no_matching_servers => :continue do
    run "hdfs dfs -copyFromLocal -f #{release_path}/lib/spark-assembly-*.jar #{fetch(:spark_jar_path)}/spark-assembly-#{fetch(:sha)}.jar"
    run "hdfs dfs -copyFromLocal -f #{release_path}/python/lib/pyspark.zip #{fetch(:spark_jar_path)}/pyspark-#{fetch(:sha)}.zip"
    run "hdfs dfs -copyFromLocal -f #{release_path}/python/lib/py4j-*.zip #{fetch(:spark_jar_path)}/py4j-#{fetch(:sha)}.zip"
  end

  task :prevent_gateway do
    set :gateway, nil
  end

  task :symlink_shared do
    run "ln -nfs #{shared_work_path} #{release_path}/work"
    run "ln -nfs #{shared_logs_path} #{release_path}/logs"
    run "rm -rf #{release_path}/conf && ln -nfs #{shared_conf_path} #{release_path}/conf"
  end

  task :remind_us_to_update_starscream do
    puts "****************************************************************"
    puts "*"
    puts "* Remember to update starscream/conf/config.yml"
    puts "*"
    puts "* spark_production"
    puts "* conf_options:"
    puts "* <<: *spark_remote"
    puts "* spark.yarn.jar: \"#{fetch(:spark_jar_path)}/spark-assembly-\033[31m#{fetch(:sha)}\033[0m.jar\""
    puts "*"
    puts "****************************************************************"
  end

  task :restart do
  end

  after 'deploy:initialize_variables', 'deploy:prevent_gateway' # capistrano recipes packserv deploy always uses a gateway
  before 'deploy:symlink_current', 'deploy:symlink_shared'
  before 'deploy:test_spark_jar', 'deploy:initialize_variables'
  before 'deploy:upload_to_hdfs', 'deploy:initialize_variables'
  after 'deploy:unpack', 'deploy:upload_to_hdfs'
  after 'deploy:restart', 'deploy:cleanup'
  after 'deploy:cleanup', 'deploy:remind_us_to_update_starscream'
end
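
For reference, a deploy driven by this Capfile would typically be kicked off as below — a sketch, assuming the stock Capistrano 2 CLI, where `-S branch=...` overrides the `branch` variable before the `git symbolic-ref` default kicks in:

    bundle exec cap production deploy
    bundle exec cap production deploy -S branch=my-feature-branch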
7 changes: 7 additions & 0 deletions Gemfile
@@ -0,0 +1,7 @@
# A sample Gemfile
source "https://rubygems.org"

group :deploy do
  gem 'capistrano', '~> 2'
  gem 'capistrano-recipes', git: "git@github.com:Shopify/capistrano-recipes", ref: '57bd4ed4accc5561d4774ec2f072bb71bd1b2ea7'
end
34 changes: 34 additions & 0 deletions Gemfile.lock
@@ -0,0 +1,34 @@
GIT
  remote: git@github.com:Shopify/capistrano-recipes
  revision: 57bd4ed4accc5561d4774ec2f072bb71bd1b2ea7
  ref: 57bd4ed4accc5561d4774ec2f072bb71bd1b2ea7
  specs:
    capistrano-recipes (1.1.0)
      capistrano (~> 2.15.5)
      json (>= 1.8.1)

GEM
  remote: https://rubygems.org/
  specs:
    capistrano (2.15.5)
      highline
      net-scp (>= 1.0.0)
      net-sftp (>= 2.0.0)
      net-ssh (>= 2.0.14)
      net-ssh-gateway (>= 1.1.0)
    highline (1.6.21)
    json (1.8.1)
    net-scp (1.1.2)
      net-ssh (>= 2.6.5)
    net-sftp (2.1.2)
      net-ssh (>= 2.6.5)
    net-ssh (2.8.0)
    net-ssh-gateway (1.2.0)
      net-ssh (>= 2.6.5)

PLATFORMS
  ruby

DEPENDENCIES
  capistrano (~> 2)
  capistrano-recipes!
93 changes: 10 additions & 83 deletions README.md
@@ -1,14 +1,9 @@
# Apache Spark
# Shopify's Apache Spark

Spark is a fast and general cluster computing system for Big Data. It provides
high-level APIs in Scala, Java, and Python, and an optimized engine that
supports general computation graphs for data analysis. It also supports a
rich set of higher-level tools including Spark SQL for SQL and DataFrames,
MLlib for machine learning, GraphX for graph processing,
and Spark Streaming for stream processing.

<http://spark.apache.org/>
Spark is a fast and general cluster computing system for Big Data.

This is Shopify's clone, carrying Shopify-specific customizations, mostly
surrounding configuration.

## Online Documentation

@@ -17,82 +12,14 @@ guide, on the [project web page](http://spark.apache.org/documentation.html)
and [project wiki](https://cwiki.apache.org/confluence/display/SPARK).
This README file only contains basic setup instructions.

## Building Spark

Spark is built using [Apache Maven](http://maven.apache.org/).
To build Spark and its example programs, run:

build/mvn -DskipTests clean package

(You do not need to do this if you downloaded a pre-built package.)
More detailed documentation is available from the project site, at
["Building Spark"](http://spark.apache.org/docs/latest/building-spark.html).

## Interactive Scala Shell

The easiest way to start using Spark is through the Scala shell:

./bin/spark-shell

Try the following command, which should return 1000:

scala> sc.parallelize(1 to 1000).count()

## Interactive Python Shell

Alternatively, if you prefer Python, you can use the Python shell:

./bin/pyspark

And run the following command, which should also return 1000:

>>> sc.parallelize(range(1000)).count()

## Example Programs

Spark also comes with several sample programs in the `examples` directory.
To run one of them, use `./bin/run-example <class> [params]`. For example:

./bin/run-example SparkPi

will run the Pi example locally.

You can set the MASTER environment variable when running examples to submit
examples to a cluster. This can be a mesos:// or spark:// URL,
"yarn-cluster" or "yarn-client" to run on YARN, and "local" to run
locally with one thread, or "local[N]" to run locally with N threads. You
can also use an abbreviated class name if the class is in the `examples`
package. For instance:

MASTER=spark://host:7077 ./bin/run-example SparkPi

Many of the example programs print usage help if no params are given.

## Running Tests

Testing first requires [building Spark](#building-spark). Once Spark is built, tests
can be run using:

./dev/run-tests

Please see the guidance on how to
[run tests for a module, or individual tests](https://cwiki.apache.org/confluence/display/SPARK/Useful+Developer+Tools).
## Building Shopify Spark

## A Note About Hadoop Versions
You can build Shopify Spark using `script/setup`, or continuously and incrementally using `script/watch`.

Spark uses the Hadoop core library to talk to HDFS and other Hadoop-supported
storage systems. Because the protocols have changed in different versions of
Hadoop, you must build Spark against the same version that your cluster runs.
## Testing Shopify Spark

Please refer to the build documentation at
["Specifying the Hadoop Version"](http://spark.apache.org/docs/latest/building-spark.html#specifying-the-hadoop-version)
for detailed guidance on building for a particular distribution of Hadoop, including
building for particular Hive and Hive Thriftserver distributions. See also
["Third Party Hadoop Distributions"](http://spark.apache.org/docs/latest/hadoop-third-party-distributions.html)
for guidance on building a Spark application that works with a particular
distribution.
To test a Shopify Spark build, assemble the Spark jar with `script/setup` or Maven, then unset the `spark.yarn.jar` property in `defaults.conf` or in the configuration of the application you are using. Spark will then upload your local assembly to your YARN application's staging directory; no deploy is involved.
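
For example, with `spark.yarn.jar` unset, a hypothetical test run against YARN looks like this (`yarn-client` and the bundled Pi example are standard Spark; the rest of your application's config is assumed):

    ./bin/spark-submit --master yarn-client examples/src/main/python/pi.py 10

With no preassembled jar configured, Spark should ship the locally built assembly to the application's staging directory on its own.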

## Configuration
## Deploying Shopify Spark

Please refer to the [Configuration guide](http://spark.apache.org/docs/latest/configuration.html)
in the online documentation for an overview on how to configure Spark.
The cap deploy script is only for deploying Shopify Spark to production. To deploy, execute `bundle exec cap production deploy`.
1 change: 1 addition & 0 deletions SHOPIFY_HADOOP_OPTIONS
@@ -0,0 +1 @@
-Phadoop-2.4 -Dhadoop.version=2.6.0 -Pyarn -Phive
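
These flags are presumably consumed by `script/setup`; a minimal manual build using the same options might look like:

    build/mvn $(cat SHOPIFY_HADOOP_OPTIONS) -DskipTests clean package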
42 changes: 21 additions & 21 deletions assembly/pom.xml
@@ -92,27 +92,6 @@
          <skip>true</skip>
        </configuration>
      </plugin>
      <!-- zip pyspark archives to run python application on yarn mode -->
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-antrun-plugin</artifactId>
        <executions>
          <execution>
            <phase>package</phase>
            <goals>
              <goal>run</goal>
            </goals>
          </execution>
        </executions>
        <configuration>
          <target>
            <delete dir="${basedir}/../python/lib/pyspark.zip"/>
            <zip destfile="${basedir}/../python/lib/pyspark.zip">
              <fileset dir="${basedir}/../python/" includes="pyspark/**/*"/>
            </zip>
          </target>
        </configuration>
      </plugin>
      <!-- Use the shade plugin to create a big JAR with all the dependencies -->
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
@@ -162,6 +141,27 @@
          </execution>
        </executions>
      </plugin>
      <!-- zip pyspark archives to run python application on yarn mode -->
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-antrun-plugin</artifactId>
        <executions>
          <execution>
            <phase>package</phase>
            <goals>
              <goal>run</goal>
            </goals>
          </execution>
        </executions>
        <configuration>
          <target>
            <delete dir="${basedir}/../python/lib/pyspark.zip"/>
            <zip destfile="${basedir}/../python/lib/pyspark.zip">
              <fileset dir="${basedir}/../python/" includes="pyspark/**/*"/>
            </zip>
          </target>
        </configuration>
      </plugin>
    </plugins>
  </build>

1 change: 1 addition & 0 deletions conf/java-opts
@@ -0,0 +1 @@
-Djava.security.krb5.realm= -Djava.security.krb5.kdc= -Djava.security.krb5.conf=/dev/null
22 changes: 22 additions & 0 deletions conf/log4j.properties
@@ -0,0 +1,22 @@
# Root logger at INFO, routed to both the console and file appenders
log4j.rootCategory=INFO, console, file

log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.Threshold=WARN
log4j.appender.console.target=System.err
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n

# Settings for the file appender, which captures more verbose output
log4j.appender.file=org.apache.log4j.RollingFileAppender
log4j.appender.file.File=/tmp/spark.log
log4j.appender.file.MaxFileSize=20MB
log4j.appender.file.Threshold=INFO
log4j.appender.file.MaxBackupIndex=1
log4j.appender.file.layout=org.apache.log4j.PatternLayout
log4j.appender.file.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1} %m%n

# Settings to quiet third party logs that are too verbose
log4j.logger.org.eclipse.jetty=WARN
log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO
log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO
3 changes: 3 additions & 0 deletions conf/spark-defaults.conf
@@ -0,0 +1,3 @@
# Shopify doesn't use defaults here and instead lets all the clients specify their own set of defaults.
# This way, each client can set defaults appropriate to it, as well as change those defaults based on the environment.
# Clients also then don't have to care about a set of overridden values that differs from the defaults listed in the docs.
31 changes: 31 additions & 0 deletions conf/spark-env.sh
@@ -0,0 +1,31 @@
#!/usr/bin/env bash

echoerr() { echo "$@" 1>&2; }
FWDIR="$(cd `dirname $0`/..; pwd)"

if [ "$(uname)" == "Darwin" ]; then
  case "$PYTHON_ENV" in
    'remote_development')
      echoerr "Sparkify: Connecting to chicago spark cluster ..."
      # Figure out the local IP to bind spark to for shell <-> master communication
      vpn_interface=tap0
      get_ip_command="ifconfig $vpn_interface 2>&1 | grep 'inet' | awk '{print \$2}'"
      if ifconfig $vpn_interface > /dev/null 2>&1; then
        export SPARK_LOCAL_IP=`bash -c "$get_ip_command"`
      else
        echoerr "ERROR: could not find a VPN interface to connect to the Shopify Spark Cluster! Please connect your VPN client! See https://vault-unicorn.shopify.com/VPN---Servers ."
        exit 1
      fi

      export HADOOP_CONF_DIR=$FWDIR/conf/conf.cloudera.yarn
      ;;
    'test'|'development')
      export SPARK_LOCAL_IP=127.0.0.1
      ;;
  esac
fi

if which ipython > /dev/null; then
  export IPYTHON=1
fi
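
A hypothetical way to exercise the remote-development branch of this script on OS X (assuming the calling environment exports `PYTHON_ENV`, as the case statement above implies):

    PYTHON_ENV=remote_development ./bin/pyspark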
2 changes: 1 addition & 1 deletion core/pom.xml
@@ -380,7 +380,7 @@
    <dependency>
      <groupId>net.razorvine</groupId>
      <artifactId>pyrolite</artifactId>
      <version>4.4</version>
      <version>4.9</version>
      <exclusions>
        <exclusion>
          <groupId>net.razorvine</groupId>