Merged

Changes from all commits (18 commits)
f945b64
[SPARK-9869] [STREAMING] Wait for all event notifications before asse…
Sep 3, 2015
4d63335
[SPARK-10431] [CORE] Fix intermittent test failure. Wait for event qu…
Sep 3, 2015
09e08db
[SPARK-10454] [SPARK CORE] wait for empty event queue
Sep 4, 2015
dc39658
[SPARK-10311] [STREAMING] Reload appId and attemptId when app starts …
XuTingjun Sep 4, 2015
cfc5f6f
[SPARK-10402] [DOCS] [ML] Add defaults to the scaladoc for params in ml/
holdenk Sep 5, 2015
ec750a7
[SPARK-10440] [STREAMING] [DOCS] Update python API stuff in the progr…
tdas Sep 5, 2015
640000b
[SPARK-10434] [SQL] Fixes Parquet schema of arrays that may contain null
liancheng Sep 5, 2015
37c5edf
[DOC] Added R to the list of languages with "high-level API" support …
Sep 8, 2015
88a07d8
Docs small fixes
jaceklaskowski Sep 8, 2015
c3da154
Merge branch 'branch-1.5' of github.com:apache/spark into csd-1.5
markhamstra Sep 8, 2015
34d417e
[SPARK-10470] [ML] ml.IsotonicRegressionModel.copy should set parent
yanboliang Sep 8, 2015
7fd4674
[SPARK-10441] [SQL] [BRANCH-1.5] Save data correctly to json.
yhuai Sep 8, 2015
5b038e0
Merge branch 'branch-1.5' of github.com:apache/spark into csd-1.5
markhamstra Sep 8, 2015
f8c4b65
removed "candidate" from version
markhamstra Sep 8, 2015
63c72b9
[SPARK-10492] [STREAMING] [DOCUMENTATION] Update Streaming documentat…
tdas Sep 8, 2015
fca16c5
[SPARK-10301] [SPARK-10428] [SQL] [BRANCH-1.5] Fixes schema merging f…
liancheng Sep 9, 2015
d4b00c5
[SPARK-10071] [STREAMING] Output a warning when writing QueueInputDSt…
zsxwing Sep 9, 2015
77e93c1
Merge branch 'branch-1.5' of github.com:apache/spark into csd-1.5
markhamstra Sep 9, 2015
4 changes: 2 additions & 2 deletions README.md
@@ -1,7 +1,7 @@
# Apache Spark

Spark is a fast and general cluster computing system for Big Data. It provides
-high-level APIs in Scala, Java, and Python, and an optimized engine that
+high-level APIs in Scala, Java, Python, and R, and an optimized engine that
supports general computation graphs for data analysis. It also supports a
rich set of higher-level tools including Spark SQL for SQL and DataFrames,
MLlib for machine learning, GraphX for graph processing,
@@ -94,5 +94,5 @@ distribution.

## Configuration

-Please refer to the [Configuration guide](http://spark.apache.org/docs/latest/configuration.html)
+Please refer to the [Configuration Guide](http://spark.apache.org/docs/latest/configuration.html)
in the online documentation for an overview on how to configure Spark.
2 changes: 1 addition & 1 deletion assembly/pom.xml
@@ -21,7 +21,7 @@
<parent>
<groupId>org.apache.spark</groupId>
<artifactId>spark-parent_2.10</artifactId>
-<version>1.5.0-candidate-csd-1-SNAPSHOT</version>
+<version>1.5.0-csd-1-SNAPSHOT</version>
<relativePath>../pom.xml</relativePath>
</parent>

2 changes: 1 addition & 1 deletion bagel/pom.xml
@@ -21,7 +21,7 @@
<parent>
<groupId>org.apache.spark</groupId>
<artifactId>spark-parent_2.10</artifactId>
-<version>1.5.0-candidate-csd-1-SNAPSHOT</version>
+<version>1.5.0-csd-1-SNAPSHOT</version>
<relativePath>../pom.xml</relativePath>
</parent>

2 changes: 1 addition & 1 deletion core/pom.xml
@@ -21,7 +21,7 @@
<parent>
<groupId>org.apache.spark</groupId>
<artifactId>spark-parent_2.10</artifactId>
-<version>1.5.0-candidate-csd-1-SNAPSHOT</version>
+<version>1.5.0-csd-1-SNAPSHOT</version>
<relativePath>../pom.xml</relativePath>
</parent>

@@ -286,6 +286,10 @@ class InputOutputMetricsSuite extends SparkFunSuite with SharedSparkContext
private def runAndReturnMetrics(job: => Unit,
collector: (SparkListenerTaskEnd) => Option[Long]): Long = {
val taskMetrics = new ArrayBuffer[Long]()

+// Avoid receiving earlier taskEnd events
+sc.listenerBus.waitUntilEmpty(500)

sc.addSparkListener(new SparkListener() {
override def onTaskEnd(taskEnd: SparkListenerTaskEnd) {
collector(taskEnd).foreach(taskMetrics += _)
@@ -527,6 +527,7 @@ class DAGSchedulerSuite
}

// The map stage should have been submitted.
+sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS)
assert(countSubmittedMapStageAttempts() === 1)

complete(taskSets(0), Seq(
23 changes: 11 additions & 12 deletions docs/building-spark.md
@@ -61,12 +61,13 @@ If you don't run this, you may see errors like the following:
You can fix this by setting the `MAVEN_OPTS` variable as discussed before.

**Note:**
-* *For Java 8 and above this step is not required.*
-* *If using `build/mvn` and `MAVEN_OPTS` were not already set, the script will automate this for you.*

+* For Java 8 and above this step is not required.
+* If using `build/mvn` with no `MAVEN_OPTS` set, the script will automate this for you.
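If you do need to set it manually, a typical setting looks like the following (the memory values are illustrative and can be tuned to your machine):

{% highlight bash %}
export MAVEN_OPTS="-Xmx2g -XX:MaxPermSize=512M -XX:ReservedCodeCacheSize=512m"
mvn -DskipTests clean package
{% endhighlight %}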

# Specifying the Hadoop Version

-Because HDFS is not protocol-compatible across versions, if you want to read from HDFS, you'll need to build Spark against the specific HDFS version in your environment. You can do this through the "hadoop.version" property. If unset, Spark will build against Hadoop 2.2.0 by default. Note that certain build profiles are required for particular Hadoop versions:
+Because HDFS is not protocol-compatible across versions, if you want to read from HDFS, you'll need to build Spark against the specific HDFS version in your environment. You can do this through the `hadoop.version` property. If unset, Spark will build against Hadoop 2.2.0 by default. Note that certain build profiles are required for particular Hadoop versions:

<table class="table">
<thead>
@@ -91,7 +92,7 @@ mvn -Dhadoop.version=1.2.1 -Phadoop-1 -DskipTests clean package
mvn -Dhadoop.version=2.0.0-mr1-cdh4.2.0 -Phadoop-1 -DskipTests clean package
{% endhighlight %}

-You can enable the "yarn" profile and optionally set the "yarn.version" property if it is different from "hadoop.version". Spark only supports YARN versions 2.2.0 and later.
+You can enable the `yarn` profile and optionally set the `yarn.version` property if it is different from `hadoop.version`. Spark only supports YARN versions 2.2.0 and later.

Examples:

@@ -125,7 +126,7 @@ mvn -Pyarn -Phadoop-2.4 -Dhadoop.version=2.4.0 -Phive -Phive-thriftserver -Dskip
# Building for Scala 2.11
To produce a Spark package compiled with Scala 2.11, use the `-Dscala-2.11` property:

-dev/change-scala-version.sh 2.11
+./dev/change-scala-version.sh 2.11
mvn -Pyarn -Phadoop-2.4 -Dscala-2.11 -DskipTests clean package

Spark does not yet support its JDBC component for Scala 2.11.
@@ -163,11 +164,9 @@ the `spark-parent` module).

Thus, the full flow for running continuous-compilation of the `core` submodule may look more like:

-```
-$ mvn install
-$ cd core
-$ mvn scala:cc
-```
+$ mvn install
+$ cd core
+$ mvn scala:cc

# Building Spark with IntelliJ IDEA or Eclipse

@@ -193,11 +192,11 @@ then ship it over to the cluster. We are investigating the exact cause for this.

# Packaging without Hadoop Dependencies for YARN

-The assembly jar produced by `mvn package` will, by default, include all of Spark's dependencies, including Hadoop and some of its ecosystem projects. On YARN deployments, this causes multiple versions of these to appear on executor classpaths: the version packaged in the Spark assembly and the version on each node, included with yarn.application.classpath. The `hadoop-provided` profile builds the assembly without including Hadoop-ecosystem projects, like ZooKeeper and Hadoop itself.
+The assembly jar produced by `mvn package` will, by default, include all of Spark's dependencies, including Hadoop and some of its ecosystem projects. On YARN deployments, this causes multiple versions of these to appear on executor classpaths: the version packaged in the Spark assembly and the version on each node, included with `yarn.application.classpath`. The `hadoop-provided` profile builds the assembly without including Hadoop-ecosystem projects, like ZooKeeper and Hadoop itself.
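As a rough sketch, such a build might be invoked as follows (the profile and version flags besides `-Phadoop-provided` are illustrative):

{% highlight bash %}
# Hadoop and its ecosystem jars are resolved from the cluster's own classpath at runtime
mvn -Pyarn -Phadoop-provided -Phadoop-2.4 -Dhadoop.version=2.4.0 -DskipTests clean package
{% endhighlight %}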

# Building with SBT

-Maven is the official recommendation for packaging Spark, and is the "build of reference".
+Maven is the official build tool recommended for packaging Spark, and is the *build of reference*.
But SBT is supported for day-to-day development since it can provide much faster iterative
compilation. More advanced developers may wish to use SBT.
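For example, the SBT counterpart of the Maven packaging commands above might look like this (the profiles shown are illustrative; `build/sbt` understands the same profile flags as the Maven build):

{% highlight bash %}
build/sbt -Pyarn -Phadoop-2.4 assembly
{% endhighlight %}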

15 changes: 8 additions & 7 deletions docs/cluster-overview.md
@@ -5,18 +5,19 @@ title: Cluster Mode Overview

This document gives a short overview of how Spark runs on clusters, to make it easier to understand
the components involved. Read through the [application submission guide](submitting-applications.html)
-to submit applications to a cluster.
+to learn about launching applications on a cluster.

# Components

-Spark applications run as independent sets of processes on a cluster, coordinated by the SparkContext
+Spark applications run as independent sets of processes on a cluster, coordinated by the `SparkContext`
object in your main program (called the _driver program_).

Specifically, to run on a cluster, the SparkContext can connect to several types of _cluster managers_
-(either Spark's own standalone cluster manager or Mesos/YARN), which allocate resources across
+(either Spark's own standalone cluster manager, Mesos or YARN), which allocate resources across
applications. Once connected, Spark acquires *executors* on nodes in the cluster, which are
processes that run computations and store data for your application.
Next, it sends your application code (defined by JAR or Python files passed to SparkContext) to
-the executors. Finally, SparkContext sends *tasks* for the executors to run.
+the executors. Finally, SparkContext sends *tasks* to the executors to run.
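As an illustration, the cluster manager is selected through the master URL passed when launching an application (the host names, ports, and application names below are placeholders):

{% highlight bash %}
# Spark's own standalone cluster manager
./bin/spark-submit --master spark://master-host:7077 --class com.example.MyApp my-app.jar

# Mesos and YARN, respectively
./bin/spark-submit --master mesos://mesos-host:5050 --class com.example.MyApp my-app.jar
./bin/spark-submit --master yarn-cluster --class com.example.MyApp my-app.jar
{% endhighlight %}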

<p style="text-align: center;">
<img src="img/cluster-overview.png" title="Spark cluster components" alt="Spark cluster components" />
@@ -33,9 +34,9 @@ There are several useful things to note about this architecture:
2. Spark is agnostic to the underlying cluster manager. As long as it can acquire executor
processes, and these communicate with each other, it is relatively easy to run it even on a
cluster manager that also supports other applications (e.g. Mesos/YARN).
3. The driver program must listen for and accept incoming connections from its executors throughout
its lifetime (e.g., see [spark.driver.port and spark.fileserver.port in the network config
section](configuration.html#networking)). As such, the driver program must be network
addressable from the worker nodes.
4. Because the driver schedules tasks on the cluster, it should be run close to the worker
nodes, preferably on the same local area network. If you'd like to send requests to the
13 changes: 13 additions & 0 deletions docs/configuration.md
@@ -1437,6 +1437,19 @@ Apart from these, the following properties are also available, and may be useful
#### Spark Streaming
<table class="table">
<tr><th>Property Name</th><th>Default</th><th>Meaning</th></tr>
+<tr>
+<td><code>spark.streaming.backpressure.enabled</code></td>
+<td>false</td>
+<td>
+Enables or disables Spark Streaming's internal backpressure mechanism (since 1.5).
+This allows Spark Streaming to control the receiving rate based on the
+current batch scheduling delays and processing times so that the system receives
+only as fast as it can process. Internally, this dynamically sets the
+maximum receiving rate of receivers. This rate is upper bounded by the values of
+`spark.streaming.receiver.maxRate` and `spark.streaming.kafka.maxRatePerPartition`
+if they are set (see below).
+</td>
+</tr>
<tr>
<td><code>spark.streaming.blockInterval</code></td>
<td>200ms</td>
16 changes: 8 additions & 8 deletions docs/quick-start.md
@@ -126,7 +126,7 @@ scala> val wordCounts = textFile.flatMap(line => line.split(" ")).map(word => (w
wordCounts: spark.RDD[(String, Int)] = spark.ShuffledAggregatedRDD@71f027b8
{% endhighlight %}

-Here, we combined the [`flatMap`](programming-guide.html#transformations), [`map`](programming-guide.html#transformations) and [`reduceByKey`](programming-guide.html#transformations) transformations to compute the per-word counts in the file as an RDD of (String, Int) pairs. To collect the word counts in our shell, we can use the [`collect`](programming-guide.html#actions) action:
+Here, we combined the [`flatMap`](programming-guide.html#transformations), [`map`](programming-guide.html#transformations), and [`reduceByKey`](programming-guide.html#transformations) transformations to compute the per-word counts in the file as an RDD of (String, Int) pairs. To collect the word counts in our shell, we can use the [`collect`](programming-guide.html#actions) action:

{% highlight scala %}
scala> wordCounts.collect()
@@ -163,7 +163,7 @@ One common data flow pattern is MapReduce, as popularized by Hadoop. Spark can i
>>> wordCounts = textFile.flatMap(lambda line: line.split()).map(lambda word: (word, 1)).reduceByKey(lambda a, b: a+b)
{% endhighlight %}

-Here, we combined the [`flatMap`](programming-guide.html#transformations), [`map`](programming-guide.html#transformations) and [`reduceByKey`](programming-guide.html#transformations) transformations to compute the per-word counts in the file as an RDD of (string, int) pairs. To collect the word counts in our shell, we can use the [`collect`](programming-guide.html#actions) action:
+Here, we combined the [`flatMap`](programming-guide.html#transformations), [`map`](programming-guide.html#transformations), and [`reduceByKey`](programming-guide.html#transformations) transformations to compute the per-word counts in the file as an RDD of (string, int) pairs. To collect the word counts in our shell, we can use the [`collect`](programming-guide.html#actions) action:

{% highlight python %}
>>> wordCounts.collect()
@@ -217,13 +217,13 @@ a cluster, as described in the [programming guide](programming-guide.html#initia
</div>

# Self-Contained Applications
-Now say we wanted to write a self-contained application using the Spark API. We will walk through a
-simple application in both Scala (with SBT), Java (with Maven), and Python.
+Suppose we wish to write a self-contained application using the Spark API. We will walk through a
+simple application in Scala (with sbt), Java (with Maven), and Python.

<div class="codetabs">
<div data-lang="scala" markdown="1">

-We'll create a very simple Spark application in Scala. So simple, in fact, that it's
+We'll create a very simple Spark application in Scala--so simple, in fact, that it's
named `SimpleApp.scala`:

{% highlight scala %}
@@ -259,7 +259,7 @@ object which contains information about our
application.

Our application depends on the Spark API, so we'll also include an sbt configuration file,
-`simple.sbt` which explains that Spark is a dependency. This file also adds a repository that
+`simple.sbt`, which explains that Spark is a dependency. This file also adds a repository that
Spark depends on:

{% highlight scala %}
Expand Down Expand Up @@ -302,7 +302,7 @@ Lines with a: 46, Lines with b: 23

</div>
<div data-lang="java" markdown="1">
-This example will use Maven to compile an application jar, but any similar build system will work.
+This example will use Maven to compile an application JAR, but any similar build system will work.

We'll create a very simple Spark application, `SimpleApp.java`:

@@ -374,7 +374,7 @@ $ find .
Now, we can package the application using Maven and execute it with `./bin/spark-submit`.

{% highlight bash %}
-# Package a jar containing your application
+# Package a JAR containing your application
$ mvn package
...
[INFO] Building jar: {..}/{..}/target/simple-project-1.0.jar
2 changes: 0 additions & 2 deletions docs/streaming-flume-integration.md
@@ -5,8 +5,6 @@ title: Spark Streaming + Flume Integration Guide

[Apache Flume](https://flume.apache.org/) is a distributed, reliable, and available service for efficiently collecting, aggregating, and moving large amounts of log data. Here we explain how to configure Flume and Spark Streaming to receive data from Flume. There are two approaches to this.

<span class="badge" style="background-color: grey">Python API</span> Flume is not yet available in the Python API.

## Approach 1: Flume-style Push-based Approach
Flume is designed to push data between Flume agents. In this approach, Spark Streaming essentially sets up a receiver that acts as an Avro agent for Flume, to which Flume can push the data. Here are the configuration steps.

27 changes: 16 additions & 11 deletions docs/streaming-programming-guide.md
@@ -50,13 +50,7 @@ all of which are presented in this guide.
You will find tabs throughout this guide that let you choose between code snippets of
different languages.

-**Note:** Python API for Spark Streaming has been introduced in Spark 1.2. It has all the DStream
-transformations and almost all the output operations available in Scala and Java interfaces.
-However, it only has support for basic sources like text files and text data over sockets.
-APIs for additional sources, like Kafka and Flume, will be available in the future.
-Further information about available features in the Python API are mentioned throughout this
-document; look out for the tag
-<span class="badge" style="background-color: grey">Python API</span>.
+**Note:** There are a few APIs that are either different or not available in Python. Throughout this guide, you will find the tag <span class="badge" style="background-color: grey">Python API</span> highlighting these differences.

***************************************************************************************************

@@ -683,7 +677,7 @@ for Java, and [StreamingContext](api/python/pyspark.streaming.html#pyspark.strea
{:.no_toc}

<span class="badge" style="background-color: grey">Python API</span> As of Spark {{site.SPARK_VERSION_SHORT}},
-out of these sources, *only* Kafka, Flume and MQTT are available in the Python API. We will add more advanced sources in the Python API in future.
+out of these sources, Kafka, Kinesis, Flume and MQTT are available in the Python API.

This category of sources requires interfacing with external non-Spark libraries, some of them with
complex dependencies (e.g., Kafka and Flume). Hence, to minimize issues related to version conflicts
@@ -725,9 +719,9 @@ Some of these advanced sources are as follows.

- **Kafka:** Spark Streaming {{site.SPARK_VERSION_SHORT}} is compatible with Kafka 0.8.2.1. See the [Kafka Integration Guide](streaming-kafka-integration.html) for more details.

-- **Flume:** Spark Streaming {{site.SPARK_VERSION_SHORT}} is compatible with Flume 1.4.0. See the [Flume Integration Guide](streaming-flume-integration.html) for more details.
+- **Flume:** Spark Streaming {{site.SPARK_VERSION_SHORT}} is compatible with Flume 1.6.0. See the [Flume Integration Guide](streaming-flume-integration.html) for more details.

-- **Kinesis:** See the [Kinesis Integration Guide](streaming-kinesis-integration.html) for more details.
+- **Kinesis:** Spark Streaming {{site.SPARK_VERSION_SHORT}} is compatible with Kinesis Client Library 1.2.1. See the [Kinesis Integration Guide](streaming-kinesis-integration.html) for more details.

- **Twitter:** Spark Streaming's TwitterUtils uses Twitter4j 3.0.3 to get the public stream of tweets using
[Twitter's Streaming API](https://dev.twitter.com/docs/streaming-apis). Authentication information
@@ -1813,7 +1807,7 @@ To run a Spark Streaming applications, you need to have the following.
+ *Mesos* - [Marathon](https://github.com/mesosphere/marathon) has been used to achieve this
with Mesos.

-- *[Since Spark 1.2] Configuring write ahead logs* - Since Spark 1.2,
+- *Configuring write ahead logs* - Since Spark 1.2,
we have introduced _write ahead logs_ for achieving strong
fault-tolerance guarantees. If enabled, all the data received from a receiver gets written into
a write ahead log in the configuration checkpoint directory. This prevents data loss on driver
@@ -1828,6 +1822,17 @@ To run a Spark Streaming applications, you need to have the following.
stored in a replicated storage system. This can be done by setting the storage level for the
input stream to `StorageLevel.MEMORY_AND_DISK_SER`.

+- *Setting the max receiving rate* - If the cluster resources are not large enough for the streaming
+application to process data as fast as it is being received, the receivers can be rate limited
+by setting a maximum rate limit in terms of records per second.
+See the [configuration parameters](configuration.html#spark-streaming)
+`spark.streaming.receiver.maxRate` for receivers and `spark.streaming.kafka.maxRatePerPartition`
+for the Direct Kafka approach. In Spark 1.5, we have introduced a feature called *backpressure* that
+eliminates the need to set this rate limit, as Spark Streaming automatically figures out the
+rate limits and dynamically adjusts them if the processing conditions change. This backpressure
+mechanism can be enabled by setting the [configuration parameter](configuration.html#spark-streaming)
+`spark.streaming.backpressure.enabled` to `true` (see the sketch after this list).
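
To make the last two bullets concrete, here is an illustrative `spark-submit` invocation wiring these properties together (the class, jar, and rate value are placeholders, not recommendations):

{% highlight bash %}
./bin/spark-submit \
  --conf spark.streaming.receiver.writeAheadLog.enable=true \
  --conf spark.streaming.backpressure.enabled=true \
  --conf spark.streaming.receiver.maxRate=10000 \
  --class com.example.StreamingApp my-streaming-app.jar
{% endhighlight %}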

### Upgrading Application Code
{:.no_toc}

2 changes: 1 addition & 1 deletion examples/pom.xml
@@ -21,7 +21,7 @@
<parent>
<groupId>org.apache.spark</groupId>
<artifactId>spark-parent_2.10</artifactId>
-<version>1.5.0-candidate-csd-1-SNAPSHOT</version>
+<version>1.5.0-csd-1-SNAPSHOT</version>
<relativePath>../pom.xml</relativePath>
</parent>

2 changes: 1 addition & 1 deletion external/flume-assembly/pom.xml
@@ -21,7 +21,7 @@
<parent>
<groupId>org.apache.spark</groupId>
<artifactId>spark-parent_2.10</artifactId>
-<version>1.5.0-candidate-csd-1-SNAPSHOT</version>
+<version>1.5.0-csd-1-SNAPSHOT</version>
<relativePath>../../pom.xml</relativePath>
</parent>

2 changes: 1 addition & 1 deletion external/flume-sink/pom.xml
@@ -21,7 +21,7 @@
<parent>
<groupId>org.apache.spark</groupId>
<artifactId>spark-parent_2.10</artifactId>
-<version>1.5.0-candidate-csd-1-SNAPSHOT</version>
+<version>1.5.0-csd-1-SNAPSHOT</version>
<relativePath>../../pom.xml</relativePath>
</parent>
